; Module/File:     UTF8_BadCharWorkaround.pb
; Function:        Correction and detection of chars in Utf8-strings > 2 Byte, correct StringLen, ... - Linux
; Author:          Omi
; Date:            Jan. 05, 2016
; Version:         0.1
; Target Compiler: PureBasic 5.22/5.31/5.40
; Target OS:       Linux: (X/K/L)ubuntu, Mint, 32/64, Ascii/Uni
;--------------------------------------------------------------
; Strings are displayed correctly in the IDE and by PureBasic, but the commands of the
; string library do not handle them correctly when they contain characters > $ffff
; (more than 2 bytes) in a unicode executable, which can only hold 2-byte characters.
; The functions below are a barely tested workaround.

EnableExplicit

; GLib UTF-8 helpers, re-declared here so the string arguments are passed as UTF-8 (p-utf8).
ImportC ""
  g_utf8_validate(str.p-utf8, max_len, *end)
  g_utf8_strlen(p.p-utf8, max)
  g_utf8_substring(str.p-utf8, start_pos, end_pos)
  g_utf8_offset_to_pointer(str.p-utf8, offset)
EndImport

Global.i Dummy

; A 'bad' (invalid for PB) example string with chars > $ffff ...
Global.s gS= "𐎀😋Ω𐌏☺𐌈"
; A 'good' (valid) string with chars that all fit into 2 bytes ...
; Global.s gS= "ABCdefΩ←→Φπ123"

;- GLib check of the Utf8 string for general validity (does not check for chars > $ffff!) ...
Procedure.i sUtf8_Valid(S.s)
  ; Whole string is checked (max_len = -1); the position of the first bad byte is not requested.
  Protected.i isValid = g_utf8_validate(S, -1, #Null)
  ProcedureReturn isValid ; #True / #False
EndProcedure

;- GLib length = real length of the string in chars (a char > $ffff counts as 1 char) ...
Procedure.i sUtf8_Len(S.s)
  ; max = -1: count up to the terminating null.
  Protected.i charCount = g_utf8_strlen(S, -1)
  ProcedureReturn charCount ; in chars
EndProcedure

;- returns Position (in chars) of the first char > $ffff, or #Null ...
Procedure sUtf8_InvalidCharPos(S.s, *pAsc.Integer)
  ; Scans S (converted to a private UTF-8 buffer) character by character.
  ;   S      : string to examine
  ;   *pAsc  : optional; receives the code point the scan stopped on
  ;            (the first char > $ffff, or 0 if the end of the string was reached)
  ; Returns the 1-based position (in chars) of the first char > $ffff, or #Null if none.
  ; NOTE(review): for malformed UTF-8, g_utf8_get_char_validated reports an error value
  ; instead of a code point; how that compares against $ffff here is not verified - confirm.
  Protected.i cAsc, cInvalPos = #Null, n = #Null
  Protected *pVar, *pFix = AllocateMemory(StringByteLength(S, #PB_UTF8) + 1)

  If *pFix ; allocation can fail - then report "nothing found"
    PokeS(*pFix, S, -1, #PB_UTF8)
    *pVar = *pFix
    cAsc  = g_utf8_get_char_validated_(*pVar, -1)
    While cAsc
      n + 1
      If cAsc > $ffff
        cInvalPos = n
        Break
      EndIf
      *pVar = g_utf8_find_next_char_(*pVar, 0)
      cAsc  = g_utf8_get_char_validated_(*pVar, -1)
    Wend
    FreeMemory(*pFix)
  EndIf

  If *pAsc ; tolerate callers that are not interested in the code point
    *pAsc\i = cAsc
  EndIf
  ProcedureReturn cInvalPos ; Position of invalid char or #Null
EndProcedure

;- returns Utf8-String with replaced chars > $ffff by 'lDummy' ...
Procedure.s sUtf8_ReplaceInvalidChars(S.s, lDummy.s)
  ; Replaces every char > $ffff in S by lDummy and returns the result.
  ; The result is assembled left to right: the part that was already processed
  ; (including inserted lDummy text) is never scanned again, so the loop always
  ; terminates - even if lDummy itself contains a char > $ffff.
  Protected.s sOut, sRest = S
  Protected.i cAsc
  Protected.i cInvalPos = sUtf8_InvalidCharPos(sRest, @cAsc)
  Protected *pHead, *pTail

  While cInvalPos
    ; g_utf8_substring returns newly allocated buffers which must be released with g_free.
    *pHead = g_utf8_substring(sRest, 0, cInvalPos - 1)
    *pTail = g_utf8_substring(sRest, cInvalPos, sUtf8_Len(sRest))
    If *pHead = #Null Or *pTail = #Null
      ; Should not happen; keep the untouched remainder rather than crash in PeekS.
      If *pHead : g_free_(*pHead) : EndIf
      If *pTail : g_free_(*pTail) : EndIf
      Break
    EndIf

    sOut + PeekS(*pHead, -1, #PB_UTF8) + lDummy ; chars before the bad one + replacement
    sRest = PeekS(*pTail, -1, #PB_UTF8)         ; everything behind the bad char
    g_free_(*pHead)
    g_free_(*pTail)

    cInvalPos = sUtf8_InvalidCharPos(sRest, @cAsc)
  Wend

  ProcedureReturn sOut + sRest ; the 'cleaned' string
EndProcedure

;- whether a string contains chars > $ffff ...
Procedure.i sUtf8_HasInvalidPbChars(S.s)
  ; The GLib char count and PureBasic's Len() differ as soon as a char > $ffff is present.
  ProcedureReturn Bool(g_utf8_strlen(S, -1) <> Len(S)) ; #True / #False
EndProcedure

; Demo: print the diagnostics for the global test string, then wait until the window is closed.
If OpenWindow(0, 300, 200, 400, 300, "gtk-dummy window", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)

  Debug "Original test string : " + gS
  Debug "Utf8 string generally valid? : " + Str(sUtf8_Valid(gS))
  Debug "Utf8 string invalid for PB (Unicode) : " + Str(sUtf8_HasInvalidPbChars(gS))
  Debug "Correct Utf8-string length with API : " + Str(sUtf8_Len(gS))
  Debug "String length from PureBasic : " + Str(Len(gS))
  Debug "Number of 'bad' Utf8 chars (>$ffff) : " + Str(Len(gS) - sUtf8_Len(gS))
  Debug "First pos of 'bad' Utf8 char (>$ffff): " + Str(sUtf8_InvalidCharPos(gS, @Dummy))
  Debug "'Repaired' test string : " + sUtf8_ReplaceInvalidChars(gS, "�"); =$fffd

  Repeat
    If WaitWindowEvent() = #PB_Event_CloseWindow
      Break
    EndIf
  ForEver
EndIf

; IDE Options = PureBasic 5.41 LTS Beta 5 (Linux - x86)
; Folding = -
; EnableUnicode
; EnableXP
; Watchlist = gS