; Module/File:     UTF8_BadCharWorkaround.pb
; Function:        Correction and detection of chars in Utf8-strings > 2 Byte, correct StringLen, ... - Linux
; Author:          Omi
; Date:            Jan. 05, 2016
; Version:         0.1
; Target Compiler: PureBasic 5.22/5.31/5.40
; Target OS:       Linux: (X/K/L)ubuntu, Mint, 32/64, Ascii/Uni
;--------------------------------------------------------------
; Strings are displayed correctly in the IDE and by PureBasic, but the commands in the String library don't handle them
; correctly when they contain characters > $ffff (> 2 bytes) in a Unicode executable, which can hold only 2-byte characters!
; Barely tested :-( functions for a workaround.

EnableExplicit

ImportC ""
	g_utf8_validate(str.p-utf8, max_len, *end)
	g_utf8_strlen(p.p-utf8, max)
	g_utf8_substring(str.p-utf8, start_pos, end_pos)
	g_utf8_offset_to_pointer(str.p-utf8, offset)
EndImport

; Throw-away receiver for the value sUtf8_InvalidCharPos() writes through its *pAsc
; parameter; the demo code at the end of the file passes @Dummy and ignores the result.
Global.i Dummy

; Test input: a 'bad' string example containing characters > $ffff ...
Global.s gS= "𐎀￼😋Ω𐌏☺𐌈"
; Alternative test input: a 'good' (valid) string whose characters all fit within 2 bytes.
; Uncomment it (and comment out the line above) to compare the results ...
; Global.s gS= "ABCdefΩ←→Φπ123"

;- gLib-check of Utf8-String on general validity (no check on chars > $ffff !) ...
Procedure.i sUtf8_Valid(S.s)
	; Hands the whole string (max_len = -1, i.e. up to the terminating NUL) to glib's
	; g_utf8_validate(). No 'end' pointer is requested (#Null).
	; S      : string to check (passed to glib as UTF-8 via the p-utf8 pseudotype)
	; Result : whatever g_utf8_validate() returns (#True / #False)
	Protected.i IsValid
	
	IsValid= g_utf8_validate(S, -1, #Null)
	ProcedureReturn IsValid
EndProcedure

;- gLib-Len = real len of String in chars (also including chars > $ffff as 1 char) ...
Procedure.i sUtf8_Len(S.s)
	; Asks glib's g_utf8_strlen() for the length of the complete string
	; (max = -1, i.e. up to the terminating NUL).
	; S      : string to measure (passed to glib as UTF-8 via the p-utf8 pseudotype)
	; Result : length in characters as reported by glib
	Protected.i CharCount
	
	CharCount= g_utf8_strlen(S, -1)
	ProcedureReturn CharCount
EndProcedure

;- returns Position (in chars) of the first char > $ffff, or #Null  ...
Procedure sUtf8_InvalidCharPos(S.s, *pAsc.Integer)
	; Walks the UTF-8 encoded copy of S character by character and stops at the first
	; character whose code is > $ffff.
	; S      : string to scan
	; *pAsc  : optional receiver (Integer structure). Gets the code of the offending
	;          character, or 0 if the scan reached the end of the string without a hit.
	;          May be #Null if the caller is not interested in the code.
	; Result : 1-based position (in characters) of the first char > $ffff, or #Null (0)
	;          if there is none - or if the work buffer could not be allocated.
	Protected.i cAsc= 0, cInvalPos= #Null, n= 0
	Protected   *pVar, *pFix= AllocateMemory(StringByteLength(S, #PB_UTF8) + 1); + 1 for the terminating NUL
	
	If *pFix;                                      AllocateMemory() returns 0 on failure - don't poke into address 0
		PokeS(*pFix, S, -1, #PB_UTF8)
		*pVar= *pFix
		cAsc = g_utf8_get_char_validated_(*pVar, -1)
		While cAsc;                                  0 = terminating NUL reached
			n+ 1
			If cAsc > $ffff : cInvalPos= n : Break : EndIf
			*pVar= g_utf8_find_next_char_(*pVar, 0);   0 = no end pointer, buffer is NUL-terminated
			cAsc = g_utf8_get_char_validated_(*pVar, -1)
		Wend
		FreeMemory(*pFix)
	EndIf
	
	If *pAsc;                                      receiver is optional
		*pAsc\i= cAsc
	EndIf
	ProcedureReturn cInvalPos;                     Position of invalid char or #Null
EndProcedure

;- returns Utf8-String with replaced chars > $ffff by 'lDummy' ...
Procedure.s sUtf8_ReplaceInvalidChars(S.s, lDummy.s)
	; Replaces every character > $ffff in S by lDummy, one character per loop pass:
	; the string is split around the position reported by sUtf8_InvalidCharPos() and
	; re-assembled as  head + lDummy + tail  until no such character is left.
	; S      : string to clean
	; lDummy : replacement text. If it contains a char > $ffff itself, the replacement
	;          could never finish, so S is returned unchanged in that case.
	; Result : the 'cleaned' string
	Protected.i cAsc, cInvalPos
	Protected   *pHead, *pTail
	Protected.s sHead, sTail
	
	; A replacement that is 'invalid' itself would be found again on every pass -> endless loop
	If sUtf8_InvalidCharPos(lDummy, @cAsc)
		ProcedureReturn S
	EndIf
	
	cInvalPos= sUtf8_InvalidCharPos(S, @cAsc)
	While cInvalPos
		; At this point cAsc holds the code of the character that is about to be replaced
		; (hook for sequentially saving the replaced characters, if ever needed).
		
		; g_utf8_substring() hands back newly allocated buffers: copy them into PB strings,
		; then release them with g_free(), otherwise two buffers leak per replaced character.
		*pHead= g_utf8_substring(S, 0, cInvalPos-1);              everything before the bad char
		*pTail= g_utf8_substring(S, cInvalPos, sUtf8_Len(S));     everything after the bad char
		sHead = PeekS(*pHead, cInvalPos-1, #PB_UTF8)
		sTail = PeekS(*pTail, -1, #PB_UTF8)
		g_free_(*pHead)
		g_free_(*pTail)
		
		S        = sHead + lDummy + sTail
		cInvalPos= sUtf8_InvalidCharPos(S, @cAsc)
	Wend
	ProcedureReturn S;                             the 'cleaned' string
EndProcedure

;- whether a string contains chars > $ffff ...
Procedure.i sUtf8_HasInvalidPbChars(S.s)
	; Compares the character count reported by glib with the one reported by
	; PureBasic's Len(). The two differ when the string contains characters that
	; PureBasic does not count as one character (chars > $ffff, see file header).
	; S      : string to check
	; Result : #True if the two counts differ, otherwise #False
	Protected.i nGlibLen= g_utf8_strlen(S, -1)
	Protected.i nPbLen  = Len(S)
	
	If nGlibLen <> nPbLen
		ProcedureReturn #True
	EndIf
	ProcedureReturn #False
EndProcedure


If OpenWindow(0, 300, 200, 400, 300, "gtk-dummy window", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)
	; Demo: print the findings for the global test string gS to the debug output.
	; NOTE(review): the window presumably only exists to get GTK running - confirm.
	
	; both length figures are needed more than once, so fetch them up front
	Define.i nPbLen  = Len(gS);        length as PureBasic counts it
	Define.i nRealLen= sUtf8_Len(gS);  length as glib counts it
	
	Debug "Original test string                 : " + gS
	Debug "Utf8 string generally valid?         : " + Str(sUtf8_Valid(gS))
	Debug "Utf8 string invalid for PB (Unicode) : " + Str(sUtf8_HasInvalidPbChars(gS))
	Debug "Correct Utf8-string length with API  : " + Str(nRealLen)
	Debug "String length from PureBasic         : " + Str(nPbLen)
	Debug "Number of 'bad' Utf8 chars (>$ffff)  : " + Str(nPbLen - nRealLen)
	Debug "First pos of 'bad' Utf8 char (>$ffff): " + Str(sUtf8_InvalidCharPos(gS, @Dummy))
	Debug "'Repaired' test string               : " + sUtf8_ReplaceInvalidChars(gS, "�"); =$fffd
	
	; keep the window alive until the user closes it
	Repeat
	Until WaitWindowEvent() = #PB_Event_CloseWindow
EndIf
; IDE Options = PureBasic 5.41 LTS Beta 5 (Linux - x86)
; Folding = -
; EnableUnicode
; EnableXP
; Watchlist = gS