Reviewers: , Message: FYI: This makes a `check` with Guile 2.2 for `baseline` from Guile 1.8 almost clean. What remains: * output races, i.e. `input/regression/option-help.log`, `input/regression/woodwind-diagrams-key-lists.log`, `input/regression/safe.log` * rounding differences: `input/regression/markup-brace-warning.log` (missed warning), `input/regression/song-reordering.log`, `input/regression/song-reordering2.log`, `input/regression/song-breathe.log`, `input/regression/beam-quant-standard.log` * a few extra warnings for `input/regression/song-tempo.log`, `input/regression/song-slurs.log`, `input/regression/song-repetition.log` * printing of internal structures in `input/regression/scheme-engraver.log` * and an encoding problem in `input/regression/song-basic-nonenglish.log`
I guess all except the last are acceptable. If somebody has an idea for the last one, please let me know: @@ -5,7 +5,7 @@ <!DOCTYPE SINGING PUBLIC "-//SINGING//DTD SINGING mark up//EN" "Singing.v0_1.dtd" []> <SINGING BPM="100.0"> <DURATION BEATS="1.0"><PITCH NOTE="C5">ov</PITCH></DURATION> -<DURATION BEATS="1.0"><PITCH NOTE="E5">čá</PITCH></DURATION> +<DURATION BEATS="1.0"><PITCH NOTE="E5">??</PITCH></DURATION> <DURATION BEATS="1.0"><PITCH NOTE="G5">ci</PITCH></DURATION> <REST BEATS="1.0"></REST> </SINGING> Description: Fix most encoding problems with Guile 2.x Individual commits: 1) Treat possibly incomplete UTF-8 as binary replace_special_characters checks that the substring doesn't start mid-UTF-8, but it does not guarantee that it ends in a complete glyph. So just explicitly treat it as binary when creating the SCM. While modifying the function, avoid comparison of zero-length substrings. 2) Use UTF-8 for all conversions to / from Scheme LilyPond really expects all input to be encoded in UTF-8, and we should not let Guile 2.x mangle it. 
Please review this at https://codereview.appspot.com/555420043/ Affected files (+16, -8 lines): M lily/general-scheme.cc M lily/include/lily-guile-macros.hh M lily/lily-guile.cc M lily/text-interface.cc Index: lily/general-scheme.cc diff --git a/lily/general-scheme.cc b/lily/general-scheme.cc index 98a49f5adace82c3cf4e6623506d11d8c3886261..4ab742298c6908b7e86f621987307af26c310425 100644 --- a/lily/general-scheme.cc +++ b/lily/general-scheme.cc @@ -597,7 +597,9 @@ LY_DEFINE (ly_format, "ly:format", } *ptr = '\0'; - return scm_take_locale_stringn (result, len); + SCM ret = scm_from_utf8_stringn (result, len); + free(result); + return ret; } int Index: lily/include/lily-guile-macros.hh diff --git a/lily/include/lily-guile-macros.hh b/lily/include/lily-guile-macros.hh index 0504eb7f152b9614d43886e6e44c1c547b529a18..d2a752724749254df20a7b86ac85f88e0d7f3c5c 100644 --- a/lily/include/lily-guile-macros.hh +++ b/lily/include/lily-guile-macros.hh @@ -37,8 +37,10 @@ #define scm_from_latin1_string scm_from_locale_string #define scm_from_latin1_stringn scm_from_locale_stringn #define scm_from_utf8_string scm_from_locale_string +#define scm_from_utf8_stringn scm_from_locale_stringn #define scm_from_utf8_symbol scm_from_locale_symbol #define scm_to_utf8_string scm_to_locale_string +#define scm_to_utf8_stringn scm_to_locale_stringn #endif #ifndef SMOB_FREE_RETURN_VAL Index: lily/lily-guile.cc diff --git a/lily/lily-guile.cc b/lily/lily-guile.cc index ac999dc500e51c25936015f94186313738c961a0..c73c825bc260eb5af8350b0f6c87f5e0c3f9f351 100644 --- a/lily/lily-guile.cc +++ b/lily/lily-guile.cc @@ -123,7 +123,7 @@ ly_scm2string (SCM str) assert (scm_is_string (str)); string result; size_t len; - char *c_string = scm_to_locale_stringn (str, &len); + char *c_string = scm_to_utf8_stringn (str, &len); if (len) { result.assign (c_string, len); @@ -135,8 +135,7 @@ ly_scm2string (SCM str) SCM ly_string2scm (string const &str) { - return scm_from_locale_stringn (str.c_str (), - str.length ()); + 
return scm_from_utf8_stringn (str.c_str (), str.length ()); } char * Index: lily/text-interface.cc diff --git a/lily/text-interface.cc b/lily/text-interface.cc index 1f62e9b287290439d45375e717b03196656ac763..91385b0184b24d8097ab5a7fd6c6c8c026941676 100644 --- a/lily/text-interface.cc +++ b/lily/text-interface.cc @@ -56,13 +56,18 @@ replace_special_characters (string &str, SCM props) /* Don't match in mid-UTF-8 */ if ((str[i] & 0xc0) == 0x80) continue; - for (vsize j = max_length + 1; j--;) + for (vsize j = max_length; j > 0; j--) { if (j > str.size () - i) continue; - string dummy = str.substr (i, j); - SCM ligature = ly_assoc_get (ly_string2scm (dummy), - replacement_alist, SCM_BOOL_F); + // TODO: It could make sense to skip if not at the end of a UTF-8 + // glyph. However that requires finding the start of the last glyph + // (not necessarily at str[i] - the longest replacement could match + // multiple glyphs) to get the glyph's length which is not trivial. + // So for now just continue checking all substrings that could be + // valid UTF-8 (see check for str[i] not in mid-UTF-8 above). + SCM substr = scm_from_latin1_stringn (str.c_str() + i, j); + SCM ligature = ly_assoc_get (substr, replacement_alist, SCM_BOOL_F); if (scm_is_true (ligature)) str.replace (i, j, robust_scm2string (ligature, "")); }