Fix most encoding problems with Guile 2.x (issue 555420043 by jonas.hahnfeld@gmail.com)

jonas . hahnfeld Sat, 07 Mar 2020 13:48:11 -0800

Reviewers: ,

Message:
FYI: With this change, running `check` with Guile 2.2 against a `baseline`
from Guile 1.8 comes out almost clean. The remaining differences are:
* output races, i.e. `input/regression/option-help.log`,
`input/regression/woodwind-diagrams-key-lists.log`,
`input/regression/safe.log`
* rounding differences: `input/regression/markup-brace-warning.log`
(missed warning), `input/regression/song-reordering.log`,
`input/regression/song-reordering2.log`,
`input/regression/song-breathe.log`,
`input/regression/beam-quant-standard.log`
* a few extra warnings for `input/regression/song-tempo.log`,
`input/regression/song-slurs.log`,
`input/regression/song-repetition.log`
* printing of internal structures in `input/regression/scheme-engraver.log`
* and an encoding problem in
`input/regression/song-basic-nonenglish.log`


I guess all except the last are acceptable. If somebody has an idea for
fixing that last one, please let me know:
@@ -5,7 +5,7 @@
 <!DOCTYPE SINGING PUBLIC "-//SINGING//DTD SINGING mark up//EN"
"Singing.v0_1.dtd" []>
 <SINGING BPM="100.0">
 <DURATION BEATS="1.0"><PITCH NOTE="C5">ov</PITCH></DURATION>
-<DURATION BEATS="1.0"><PITCH NOTE="E5">čá</PITCH></DURATION>
+<DURATION BEATS="1.0"><PITCH NOTE="E5">??</PITCH></DURATION>
 <DURATION BEATS="1.0"><PITCH NOTE="G5">ci</PITCH></DURATION>
 <REST BEATS="1.0"></REST>
 </SINGING>

Description:
Fix most encoding problems with Guile 2.x

Individual commits:
1) Treat possibly incomplete UTF-8 as binary

replace_special_characters checks that the substring doesn't start
mid-UTF-8, but it does not guarantee that it ends in a complete glyph.
So just explicitly treat it as binary when creating the SCM.
While modifying the function, avoid comparison of zero-length
substrings.

2) Use UTF-8 for all conversions to / from Scheme

LilyPond really expects all input to be encoded in UTF-8, and we should
not let Guile 2.x mangle it.

Please review this at https://codereview.appspot.com/555420043/

Affected files (+16, -8 lines):
  M lily/general-scheme.cc
  M lily/include/lily-guile-macros.hh
  M lily/lily-guile.cc
  M lily/text-interface.cc


Index: lily/general-scheme.cc
diff --git a/lily/general-scheme.cc b/lily/general-scheme.cc
index 
98a49f5adace82c3cf4e6623506d11d8c3886261..4ab742298c6908b7e86f621987307af26c310425
 100644
--- a/lily/general-scheme.cc
+++ b/lily/general-scheme.cc
@@ -597,7 +597,9 @@ LY_DEFINE (ly_format, "ly:format",
     }
   *ptr = '\0';
 
-  return scm_take_locale_stringn (result, len);
+  SCM ret = scm_from_utf8_stringn (result, len);
+  free(result);
+  return ret;
 }
 
 int
Index: lily/include/lily-guile-macros.hh
diff --git a/lily/include/lily-guile-macros.hh 
b/lily/include/lily-guile-macros.hh
index 
0504eb7f152b9614d43886e6e44c1c547b529a18..d2a752724749254df20a7b86ac85f88e0d7f3c5c
 100644
--- a/lily/include/lily-guile-macros.hh
+++ b/lily/include/lily-guile-macros.hh
@@ -37,8 +37,10 @@
 #define scm_from_latin1_string scm_from_locale_string
 #define scm_from_latin1_stringn scm_from_locale_stringn
 #define scm_from_utf8_string scm_from_locale_string
+#define scm_from_utf8_stringn scm_from_locale_stringn
 #define scm_from_utf8_symbol scm_from_locale_symbol
 #define scm_to_utf8_string scm_to_locale_string
+#define scm_to_utf8_stringn scm_to_locale_stringn
 #endif
 
 #ifndef SMOB_FREE_RETURN_VAL
Index: lily/lily-guile.cc
diff --git a/lily/lily-guile.cc b/lily/lily-guile.cc
index 
ac999dc500e51c25936015f94186313738c961a0..c73c825bc260eb5af8350b0f6c87f5e0c3f9f351
 100644
--- a/lily/lily-guile.cc
+++ b/lily/lily-guile.cc
@@ -123,7 +123,7 @@ ly_scm2string (SCM str)
   assert (scm_is_string (str));
   string result;
   size_t len;
-  char *c_string = scm_to_locale_stringn (str, &len);
+  char *c_string = scm_to_utf8_stringn (str, &len);
   if (len)
     {
       result.assign (c_string, len);
@@ -135,8 +135,7 @@ ly_scm2string (SCM str)
 SCM
 ly_string2scm (string const &str)
 {
-  return scm_from_locale_stringn (str.c_str (),
-                                  str.length ());
+  return scm_from_utf8_stringn (str.c_str (), str.length ());
 }
 
 char *
Index: lily/text-interface.cc
diff --git a/lily/text-interface.cc b/lily/text-interface.cc
index 
1f62e9b287290439d45375e717b03196656ac763..91385b0184b24d8097ab5a7fd6c6c8c026941676
 100644
--- a/lily/text-interface.cc
+++ b/lily/text-interface.cc
@@ -56,13 +56,18 @@ replace_special_characters (string &str, SCM props)
       /* Don't match in mid-UTF-8 */
       if ((str[i] & 0xc0) == 0x80)
         continue;
-      for (vsize j = max_length + 1; j--;)
+      for (vsize j = max_length; j > 0; j--)
         {
           if (j > str.size () - i)
             continue;
-          string dummy = str.substr (i, j);
-          SCM ligature = ly_assoc_get (ly_string2scm (dummy),
-                                       replacement_alist, SCM_BOOL_F);
+          // TODO: It could make sense to skip if not at the end of a UTF-8
+          // glyph. However that requires finding the start of the last glyph
+          // (not necessarily at str[i] - the longest replacement could match
+          // multiple glyphs) to get the glyph's length which is not trivial.
+          // So for now just continue checking all substrings that could be
+          // valid UTF-8 (see check for str[i] not in mid-UTF-8 above).
+          SCM substr = scm_from_latin1_stringn (str.c_str() + i, j);
+          SCM ligature = ly_assoc_get (substr, replacement_alist, SCM_BOOL_F);
           if (scm_is_true (ligature))
             str.replace (i, j, robust_scm2string (ligature, ""));
         }

Fix most encoding problems with Guile 2.x (issue 555420043 by jonas.hahnfeld@gmail.com)

Reply via email to