[bug-gettext] [RFC Patch2] Implement \u support in xgettext for C family (C11/C++11)

Miguel Ángel Sun, 17 Feb 2013 15:42:10 -0800

Hi,

This is a second attempt at the patch. It breaks the format-c-5 test, because
it converts every string to UTF-8. The error is:


"""
xgettext: Non-ASCII string at ../tests/format-c-5-prg.c:60.
          Please specify the source encoding through --from-code.
FAIL: format-c-5
"""

because the sixtieth line of gettext-tools/tests/format-c-5-prg.c is

"""
  expected_result = "Vater von \xdb\xb5 Kindern";
"""

which is not a valid ASCII string. With this patch, every string is
converted to UTF-8, so this string makes xgettext abort with the error above.

Adding --from-code=UTF-8 to the test is a workaround, although right now I do
not have any real idea of how to solve it properly. :(

Happy Hacking!
Miguel

diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index f188106..9b3a93a 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,30 @@
+2013-02-17  Miguel Angel Arruga Vivas  <[email protected]>
+
+	Add support for Unicode escape sequences in x-c.c, based on the
+	Unicode support in x-java.c.
+	* x-c.c (po-charset.h): Included for 'po_charset_utf8'.
+	(unistr.h): Included for 'ucs4_t'.
+	(comment_line_end): Store comments in UTF-8.
+	(check_unicode_codepoint): New function.
+	(P7_UNICODE4): New macro.
+	(P7_UNICODE8): Likewise.
+	(phase7_getc): Add new case for 'u'/'U'.
+	(extract_unicode_codepoint): New function.
+	(mixed_string_buffer): New struct. Code from x-python.c without
+	UTF-16 surrogates support.
+	(mixed_string_buffer_init): New function. Code from x-python.c
+	without UTF-16 surrogates support.
+	(mixed_string_buffer_free): Likewise.
+	(mixed_string_buffer_append_byte): Likewise.
+	(mixed_string_buffer_append_unicode_grow): Likewise.
+	(mixed_string_buffer_flush): Likewise.
+	(mixed_string_buffer_append_unicode): Likewise.
+	(mixed_string_buffer_append): Likewise.
+	(mixed_string_buffer_result): Likewise.
+	(phase5_get): Store UTF-8 string literals and identifiers.
+	(extract_parenthesized): Change 'xgettext_current_source_encoding'
+	to 'po_charset_utf8'.
+
 2013-02-06  Miguel Angel Arruga Vivas  <[email protected]>
 
 	GtkBuilder support in xgettext.
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index ea0a874..24798b4 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -36,6 +36,8 @@
 #include "xalloc.h"
 #include "xvasprintf.h"
 #include "hash.h"
+#include "po-charset.h"
+#include "unistr.h"
 #include "gettext.h"
 
 #define _(s) gettext(s)
@@ -711,6 +713,7 @@ comment_add (int c)
 static inline void
 comment_line_end (size_t chars_to_remove)
 {
+  char *utf8_buffer;
   buflen -= chars_to_remove;
   while (buflen >= 1
          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
@@ -721,7 +724,11 @@ comment_line_end (size_t chars_to_remove)
       buffer = xrealloc (buffer, bufmax);
     }
   buffer[buflen] = '\0';
-  savable_comment_add (buffer);
+  utf8_buffer = from_current_source_encoding (buffer, lc_comment,
+                                              logical_file_name, line_number);
+  savable_comment_add (utf8_buffer);
+  if (utf8_buffer != buffer)
+    free (utf8_buffer);
 }
 
 
@@ -857,6 +864,40 @@ struct token_ty
   int line_number;
 };
 
+/* Check that a Unicode escape has the required number of hex digits.
+   FOUR_DIGITS: True for a '\u' escape (4 digits), false for '\U' (8).  */
+static bool
+check_unicode_codepoint (bool four_digits)
+{
+  int stored[8];
+  int i, j, n;
+
+  for (i = 0; i < 8; ++i)
+    {
+      if (four_digits && (i < 4))
+        stored[i] = 0;
+      else
+        {
+          stored[i] = phase3_getc ();
+          switch (stored[i])
+            {
+            default:
+              /* Fallback.  */
+              n = four_digits ? 4 : 0;
+              for (j = i; j >= n; --j)
+                phase3_ungetc (stored[j]);
+              return false;
+
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+              break;
+            }
+        }
+    }
+  return true;
+}
 
 /* 7. Replace escape sequences within character strings with their
    single character equivalents.  This is called from phase 5, because
@@ -867,6 +908,8 @@ struct token_ty
 #define P7_QUOTES (1000 + '"')
 #define P7_QUOTE (1000 + '\'')
 #define P7_NEWLINE (1000 + '\n')
+#define P7_UNICODE4 (1000 + 'u')
+#define P7_UNICODE8 (1000 + 'U')
 
 static int
 phase7_getc ()
@@ -998,6 +1041,18 @@ phase7_getc ()
         }
       phase3_ungetc (c);
       return n;
+
+    /* Unicode support.  */
+    case 'u':case 'U':
+      if (!check_unicode_codepoint (c == 'u'))
+        {
+          phase3_ungetc (c);
+          return '\\';
+        }
+      else if (c == 'u')
+        return P7_UNICODE4;
+      else
+        return P7_UNICODE8;
     }
 }
 
@@ -1020,6 +1075,218 @@ free_token (token_ty *tp)
     drop_reference (tp->comment);
 }
 
+/* Unicode support.  */
+
+static ucs4_t
+extract_unicode_codepoint (bool four_digits)
+{
+  int stored[8];
+  int i;
+  ucs4_t uc = 0;
+
+  for (i = 0; i < 8; i++)
+    {
+      if (four_digits && (i < 4))
+        stored[i] = 0;
+      else
+        {
+          stored[i] = phase3_getc ();
+          switch (stored[i])
+            {
+            default:
+              /* This should be called pointing to a valid unicode
+                 escaped sequence.  */
+              abort ();
+              return 0;
+
+            case '0': case '1': case '2': case '3': case '4':
+            case '5': case '6': case '7': case '8': case '9':
+              uc = uc * 16 + stored[i] - '0';
+              break;
+
+            case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+              uc = uc * 16 + 10 + stored[i] - 'A';
+              break;
+
+            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+              uc = uc * 16 + 10 + stored[i] - 'a';
+              break;
+            }
+        }
+    }
+  return uc;
+}
+
+/* A string buffer type that allows appending bytes (in the
+   xgettext_current_source_encoding) or Unicode characters.
+   Returns the entire string in UTF-8 encoding.  */
+struct mixed_string_buffer
+{
+  /* The part of the string in local encoding.  */
+  char *buffer;
+  size_t bufmax;
+  size_t buflen;
+  /* The part of the string converted to UTF-8.  */
+  char *utf8_buffer;
+  size_t utf8_bufmax;
+  size_t utf8_buflen;
+  /* The lexical context.  Used only for error message purposes.  */
+  lexical_context_ty lcontext;
+};
+
+/* Initialize a 'struct mixed_string_buffer' to empty.  */
+static void
+mixed_string_buffer_init (struct mixed_string_buffer *bp,
+                          lexical_context_ty lcontext)
+{
+  bp->buffer = NULL;
+  bp->bufmax = 0;
+  bp->buflen = 0;
+  bp->utf8_buffer = NULL;
+  bp->utf8_bufmax = 0;
+  bp->utf8_buflen = 0;
+  bp->lcontext = lcontext;
+}
+
+/* Destroy the content of a 'struct mixed_string_buffer'.  */
+static void
+mixed_string_buffer_free (struct mixed_string_buffer *bp)
+{
+  if (bp->buffer != NULL)
+    free (bp->buffer);
+  if (bp->utf8_buffer != NULL)
+    free (bp->utf8_buffer);
+  bp->buffer = NULL;
+  bp->utf8_buffer = NULL;
+}
+
+/* Auxiliary function: Append a byte to bp->buffer.  */
+static void
+mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, int c)
+{
+  if (bp->buflen == bp->bufmax)
+    {
+      bp->bufmax = 2 * bp->bufmax + 10;
+      bp->buffer = xrealloc (bp->buffer, bp->bufmax);
+    }
+  bp->buffer[bp->buflen++] = c;
+}
+
+
+/* Auxiliary function: Ensure COUNT more bytes fit in bp->utf8_buffer.  */
+static inline void
+mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp,
+                                         size_t count)
+{
+  if (bp->utf8_buflen + count > bp->utf8_bufmax)
+    {
+      size_t new_allocated = 2 * bp->utf8_bufmax + 10;
+      if (new_allocated < bp->utf8_buflen + count)
+        new_allocated = bp->utf8_buflen + count;
+      bp->utf8_bufmax = new_allocated;
+      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
+    }
+}
+
+/* Auxiliary function: Flush bp->buffer into bp->utf8_buffer.  */
+static inline void
+mixed_string_buffer_flush (struct mixed_string_buffer *bp,
+                           int lineno)
+{
+  if (bp->buflen > 0)
+    {
+      char *curr;
+      size_t count;
+
+      mixed_string_buffer_append_byte (bp, '\0');
+
+      /* Convert from the source encoding to UTF-8.  */
+      curr = from_current_source_encoding (bp->buffer, bp->lcontext,
+                                           logical_file_name, lineno);
+
+      /* Append it to bp->utf8_buffer.  */
+      count = strlen (curr);
+      mixed_string_buffer_append_unicode_grow (bp, count);
+      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
+      bp->utf8_buflen += count;
+
+      if (curr != bp->buffer)
+        free (curr);
+      bp->buflen = 0;
+    }
+}
+
+/* Auxiliary function: Append a Unicode character to bp->utf8_buffer.
+   uc must be < 0x110000.  */
+static void
+mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, ucs4_t uc)
+{
+  unsigned char utf8buf[6];
+  int count = u8_uctomb (utf8buf, uc, 6);
+
+  if (count < 0)
+    /* The caller should have ensured that uc is not out-of-range.  */
+    abort ();
+
+  mixed_string_buffer_append_unicode_grow (bp, count);
+  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
+  bp->utf8_buflen += count;
+}
+
+/* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
+static void
+mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
+{
+  if (c == P7_UNICODE4 || c == P7_UNICODE8)
+    {
+      /* Append a Unicode character.  */
+      ucs4_t uc = extract_unicode_codepoint (c == P7_UNICODE4);
+      /* Switch from multibyte character mode to Unicode character mode.  */
+      mixed_string_buffer_flush (bp, line_number);
+
+      if (uc >= 0x110000
+          || (uc > 0 && uc < 0x20)
+          || (uc < 0xa0 && uc >= 0x7f)
+          || (uc < 0xe000 && uc >= 0xd800))
+        {
+          /* GCC will not compile this source file.
+             Any reason to actually extract this string?  */
+          error_with_progname = false;
+          error (0, 0, _("%s:%d: warning: invalid universal character"),
+                 logical_file_name, line_number - 1);
+          error_with_progname = true;
+          mixed_string_buffer_append_unicode (bp, 0xfffd);
+        }
+      else
+        mixed_string_buffer_append_unicode (bp, uc);
+    }
+  else
+    {
+      /* Append a single byte.  */
+
+      /* When a newline is seen, convert the accumulated multibyte sequence.
+         This ensures a correct line number in the error message in case of
+         a conversion error.  The "- 1" is to account for the newline.  */
+      if (c == '\n')
+        mixed_string_buffer_flush (bp, line_number - 1);
+
+      mixed_string_buffer_append_byte (bp, c);
+    }
+}
+
+/* Return the string buffer's contents.  */
+static char *
+mixed_string_buffer_result (struct mixed_string_buffer *bp)
+{
+  /* Flush all into bp->utf8_buffer.  */
+  mixed_string_buffer_flush (bp, line_number);
+  /* NUL-terminate it.  */
+  mixed_string_buffer_append_unicode_grow (bp, 1);
+  bp->utf8_buffer[bp->utf8_buflen] = '\0';
+  /* Return it.  */
+  return bp->utf8_buffer;
+}
+
 
 /* 5. Parse each resulting logical line as preprocessing tokens and
    white space.  Preprocessing tokens and C tokens don't always match.  */
@@ -1033,6 +1300,7 @@ phase5_get (token_ty *tp)
 {
   static char *buffer;
   static int bufmax;
+  struct mixed_string_buffer msbuffer;
   int bufpos;
   int c;
 
@@ -1086,15 +1354,11 @@ phase5_get (token_ty *tp)
     case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
     case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
     case 'v': case 'w': case 'x': case 'y': case 'z':
-      bufpos = 0;
+      /* FIXME: \uXXXX or \UXXXXXXXX are valid in the identifier.  */
+      mixed_string_buffer_init (&msbuffer, lc_outside);
       for (;;)
         {
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos++] = c;
+          mixed_string_buffer_append (&msbuffer, c);
           c = phase4_getc ();
           switch (c)
             {
@@ -1119,14 +1383,9 @@ phase5_get (token_ty *tp)
             }
           break;
         }
-      if (bufpos >= bufmax)
-        {
-          bufmax = 2 * bufmax + 10;
-          buffer = xrealloc (buffer, bufmax);
-        }
-      buffer[bufpos] = 0;
-      tp->string = xstrdup (buffer);
+      tp->string = xstrdup (mixed_string_buffer_result (&msbuffer));
       tp->type = token_type_name;
+      mixed_string_buffer_free (&msbuffer);
       return;
 
     case '.':
@@ -1237,7 +1496,7 @@ phase5_get (token_ty *tp)
          but since gettext's argument is not a wide character string,
          let the compiler complain about the argument not matching the
          prototype.  Just pretend it won't happen.  */
-      bufpos = 0;
+      mixed_string_buffer_init (&msbuffer, lc_string);
       for (;;)
         {
           c = phase7_getc ();
@@ -1254,22 +1513,12 @@ phase5_get (token_ty *tp)
             break;
           if (c == P7_QUOTE)
             c = '\'';
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos++] = c;
-        }
-      if (bufpos >= bufmax)
-        {
-          bufmax = 2 * bufmax + 10;
-          buffer = xrealloc (buffer, bufmax);
+          mixed_string_buffer_append (&msbuffer, c);
         }
-      buffer[bufpos] = 0;
       tp->type = token_type_string_literal;
-      tp->string = xstrdup (buffer);
+      tp->string = xstrdup (mixed_string_buffer_result (&msbuffer));
       tp->comment = add_reference (savable_comment);
+      mixed_string_buffer_free (&msbuffer);
       return;
 
     case '(':
@@ -1843,7 +2092,10 @@ extract_parenthesized (message_list_ty *mlp,
                                      arglist_parser_alloc (mlp,
                                                            state ? next_shapes : NULL)))
             {
+              xgettext_current_source_encoding = po_charset_utf8;
               arglist_parser_done (argparser, arg);
+              xgettext_current_source_encoding =
+                xgettext_global_source_encoding;
               return true;
             }
           next_context_iter = null_context_list_iterator;
@@ -1852,7 +2104,9 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_rparen:
+          xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
+          xgettext_current_source_encoding = xgettext_global_source_encoding;
           return false;
 
         case xgettext_token_type_comma:
@@ -1886,6 +2140,7 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_string_literal:
+          xgettext_current_source_encoding = po_charset_utf8;
           if (extract_all)
             remember_a_message (mlp, NULL, token.string, inner_context,
                                 &token.pos, NULL, token.comment);
@@ -1894,6 +2149,7 @@ extract_parenthesized (message_list_ty *mlp,
                                      inner_context,
                                      token.pos.file_name, token.pos.line_number,
                                      token.comment);
+          xgettext_current_source_encoding = xgettext_global_source_encoding;
           drop_reference (token.comment);
           next_context_iter = null_context_list_iterator;
           selectorcall_context_iter = null_context_list_iterator;
@@ -1907,7 +2163,9 @@ extract_parenthesized (message_list_ty *mlp,
           continue;
 
         case xgettext_token_type_eof:
+          xgettext_current_source_encoding = po_charset_utf8;
           arglist_parser_done (argparser, arg);
+          xgettext_current_source_encoding = xgettext_global_source_encoding;
           return true;
 
         default:

[bug-gettext] [RFC Patch2] Implement \u support in xgettext for C family (C11/C++11)

Reply via email to