Re: Displaying characters in user's locale

Gavin Smith Thu, 06 Feb 2014 11:28:45 -0800

On Thu, Feb 6, 2014 at 6:44 PM, Karl Berry <[email protected]> wrote:
> Hi Gavin,
>
>     that is, not output encoding -
>
> Good.
>
>     I would think that we should leave files as they are if we don't
>     know their encoding - that way we don't risk breaking something that
>     works already.
>
> Right.  So what do you mean by "the default file encoding is set to
> UTF-8"?  What default?  If the default input enc is to leave it as is
> (good), and the default output enc is set per the locale (good), what's
> left?
>
Updated patch leaves things as they are if file encoding is unknown.

diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c	2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c	2014-02-06 19:23:53.000000000 +0000
@@ -27,6 +27,11 @@
 #include "info-utils.h"
 #include "tag.h"
 
+#include <nl_types.h>
+#include <langinfo.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
 
 #if defined (HANDLE_MAN_PAGES)
 #  include "man.h"
@@ -42,6 +47,8 @@
     SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
 static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
 static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
 static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
 static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
 static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +333,310 @@
   return file_buffer;
 }
 
+/* Look for a local variables section near the end of FB and store a
+   newly allocated copy of the encoding named after "coding:" there in
+   FB->encoding.  FB->encoding is null if no encoding name is found. */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+  long int enc_start, enc_end;
+  char *enc_string;
+
+  fb->encoding = 0; /* Null means the encoding is unknown. */
+
+  /* See if there is a local variables section in this info file,
+     searching backward over at most the last 1000 bytes. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  binding.start = position;
+  binding.end = fb->filesize;
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen(CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace(fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The name ends at the next newline, or at the end of the file when
+     the last line is unterminated.  An empty name counts as unknown. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+  if (enc_end <= enc_start)
+    return;
+
+  enc_string = xmalloc (enc_end - enc_start + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
+  enc_string[enc_end - enc_start] = '\0';
+  fb->encoding = enc_string;
+}
+
+struct encoding_replacement
+{
+  char *from_string; /* UTF-8 bytes of the character */
+  char *to_string;   /* ASCII text written in its place */
+};
+
+/* Read one character at *FROM and write at *TO an ASCII replacement for
+   it (or its first byte unchanged if none is known).  *FROM, *TO and the
+   *FROM_LEFT, *TO_LEFT counts are moved past the bytes read/written.
+   Calling code assumes replacements are no more than 4 characters. */
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  static const struct encoding_replacement er[] = {
+  {"\xE2\x80\x98","'"}, /* Opening single quote */
+  {"\xE2\x80\x99","'"}, /* Closing single quote */
+  {"\xE2\x80\x9C","\""},/* Opening double quote */
+  {"\xE2\x80\x9D","\""},/* Closing double quote */
+  {"\xC2\xA9","(C)"},   /* Copyright symbol */
+  {"\xC2\xBB",">>"},    /* Closing double angle brackets */
+  {"\xE2\x86\x92","->"},/* Right arrow */
+  {"\xC3\xA0","a`"},   /* Lower case letter a with grave accent */
+  {"\xC3\xA9","e'"},   /* Lower case letter e with acute accent */
+  {"\xC3\xA8","e`"},   /* Lower case letter e with grave accent */
+  {"\xC3\xAA","e^"},   /* Lower case letter e with circumflex */
+  {"\xC3\xAB","e\""},  /* Lower case letter e with diaeresis */
+  {0, 0}};
+  const struct encoding_replacement *erp;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      size_t from_len = strlen (erp->from_string);
+      if (!strncmp (*from, erp->from_string, from_len))
+        {
+          size_t to_len = strlen (erp->to_string);
+          memcpy (*to, erp->to_string, to_len);
+          *from      += from_len;
+          *from_left -= from_len; /* Input consumed, not output written */
+          *to      += to_len;
+          *to_left -= to_len;
+          return;
+        }
+    }
+
+  /* Failing this, just copy a byte across. FIXME: Use SUB instead (^Z)? */
+  **to = **from;
+  (*to)++; (*from)++;
+  (*to_left)--; (*from_left)--;
+}
+
+/* Convert the contents of FB from the encoding named by FB->encoding to
+   the character encoding of the current locale.  On success FB->contents
+   is replaced by a newly allocated, null-terminated buffer and
+   FB->filesize is updated.  FB is left as it was if its encoding is
+   unknown or already matches the locale, if iconv offers no suitable
+   conversion, or if iconv fails with an unexpected error. */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+#if !HAVE_ICONV
+  return;
+#else
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *target_encoding;
+
+  /* Output buffer.  One byte is always held in reserve for the
+     terminating null and is not counted in OUT_BYTES_LEFT. */
+  char *new_contents, *outptr;
+  size_t new_contents_allocated;
+  size_t out_bytes_left;
+
+  /* Conversions from the file encoding to the output encoding and to
+     UTF-8.  The latter is only opened if the file is not in UTF-8. */
+  iconv_t iconv_state;
+  iconv_t iconv_to_utf8 = (iconv_t) -1;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+  char *utf8_char_ptr = utf8_char;
+  int file_is_in_utf8 = 0; /* Whether file buffer is encoded in UTF-8 */
+  size_t iconv_ret; /* Used to check return value of iconv() */
+
+  /* Don't process file if encoding is unknown. */
+  if (!fb->encoding) return;
+
+  /* Don't convert the contents if the locale (whose encoding name is
+     read from the environment) uses the same encoding as the file */
+  target_encoding = nl_langinfo(CODESET);
+  if (!strcasecmp(target_encoding, fb->encoding))
+    return;
+
+  /* Check if an iconv conversion from file locale to system
+     locale exists */
+  iconv_state = iconv_open (target_encoding, fb->encoding);
+  if (iconv_state == (iconv_t) -1)
+    return; /* Return if no conversion function implemented */
+
+  if (   !strcasecmp ("UTF8",  fb->encoding)
+      || !strcasecmp ("UTF-8", fb->encoding))
+    file_is_in_utf8 = 1;
+
+  if (!file_is_in_utf8)
+    {
+      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
+      if (iconv_to_utf8 == (iconv_t) -1)
+        {
+          /* No conversion implemented.  Don't leak the other one. */
+          iconv_close (iconv_state);
+          return;
+        }
+    }
+
+  /* Allocate space for the converted file buffer (including
+     terminating null). */
+  new_contents_allocated = fb->filesize + 1;
+  new_contents = xcalloc (1, new_contents_allocated);
+  outptr = new_contents;
+  out_bytes_left = fb->filesize;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+
+  /* Convert sections of the file separated by node separators. These
+     will be preambles, nodes, tag tables, or local variable sections.
+     We convert all of them, although probably only the nodes need to
+     be converted.
+     The second part of the condition makes us operate on the last
+     section, which does not end with a node separator. */
+  while ((nextnode = find_node_separator (&binding)) != -1
+    || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      char *inptr;
+      size_t in_bytes_left;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      /* Convert characters from node to nextnode */
+      inptr = binding.buffer + node;
+      in_bytes_left = nextnode - node;
+
+      while (inptr < binding.buffer + nextnode)
+        {
+          size_t out_offset; /* Only used when reallocating */
+
+          /* Attempt to convert node contents using iconv */
+          while (1)
+            {
+              iconv_ret = iconv (iconv_state, &inptr, &in_bytes_left,
+                       &outptr, &out_bytes_left);
+              if (iconv_ret != (size_t) -1)
+                goto continue_node_loop; /* Success */
+
+              /* There's been an error while converting. */
+              switch (errno)
+                {
+                case E2BIG:
+                  /* Ran out of space in output buffer. Reallocate,
+                     keeping the reserved byte free, and try again. */
+                  out_offset = outptr - new_contents;
+                  new_contents_allocated *= 2;
+                  new_contents = xrealloc(new_contents,
+                                    new_contents_allocated);
+                  outptr = new_contents + out_offset;
+                  out_bytes_left = new_contents_allocated - out_offset - 1;
+                  continue;
+                case EILSEQ:
+                case EINVAL:
+                  /* Byte sequence in input buffer not recognized, or
+                     incomplete at end of input buffer. Degrade to
+                     ASCII instead. (FIXME: Check that output encoding
+                     is backwards compatible with ASCII). */
+                  goto degrade_to_ascii;
+                default: /* Unknown error - abandon the conversion */
+                  goto abandon_conversion;
+                }
+            }
+
+        degrade_to_ascii:
+          /* Make sure that there is enough space to write the replacement
+             string. 4 bytes should be enough for one character. */
+          while (out_bytes_left <= 4)
+            {
+              out_offset = outptr - new_contents;
+              new_contents_allocated *= 2;
+              new_contents = xrealloc(new_contents,
+                                new_contents_allocated);
+              outptr = new_contents + out_offset;
+              out_bytes_left = new_contents_allocated - out_offset - 1;
+            }
+
+          if (file_is_in_utf8)
+            degrade_utf8(&inptr, &in_bytes_left, &outptr, &out_bytes_left);
+          else
+            {
+              /* When a character in file cannot be represented in the output
+                 encoding, convert the character to UTF-8, then call
+                 degrade_utf8() to get an ASCII replacement. */
+              size_t utf8_char_free, i;
+
+              /* First convert character at read pointer to UTF-8.
+                 We want to read exactly one character. Do this by
+                 restricting size of output buffer. */
+              utf8_char_ptr = utf8_char;
+              for (i = 1; i <= 4; i++)
+                {
+                  utf8_char_free = i;
+                  iconv_ret = iconv(iconv_to_utf8, &inptr, &in_bytes_left,
+                                    &utf8_char_ptr, &utf8_char_free);
+                  /* If we managed to write a character */
+                  if (utf8_char_ptr > utf8_char) break;
+                }
+
+              /* errno == E2BIG if iconv ran out of output buffer,
+                 which is expected. */
+              if (iconv_ret == (size_t) -1 && errno != E2BIG)
+                {
+                  /* Character is not recognized. Copy a single byte. */
+                  *outptr = *inptr;
+                  outptr++; inptr++;
+                  out_bytes_left--; in_bytes_left--;
+                }
+              else
+                {
+                  utf8_char_ptr = utf8_char;
+                  /* The value of i before or after this call doesn't
+                     matter. */
+                  degrade_utf8(&utf8_char_ptr, &i, &outptr, &out_bytes_left);
+                }
+            }
+
+          /* Recount the input left in this section from INPTR, so that
+             the next iconv call is never handed a stale byte count. */
+          in_bytes_left = binding.buffer + nextnode - inptr;
+        }
+    continue_node_loop:
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  /* Terminate the converted text and install it in place of the old. */
+  *outptr = '\0';
+  iconv_close (iconv_state);
+  if (!file_is_in_utf8) iconv_close (iconv_to_utf8);
+  free(fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outptr - new_contents;
+  return;
+
+abandon_conversion:
+  /* Leave FB as it was, releasing everything acquired here. */
+  iconv_close (iconv_state);
+  if (!file_is_in_utf8) iconv_close (iconv_to_utf8);
+  free (new_contents);
+#endif /* HAVE_ICONV */
+}
+
 /* The workhorse function for info_load_file ().  Non-zero second argument
    says to build a list of tags (or nodes) for this file.  This is the
    default behaviour when info_load_file () is called, but it is not
@@ -397,7 +708,14 @@
   file_buffer->contents = contents;
   if (compressed)
     file_buffer->flags |= N_IsCompressed;
+
+  /* Find encoding of file, if set */
+  get_file_character_encoding (file_buffer);
   
+  /* Convert characters in file buffer to current locale as much
+   * as possible. */
+  convert_characters (file_buffer);
+
   /* If requested, build the tags and nodes for this file buffer. */
   if (get_tags)
     build_tags_and_nodes (file_buffer);
diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h	2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h	2014-02-06 19:09:11.000000000 +0000
@@ -72,6 +72,8 @@
 #define TAGS_TABLE_BEG_LABEL            "Tag Table:\n"
 #define INDIRECT_TAGS_TABLE_LABEL       "Indirect:\n"
 #define TAGS_TABLE_IS_INDIRECT_LABEL    "(Indirect)"
+#define LOCAL_VARIABLES_LABEL		"Local Variables"
+#define CHARACTER_ENCODING_LABEL        "coding:"
 
 /* Character constants. */
 #define INFO_COOKIE '\037'
@@ -112,7 +114,9 @@
   TAG **tags;                   /* If non-null, the indirect tags table. */
   size_t tags_slots;            /* Number of slots allocated for TAGS. */
   int flags;                    /* Various flags.  Mimics of N_* flags. */
+  char *encoding;               /* Name of character encoding of file. */
 } FILE_BUFFER;
+
 
 /* Externally visible functions.  */

Re: Displaying characters in user's locale

Reply via email to