On Thu, Feb 6, 2014 at 6:44 PM, Karl Berry <[email protected]> wrote:
> Hi Gavin,
>
> that is, not output encoding -
>
> Good.
>
> I would think that we should leave files as they are if we don't
> know their encoding - that way we don't risk breaking something that
> works already.
>
> Right. So what do you mean by "the default file encoding is set to
> UTF-8"? What default? If the default input enc is to leave it as is
> (good), and the default output enc is set per the locale (good), what's
> left?
>
Updated patch leaves things as they are if file encoding is unknown.
diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.c info-locale-5405/trunk/info/nodes.c
--- texinfo/trunk/info/nodes.c 2014-01-07 20:11:42.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.c 2014-02-06 19:23:53.000000000 +0000
@@ -27,6 +27,11 @@
#include "info-utils.h"
#include "tag.h"
+#include <nl_types.h>
+#include <langinfo.h>
+#if HAVE_ICONV
+# include <iconv.h>
+#endif
#if defined (HANDLE_MAN_PAGES)
# include "man.h"
@@ -42,6 +47,8 @@
SEARCH_BINDING *indirect_binding, SEARCH_BINDING *tags_binding);
static void info_reload_file_buffer_contents (FILE_BUFFER *fb);
static char *adjust_nodestart (NODE *node, int min, int max);
+static void get_file_character_encoding (FILE_BUFFER *fb);
+static void convert_characters (FILE_BUFFER *fb);
static FILE_BUFFER *info_load_file_internal (char *filename, int get_tags);
static FILE_BUFFER *info_find_file_internal (char *filename, int get_tags);
static NODE *info_node_of_file_buffer_tags (FILE_BUFFER *file_buffer,
@@ -326,6 +333,310 @@
return file_buffer;
}
+/* Look for a local variables section within the last 1000 bytes of FB
+   and, if it contains a "coding:" entry, store a freshly allocated copy
+   of the encoding name in FB->encoding.  FB->encoding is set to null
+   when no (non-empty) encoding name is found.  */
+static void
+get_file_character_encoding (FILE_BUFFER *fb)
+{
+  SEARCH_BINDING binding;
+  long position;
+  long enc_start, enc_end;
+  size_t enc_length;
+  char *enc_string;
+
+  /* Null means the encoding is unknown. */
+  fb->encoding = 0;
+
+  /* See if there is a local variables section in this info file.
+     The search runs backward from the end of the file. */
+  binding.buffer = fb->contents;
+  binding.start = fb->filesize;
+  binding.end = binding.start - 1000;
+  if (binding.end < 0)
+    binding.end = 0;
+  binding.flags = S_FoldCase;
+
+  if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
+      != search_success)
+    return;
+
+  /* Now look forward from the section label for the encoding entry. */
+  binding.start = position;
+  binding.end = fb->filesize;
+
+  if (search_forward (CHARACTER_ENCODING_LABEL, &binding, &enc_start)
+      != search_success)
+    return;
+
+  enc_start += strlen (CHARACTER_ENCODING_LABEL); /* Skip to after "coding:" */
+  enc_start += skip_whitespace (fb->contents + enc_start);
+  binding.start = enc_start;
+
+  /* The name runs to the end of the line.  If the line is not
+     terminated (last line of the file), it runs to the end of the
+     buffer instead of leaving ENC_END unset. */
+  if (search_forward ("\n", &binding, &enc_end) != search_success)
+    enc_end = fb->filesize;
+
+  /* Drop trailing blanks and a carriage return, which are never part
+     of an encoding name and would make the later lookup fail. */
+  while (enc_end > enc_start
+         && (fb->contents[enc_end - 1] == ' '
+             || fb->contents[enc_end - 1] == '\t'
+             || fb->contents[enc_end - 1] == '\r'))
+    enc_end--;
+
+  /* An empty name tells us nothing: leave the encoding unknown. */
+  if (enc_end <= enc_start)
+    return;
+
+  enc_length = enc_end - enc_start;
+  enc_string = xmalloc (enc_length + 1);
+  memcpy (enc_string, fb->contents + enc_start, enc_length);
+  enc_string[enc_length] = '\0';
+
+  fb->encoding = enc_string;
+}
+
+/* One entry of a replacement table: a byte sequence to look for and
+   the text to write in its place. */
+struct encoding_replacement
+{
+ char *from_string; /* Byte sequence matched against the input. */
+ char *to_string; /* Bytes written to the output instead. */
+};
+
+/* Read one character at *FROM and write out at *TO a sequence
+   of bytes representing that character in ASCII.  *FROM
+   and *TO are both advanced past the read/written bytes, and
+   *FROM_LEFT and *TO_LEFT are reduced by the number of bytes read and
+   written respectively.  Calling code assumes that replacement strings
+   are no more than 4 characters.
+
+   If the bytes at *FROM are not one of the known sequences (or the
+   replacement would not fit in *TO_LEFT bytes), a single byte is copied
+   across unchanged.  Nothing is done if either buffer is exhausted. */
+static void
+degrade_utf8 (char **from, size_t *from_left, char **to, size_t *to_left)
+{
+  /* Built once rather than on every call. */
+  static const struct encoding_replacement er[] = {
+    {"\xE2\x80\x98","'"},  /* Opening single quote */
+    {"\xE2\x80\x99","'"},  /* Closing single quote */
+    {"\xE2\x80\x9C","\""}, /* Opening double quote */
+    {"\xE2\x80\x9D","\""}, /* Closing double quote */
+    {"\xC2\xA9","(C)"},    /* Copyright symbol */
+    {"\xC2\xBB",">>"},     /* Closing double angle brackets */
+    {"\xE2\x86\x92","->"}, /* Right arrow */
+    {"\xC3\xA0","a`"},     /* Lower case letter a with grave accent */
+    {"\xC3\xA9","e'"},     /* Lower case letter e with acute accent */
+    {"\xC3\xA8","e`"},     /* Lower case letter e with grave accent */
+    {"\xC3\xAA","e^"},     /* Lower case letter e with circumflex */
+    {"\xC3\xAB","e\""},    /* Lower case letter e with diaeresis */
+    {0, 0}};
+
+  const struct encoding_replacement *erp;
+
+  /* Nothing can be read or written. */
+  if (*from_left == 0 || *to_left == 0)
+    return;
+
+  for (erp = er; erp->from_string != 0; erp++)
+    {
+      size_t from_len = strlen (erp->from_string);
+      size_t to_len = strlen (erp->to_string);
+
+      /* Never look past the end of the input, nor write past the end
+         of the output. */
+      if (from_len > *from_left || to_len > *to_left)
+        continue;
+
+      if (memcmp (*from, erp->from_string, from_len) != 0)
+        continue;
+
+      memcpy (*to, erp->to_string, to_len);
+      *from += from_len;
+      *from_left -= from_len; /* Input consumed, not output produced. */
+      *to += to_len;
+      *to_left -= to_len;
+      return;
+    }
+
+  /* Failing this, just copy a byte across */
+  /* FIXME: Use SUB instead (^Z)? */
+  **to = **from;
+  (*to)++; (*from)++;
+  (*to_left)--; (*from_left)--;
+}
+
+/* Convert the contents of FB from the encoding named by FB->encoding
+   to the character encoding of the current locale, replacing
+   FB->contents and FB->filesize with the converted text.
+
+   FB is left untouched when iconv is not available, when the file
+   encoding is unknown, when it already matches the locale, when no
+   conversion between the two exists, or when iconv reports an
+   unexpected error.  Characters which cannot be represented in the
+   locale encoding are replaced by an ASCII approximation (see
+   degrade_utf8). */
+static void
+convert_characters (FILE_BUFFER *fb)
+{
+#if !HAVE_ICONV
+  return;
+#else
+  long node = 0, nextnode;
+  SEARCH_BINDING binding;
+  char *target_encoding;
+
+  /* Output buffer.  It always holds NEW_CONTENTS_ALLOCATED bytes plus
+     one more reserved for the terminating null byte. */
+  char *new_contents, *outptr;
+  size_t new_contents_allocated;
+  size_t out_bytes_left;
+
+  /* Used for conversion from file encoding to output encoding */
+  iconv_t iconv_state;
+
+  /* Used for conversion from file encoding to UTF-8; only opened when
+     the file is not already in UTF-8. */
+  iconv_t iconv_to_utf8 = (iconv_t) -1;
+  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
+  char *utf8_char_ptr = utf8_char;
+
+  /* Whether file buffer is encoded in UTF-8 */
+  int file_is_in_utf8 = 0;
+
+  /* Used to check return value of iconv() */
+  size_t iconv_ret;
+
+  /* Don't process file if encoding is unknown. */
+  if (!fb->encoding) return;
+
+  /* Read name of character encoding from environment locale */
+  target_encoding = nl_langinfo (CODESET);
+
+  /* Don't convert the contents if the locale
+     uses the same character encoding as the file */
+  if (!strcasecmp (target_encoding, fb->encoding))
+    return;
+
+  /* Check if an iconv conversion from file locale to system
+     locale exists */
+  iconv_state = iconv_open (target_encoding, fb->encoding);
+  if (iconv_state == (iconv_t) -1)
+    return; /* Return if no conversion function implemented */
+
+  if (   !strcasecmp ("UTF8", fb->encoding)
+      || !strcasecmp ("UTF-8", fb->encoding))
+    file_is_in_utf8 = 1;
+
+  if (!file_is_in_utf8)
+    {
+      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
+      if (iconv_to_utf8 == (iconv_t) -1)
+        {
+          /* No conversion function implemented.  Release the
+             descriptor opened above before giving up. */
+          iconv_close (iconv_state);
+          return;
+        }
+    }
+
+  /* Allocate space for the converted file buffer (including
+     terminating null byte). */
+  new_contents = xcalloc (1, fb->filesize + 1);
+  new_contents_allocated = fb->filesize;
+  outptr = new_contents;
+  out_bytes_left = fb->filesize;
+
+  binding.buffer = fb->contents;
+  binding.start = 0;
+  binding.end = fb->filesize;
+
+  /* Convert sections of the file separated by node separators.  These
+     will be preambles, nodes, tag tables, or local variable sections.
+     We convert all of them, although probably only the nodes need to
+     be converted.
+     The second part of the condition makes us operate on the last
+     section, which does not end with a node separator. */
+  while ((nextnode = find_node_separator (&binding)) != -1
+         || (node != fb->filesize && (nextnode = fb->filesize)))
+    {
+      char *inptr;
+      size_t in_bytes_left;
+
+      /* Update search for next iteration */
+      binding.start = nextnode + 1;
+
+      /* Convert characters from node to nextnode */
+      inptr = binding.buffer + node;
+      in_bytes_left = nextnode - node;
+
+      while (inptr < binding.buffer + nextnode)
+        {
+          size_t out_offset; /* Only used when reallocating */
+
+          /* Attempt to convert node contents using iconv */
+          while (1)
+            {
+              iconv_ret = iconv (iconv_state, &inptr, &in_bytes_left,
+                                 &outptr, &out_bytes_left);
+
+              if (iconv_ret != (size_t) -1)
+                {
+                  /* Success */
+                  goto continue_node_loop;
+                }
+
+              /* There's been an error while converting. */
+              switch (errno)
+                {
+                case E2BIG:
+                  /* Ran out of space in output buffer.  Reallocate and
+                     try again, keeping the extra byte for the
+                     terminator. */
+                  out_offset = outptr - new_contents;
+                  new_contents_allocated = new_contents_allocated
+                    ? new_contents_allocated * 2 : 16;
+                  new_contents = xrealloc (new_contents,
+                                           new_contents_allocated + 1);
+
+                  /* Update outptr */
+                  outptr = new_contents + out_offset;
+                  out_bytes_left = new_contents_allocated - out_offset;
+
+                  continue;
+                case EILSEQ:
+                  /* Byte sequence in input buffer not recognized.  Degrade
+                     to ASCII instead.
+                     (FIXME: Check that output encoding
+                     is backwards compatible with ASCII). */
+                  goto degrade_to_ascii;
+                case EINVAL:
+                  /* Incomplete byte sequence at end of input buffer */
+                  goto degrade_to_ascii;
+                default:
+                  /* Unknown error - abort, leaving FB as it was and
+                     releasing everything acquired so far. */
+                  free (new_contents);
+                  goto close_descriptors;
+                }
+            }
+
+        degrade_to_ascii:
+          /* Make sure that there is enough space to write
+             replacement string.  4 bytes should be enough for one
+             character.  Loop, because doubling a very small buffer
+             once may still not leave enough room. */
+          while (out_bytes_left <= 4)
+            {
+              out_offset = outptr - new_contents;
+              new_contents_allocated = new_contents_allocated
+                ? new_contents_allocated * 2 : 16;
+              new_contents = xrealloc (new_contents,
+                                       new_contents_allocated + 1);
+
+              /* Update outptr */
+              outptr = new_contents + out_offset;
+              out_bytes_left = new_contents_allocated - out_offset;
+            }
+
+          if (file_is_in_utf8)
+            {
+              degrade_utf8 (&inptr, &in_bytes_left, &outptr, &out_bytes_left);
+            }
+          else
+            {
+              /* When a character in file cannot be represented in the output
+                 encoding, convert the character to UTF-8, then call
+                 degrade_utf8() to get an ASCII replacement. */
+
+              size_t utf8_char_free, i;
+
+              /* First convert character at read pointer to UTF-8 */
+
+              utf8_char_ptr = utf8_char;
+
+              /* We want to read exactly one character.  Do this by
+                 restricting size of output buffer. */
+              for (i = 1; i <= 4; i++)
+                {
+                  utf8_char_free = i;
+                  iconv_ret = iconv (iconv_to_utf8, &inptr, &in_bytes_left,
+                                     &utf8_char_ptr, &utf8_char_free);
+                  /* If we managed to write a character */
+                  if (utf8_char_ptr > utf8_char) break;
+                }
+
+              /* errno == E2BIG if iconv ran out of output buffer,
+                 which is expected. */
+              if (iconv_ret == (size_t) -1 && errno != E2BIG)
+                {
+                  /* Character is not recognized.  Copy a single byte. */
+                  *outptr = *inptr;
+                  outptr++; inptr++;
+                  out_bytes_left--; in_bytes_left--;
+                }
+              else
+                {
+                  utf8_char_ptr = utf8_char;
+                  /* I stands in for the number of bytes available in
+                     utf8_char; its value after this call doesn't
+                     matter. */
+                  degrade_utf8 (&utf8_char_ptr, &i,
+                                &outptr, &out_bytes_left);
+                }
+            }
+        }
+    continue_node_loop:
+      node = nextnode;
+      node += skip_whitespace (binding.buffer + node);
+    }
+
+  /* The buffer always has one byte beyond NEW_CONTENTS_ALLOCATED, so
+     there is room for the terminator here. */
+  *outptr = '\0';
+
+  free (fb->contents);
+  fb->contents = new_contents;
+  fb->filesize = outptr - new_contents;
+
+ close_descriptors:
+  iconv_close (iconv_state);
+  if (iconv_to_utf8 != (iconv_t) -1)
+    iconv_close (iconv_to_utf8);
+#endif /* HAVE_ICONV */
+}
+
/* The workhorse function for info_load_file (). Non-zero second argument
says to build a list of tags (or nodes) for this file. This is the
default behaviour when info_load_file () is called, but it is not
@@ -397,7 +708,14 @@
file_buffer->contents = contents;
if (compressed)
file_buffer->flags |= N_IsCompressed;
+
+ /* Find encoding of file, if set */
+ get_file_character_encoding (file_buffer);
+ /* Convert characters in file buffer to current locale as much
+ * as possible. */
+ convert_characters (file_buffer);
+
/* If requested, build the tags and nodes for this file buffer. */
if (get_tags)
build_tags_and_nodes (file_buffer);
diff -u -x 'Makefile*' -x '*.o' -x '*~' -u texinfo/trunk/info/nodes.h info-locale-5405/trunk/info/nodes.h
--- texinfo/trunk/info/nodes.h 2013-12-28 17:11:03.000000000 +0000
+++ info-locale-5405/trunk/info/nodes.h 2014-02-06 19:09:11.000000000 +0000
@@ -72,6 +72,8 @@
#define TAGS_TABLE_BEG_LABEL "Tag Table:\n"
#define INDIRECT_TAGS_TABLE_LABEL "Indirect:\n"
#define TAGS_TABLE_IS_INDIRECT_LABEL "(Indirect)"
+#define LOCAL_VARIABLES_LABEL "Local Variables"
+#define CHARACTER_ENCODING_LABEL "coding:"
/* Character constants. */
#define INFO_COOKIE '\037'
@@ -112,7 +114,9 @@
TAG **tags; /* If non-null, the indirect tags table. */
size_t tags_slots; /* Number of slots allocated for TAGS. */
int flags; /* Various flags. Mimics of N_* flags. */
+ char *encoding; /* Name of character encoding of file. */
} FILE_BUFFER;
+
/* Externally visible functions. */