The attached patch implements some support for changing from a UTF-8 encoded file to an ASCII one. This is against SVN version 5400. set_file_lc_ctype() checks if the file contains a Local Variables section with a coding line. Then the convert_characters() function can substitute characters in the file if the document encoding is different from the encoding from the environment. At the moment only three characters in UTF-8 are implemented.
I used the strcasecmp and nl_langinfo functions; I'm not sure if they are standard. I've attached a file I used to test this patch - it gives the expected different behaviour in UTF-8 and Latin 1 terminals for me. On Wed, Jan 1, 2014 at 12:15 AM, Karl Berry <[email protected]> wrote: > In my experience, the problem is not specific to Info and not specific > to quotes. If I run cat or more or ... on a UTF-8 file in a non-UTF-8 > terminal, characters are dropped and the result beyond 7-bit ASCII is > garbled. > > This has always seemed like a fundamental problem in UTF-8 usage to me, > one that would be better addressed at the terminal level, so at least > one can always see the bytes, if not the "best possible" > transliteration, without every single program that writes to stdout > having to implement the same thing. But since nothing like that is > going to happen, I suppose Info should somehow deal with it, just like > every other program in the world. Sigh. Patches are welcome. > > As for controlling the output of quotes by makeinfo, an option could be > invented, but I am not inclined to change the default behavior so I'm > not convinced it has much utility. We changed it in the first place > because of vociferous complaints about getting ASCII quotes even with > @documentencoding UTF-8. And after all, there is some logic to using > UTF-8 quotes when the document says it wants UTF-8. It's no different > in principle than accented letters. > > At any rate, the best answer, IMHO, not requiring any changes to any > programs, is simply not to use @documentencoding UTF-8 unless one > actually needs it, which should be never in English-language manuals. > 7-bit ASCII source with Texinfo @-commands is preferable. These days > many people reflexively think that UTF-8 is wonderful, always use it, > and want to inflict it on everyone else too, but that is simply wrong. > > karl >
diff -x 'Makefile*' -x '*.o' -x '*~' texinfo/trunk/info/nodes.c info-locale/info/nodes.c
29a30,31
> #include <nl_types.h>
> #include <langinfo.h>
44a47,48
> static void set_file_lc_ctype (FILE_BUFFER *fb);
> static void convert_characters (FILE_BUFFER *fb);
328a333,485
> char *locale_names[] = { "US-ASCII", "UTF-8", "ISO-8859-1", "ISO-8859-2",
> "ISO-8859-15", 0 }; /* 0-terminated; FILE_BUFFER.lc_ctype indexes this */
>
> /* Look for a local variables section in FB.  If it has a "coding:" entry
>    naming one of locale_names, store that name's index in FB->lc_ctype;
>    otherwise leave FB->lc_ctype as ENC_UNKNOWN. */
> static void
> set_file_lc_ctype (FILE_BUFFER *fb)
> {
>   SEARCH_BINDING binding;
>   long position;
>   long int enc_start, enc_end;
>   char *enc_string;
>   char **locale_name;
>
>   fb->lc_ctype = ENC_UNKNOWN;
>
>   /* See if there is a local variables section in this info file. */
>   binding.buffer = fb->contents;
>   binding.start = fb->filesize;
>   binding.end = binding.start - 1000;
>   if (binding.end < 0)
>     binding.end = 0;
>   binding.flags = S_FoldCase;
>   if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
>       != search_success)
>     return;
>
>   binding.start = position;
>   binding.end = fb->filesize;
>   if (search_forward ("coding:", &binding, &enc_start) != search_success)
>     return;
>   enc_start += 7; /* Skip to after "coding:" */
>   enc_start += skip_whitespace (fb->contents + enc_start);
>   binding.start = enc_start;
>
>   /* The name ends at the newline, or at end of file if the last line has
>      none; without this check ENC_END could be used uninitialized. */
>   if (search_forward ("\n", &binding, &enc_end) != search_success)
>     enc_end = fb->filesize;
>   if (enc_end < enc_start)
>     return;
>   enc_string = xmalloc (enc_end - enc_start + 1);
>   memcpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
>   enc_string[enc_end - enc_start] = '\0';
>   for (locale_name = locale_names; *locale_name != 0; locale_name++)
>     if (!strcasecmp (enc_string, *locale_name))
>       fb->lc_ctype = locale_name - locale_names;
>   free (enc_string); /* Only needed for the comparison above. */
> }
>
> /* The degrade functions replace one character in their encoding at *C with
> * an ASCII equivalent and return the number of bytes saved (the original
> * length minus the replacement length). *C is left pointing at the last byte
> * written. A replacement longer than the original is not yet possible. */
>
> static int
> degrade_dummy (char **c) { (void) c; return 0; } /* No-op: nothing shrinks */
>
> struct encoding_replacement
> {
> char *from_string; /* Byte sequence to match in the text */
> char *to_string; /* Replacement written over the match */
> };
>
> /* If *C starts one of the UTF-8 sequences below, overwrite it with its ASCII
>    replacement, leave *C on the last byte written, return the bytes saved. */
> static int
> degrade_utf8 (char **c)
> {
>   static const struct encoding_replacement er[] = { /* built only once */
>     {"\xc3\xb6","o"}, /* lower-case o with umlaut */
>     {"\xe2\x80\x98","'"}, /* Opening quote */
>     {"\xe2\x80\x99","'"}, /* Closing quote */
>     {0, 0}};
>   const struct encoding_replacement *erp;
>   for (erp = er; erp->from_string != 0; erp++)
>     {
>       size_t from_len = strlen (erp->from_string);
>       size_t to_len = strlen (erp->to_string);
>       if (strncmp (*c, erp->from_string, from_len) != 0) continue;
>       memcpy (*c, erp->to_string, to_len); /* exact length, no terminator */
>       *c += to_len - 1;
>       return (int) (from_len - to_len);
>     }
>   return 0; /* No sequence matched; nothing was changed. */
> }
>
> /* Convert characters in FB's contents to the current locale.  Nothing is
>    done when FB's encoding is unknown, when it already matches
>    nl_langinfo (CODESET), or when no degrade function exists for it.
>    Otherwise the contents are rewritten in place and FB->filesize is
>    reduced by the number of bytes removed. */
> static void
> convert_characters (FILE_BUFFER *fb)
> {
>   char *src;
>   char *dst;
>   char *end;
>   char *to_locale;
>
>   /* Indexed by FB->lc_ctype, so entries are in locale_names order. */
>   int (*degrade_funcs[5])(char **) = {
>     degrade_dummy, degrade_utf8, degrade_dummy,
>     degrade_dummy, degrade_dummy };
>
>   int (*degrade)(char **);
>
>   /* Without a known file encoding there is nothing to convert from. */
>   if (fb->lc_ctype == ENC_UNKNOWN)
>     return;
>
>   /* Read environment locale */
>
>   to_locale = nl_langinfo(CODESET);
>
>   /* Don't degrade the contents if we are in fact
>    * in the right locale for the file */
>   if (!strcasecmp(to_locale, locale_names[fb->lc_ctype]))
>     return;
>
>   degrade = degrade_funcs [fb->lc_ctype];
>
>   /* Return if no conversion function implemented */
>   if (degrade == degrade_dummy) return;
>
>   /* Make a single pass over the whole buffer: SRC reads the original
>    * bytes and DST writes the compacted result behind it.  Node separators
>    * need no special treatment because every byte is examined anyway, and
>    * no buffer offset is kept across a removal, so none can go stale when
>    * the contents shrink. */
>   src = dst = fb->contents;
>   end = fb->contents + fb->filesize;
>
>   while (src < end)
>     {
>       char *last = src;
>       int shrink_by = degrade (&last);
>       long kept = last - src + 1;
>
>       /* DEGRADE left LAST on the final byte it wrote (or on SRC itself if
>        * nothing matched), so KEPT bytes at SRC are output and the
>        * SHRINK_BY bytes following them are leftovers to drop. */
>       if (dst != src)
>         memmove (dst, src, kept);
>       dst += kept;
>       src += kept + shrink_by;
>     }
>
>   /* Everything between DST and the old end is now unused. */
>   fb->filesize = dst - fb->contents;
> }
>
399a557,559
>
> /* Find encoding of file, if set */
> set_file_lc_ctype(file_buffer);
400a561,564
> /* Convert characters in file buffer to current locale as much
> * as possible. */
> convert_characters (file_buffer);
>
diff -x 'Makefile*' -x '*.o' -x '*~' texinfo/trunk/info/nodes.h info-locale/info/nodes.h
74a75
> #define LOCAL_VARIABLES_LABEL "Local Variables"
114a116
> int lc_ctype; /* Encoding - index into locale_names */
115a118,124
>
> /* Null-terminated array of strings naming locales that lc_ctype indexes */
> extern char *locale_names[];
>
> /* Value of FILE_BUFFER.lc_ctype if encoding is unknown */
> #define ENC_UNKNOWN -1
>
utf8.info
Description: Binary data
