[Latex2rtf-users] Some bugfixes in Latex2RTF

itkes Wed, 14 Mar 2018 03:40:38 -0700

Hello.

For me Latex2RTF is no doubt the best tool to transform TeX files to
Word-style docs, but nothing is perfect. There are two disadvantages that
make its usage more difficult for me.


1. If a verbatim block contains non-ascii characters, they are ignored on
output.
2. If a theorem caption (defined with newtheorem command) contains
non-ascii characters, they are also ignored on output.

Luckily both disadvantages are quite easy to correct, so I have written a
patch to fix them. I can't be completely sure I didn't break anything, but
all tests pass and this patch does exactly what I want. If it works
well, I would be happy to see these changes applied in the next
Latex2RTF release.

With best regards,

Alex Itkes.

diff -urN latex2rtf-2.3.16/funct1.c latex2rtf-2.3.16-alexpatch-0.2/funct1.c
--- latex2rtf-2.3.16/funct1.c	2015-11-19 22:37:55.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/funct1.c	2018-03-12 14:59:01.000000000 +0300
@@ -50,6 +50,7 @@
 #include "styles.h"
 #include "graphics.h"
 #include "vertical.h"
+#include "utf8_support.h"
 
 #define ARABIC_NUMBERING 0
 #define ALPHA_NUMBERING  1
@@ -257,7 +258,13 @@
             CmdEndParagraph(0);
             CmdVspace(VSPACE_SMALL_SKIP);
             startParagraph("theorem", PARAGRAPH_FIRST);
-            fprintRTF("{\\b %s} {\\i ", str);
+            fprintRTF("{\\b ");
+            if (CurrentFontEncoding() == ENCODING_UTF8) {
+                putRtfUtf8StrEscaped(str);
+            } else {
+                putRtfStrEscaped(str);
+            }
+            fprintRTF("} {\\i ");
             PushBrace();
             if (option)
                 free(option);
@@ -1182,7 +1189,11 @@
         else if (true_code == VERBATIM_1 || true_code == VERBATIM_2) {
 
             show_string(5, verbatim_text, "verbatim");
-            putRtfStrEscaped(verbatim_text);
+            if (CurrentFontEncoding() == ENCODING_UTF8) {
+                putRtfUtf8StrEscaped(verbatim_text);
+            } else {
+                putRtfStrEscaped(verbatim_text);
+            }
         }
 
         free(verbatim_text);
diff -urN latex2rtf-2.3.16/Makefile latex2rtf-2.3.16-alexpatch-0.2/Makefile
--- latex2rtf-2.3.16/Makefile	2014-03-14 22:09:26.000000000 +0400
+++ latex2rtf-2.3.16-alexpatch-0.2/Makefile	2018-03-12 15:04:05.000000000 +0300
@@ -53,7 +53,7 @@
 	main.c stack.c cfg.c utils.c parser.c lengths.c counters.c letterformat.c \
 	preamble.c equations.c convert.c xrefs.c definitions.c graphics.c \
 	mygetopt.c styles.c preparse.c vertical.c fields.c \
-	labels.c biblio.c acronyms.c auxfile.c
+	labels.c biblio.c acronyms.c auxfile.c utf8_support.c
 
 HDRS=commands.h chars.h direct.h encodings.h fonts.h funct1.h tables.h ignore.h \
     main.h stack.h cfg.h utils.h parser.h lengths.h counters.h letterformat.h \
@@ -143,7 +143,7 @@
 	chars.o ignore.o cfg.o main.o utils.o parser.o lengths.o counters.o \
 	preamble.o letterformat.o equations.o convert.o xrefs.o definitions.o graphics.o \
 	mygetopt.o styles.o preparse.o vertical.o fields.o \
-	labels.o biblio.o auxfile.o	acronyms.o
+	labels.o biblio.o auxfile.o acronyms.o utf8_support.o
 
 all : checkdir latex2rtf    # Windows: remove "checkdir"
 
diff -urN latex2rtf-2.3.16/utf8_support.c latex2rtf-2.3.16-alexpatch-0.2/utf8_support.c
--- latex2rtf-2.3.16/utf8_support.c	1970-01-01 03:00:00.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/utf8_support.c	2018-03-12 15:03:04.000000000 +0300
@@ -0,0 +1,101 @@
+/* utf8_support.c - LaTeX to RTF conversion program
+
+This file contains a function used to convert verbatim sections containing
+Utf-8 characters properly.
+
+Authors:
+    2018 Alex Itkes
+*/
+
+#include "main.h"
+#include "chars.h"
+#include "encodings.h"
+
+/* Returns the payload bits carried by the lead byte of a UTF-8 sequence:
+   the byte with its top four (>= 0xF0), three (>= 0xE0) or two (>= 0xC0)
+   bits cleared, or 0 for any byte below 0xC0. The caller then feeds each
+   following byte of the sequence to updateUtf8Sequence; the number of
+   such bytes is given by getUtf8SequenceLength.
+*/
+uint16_t getUtf8SequenceInitialValue (uint8_t byte)
+{
+    if (byte >= 0xF0) {
+        return byte & ~0xF0;
+    } else if (byte >= 0xE0) {
+        return byte & ~0xE0;
+    } else if (byte >= 0xC0) {
+        return byte & ~0xC0;
+    } else {
+        return 0;
+    }
+}
+
+/* Returns the number of bytes that follow the given lead byte in its
+   UTF-8 sequence, i.e. the sequence length minus one: 3 for bytes
+   >= 0xF0, 2 for >= 0xE0, 1 for >= 0xC0 and 0 for anything lower.
+*/
+uint16_t getUtf8SequenceLength (uint8_t byte)
+{
+    if (byte >= 0xF0) {
+        return 3;
+    } else if (byte >= 0xE0) {
+        return 2;
+    } else if (byte >= 0xC0) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/* Appends one following byte of a UTF-8 sequence to the value built so
+   far: shifts value left by 6 bits, adds the low 6 bits of byte, and
+   returns the result as a uint16_t. */
+uint16_t updateUtf8Sequence (uint16_t value, uint8_t byte)
+{
+    return ((value << 6) + (byte & ~0xC0));
+}
+
+/* Prints a string to RTF, escaping characters where needed. Unlike
+   putRtfStrEscaped, it decodes UTF-8 sequences (when the current font
+   encoding is UTF-8) and emits them via CmdUnicodeChar. NULL is ignored. */
+void putRtfUtf8StrEscaped(const char * string)
+{
+    char *s = (char *) string;
+    if (string == NULL) return;
+    while (*s) {
+        /* Much of this code was copied from the Convert() function, with
+           parts moved into helper functions (getUtf8SequenceLength,
+           getUtf8SequenceInitialValue and updateUtf8Sequence).
+           TODO: Would it be a good idea to call them from Convert() too?
+        */
+        if ((uint8_t)(*s) >= 0x80 && (CurrentFontEncoding() == ENCODING_UTF8) && *(s + 1)) {
+            /* Handle a UTF-8 byte sequence: decode it to a Unicode character
+               the same way Convert() does and output it with CmdUnicodeChar.
+               NOTE(review): a lead byte in 0x80..0xBF gives len 0 and value
+               0 here, so CmdUnicodeChar(0) is called - confirm intended. */
+            uint8_t byte = *s;
+
+            uint16_t len = getUtf8SequenceLength (byte);
+            uint16_t value = getUtf8SequenceInitialValue (byte);
+            uint16_t i;
+
+            s++;
+            for (i=0; i<len; i++) {
+                if (*s) {
+                    value = updateUtf8Sequence (value, *s);
+                    s++;
+                } else {
+                    /* If the sequence is shorter than it should be, display a warning and stop; the rest of the string is not output. */
+                    diagnostics(1, "An incorrect Utf-8 sequence encountered at end of string '%s', continuing anyway...", string);
+                    return;
+                }
+            }
+
+            diagnostics(4,"(flag = 0x%X) char value = 0X%04X or %u (%u bytes)", (unsigned char) byte, value, value, len);
+            CmdUnicodeChar(value);
+        } else {
+            /* Not decoded as UTF-8: hand the single byte to putRtfCharEscaped. */
+            putRtfCharEscaped(*s++);
+        }
+    }
+}
diff -urN latex2rtf-2.3.16/utf8_support.h latex2rtf-2.3.16-alexpatch-0.2/utf8_support.h
--- latex2rtf-2.3.16/utf8_support.h	1970-01-01 03:00:00.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/utf8_support.h	2018-03-10 14:35:54.000000000 +0300
@@ -0,0 +1,10 @@
+#ifndef _UTF8_SUPPORT_H_INCLUDED
+#define _UTF8_SUPPORT_H_INCLUDED 1
+
+uint16_t getUtf8SequenceInitialValue (uint8_t byte);   /* payload bits of a lead byte */
+uint16_t getUtf8SequenceLength (uint8_t byte);         /* number of following bytes */
+uint16_t updateUtf8Sequence (uint16_t value, uint8_t byte); /* append one following byte; name matches the definition in utf8_support.c */
+
+void putRtfUtf8StrEscaped(const char * string);        /* print string to RTF, decoding UTF-8 sequences */
+
+#endif

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot

_______________________________________________
Latex2rtf-users mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/latex2rtf-users

[Latex2rtf-users] Some bugfixes in Latex2RTF

Reply via email to