Hello.
For me, LaTeX2RTF is without doubt the best tool for converting TeX files to
Word-style documents, but nothing is perfect. There are two disadvantages that
make it harder for me to use.
1. If a verbatim block contains non-ASCII characters, they are omitted from
the output.
2. If a theorem caption (defined with the \newtheorem command) contains
non-ASCII characters, they are also omitted from the output.
Luckily, both disadvantages are quite easy to correct, so I have written a
patch to fix them. I can't be completely sure I didn't break anything, but
all tests pass and this patch does exactly what I want. If it works
well, I would be happy to see these changes applied in the next
LaTeX2RTF release.
With best regards,
Alex Itkes.
diff -urN latex2rtf-2.3.16/funct1.c latex2rtf-2.3.16-alexpatch-0.2/funct1.c
--- latex2rtf-2.3.16/funct1.c 2015-11-19 22:37:55.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/funct1.c 2018-03-12 14:59:01.000000000 +0300
@@ -50,6 +50,7 @@
#include "styles.h"
#include "graphics.h"
#include "vertical.h"
+#include "utf8_support.h"
#define ARABIC_NUMBERING 0
#define ALPHA_NUMBERING 1
@@ -257,7 +258,13 @@
CmdEndParagraph(0);
CmdVspace(VSPACE_SMALL_SKIP);
startParagraph("theorem", PARAGRAPH_FIRST);
- fprintRTF("{\\b %s} {\\i ", str);
+ fprintRTF("{\\b ");
+ if (CurrentFontEncoding() == ENCODING_UTF8) {
+ putRtfUtf8StrEscaped(str);
+ } else {
+ putRtfStrEscaped(str);
+ }
+ fprintRTF("} {\\i ");
PushBrace();
if (option)
free(option);
@@ -1182,7 +1189,11 @@
else if (true_code == VERBATIM_1 || true_code == VERBATIM_2) {
show_string(5, verbatim_text, "verbatim");
- putRtfStrEscaped(verbatim_text);
+ if (CurrentFontEncoding() == ENCODING_UTF8) {
+ putRtfUtf8StrEscaped(verbatim_text);
+ } else {
+ putRtfStrEscaped(verbatim_text);
+ }
}
free(verbatim_text);
diff -urN latex2rtf-2.3.16/Makefile latex2rtf-2.3.16-alexpatch-0.2/Makefile
--- latex2rtf-2.3.16/Makefile 2014-03-14 22:09:26.000000000 +0400
+++ latex2rtf-2.3.16-alexpatch-0.2/Makefile 2018-03-12 15:04:05.000000000 +0300
@@ -53,7 +53,7 @@
main.c stack.c cfg.c utils.c parser.c lengths.c counters.c letterformat.c \
preamble.c equations.c convert.c xrefs.c definitions.c graphics.c \
mygetopt.c styles.c preparse.c vertical.c fields.c \
- labels.c biblio.c acronyms.c auxfile.c
+ labels.c biblio.c acronyms.c auxfile.c utf8_support.c
HDRS=commands.h chars.h direct.h encodings.h fonts.h funct1.h tables.h ignore.h \
main.h stack.h cfg.h utils.h parser.h lengths.h counters.h letterformat.h \
@@ -143,7 +143,7 @@
chars.o ignore.o cfg.o main.o utils.o parser.o lengths.o counters.o \
preamble.o letterformat.o equations.o convert.o xrefs.o definitions.o graphics.o \
mygetopt.o styles.o preparse.o vertical.o fields.o \
- labels.o biblio.o auxfile.o acronyms.o
+ labels.o biblio.o auxfile.o acronyms.o utf8_support.o
all : checkdir latex2rtf # Windows: remove "checkdir"
diff -urN latex2rtf-2.3.16/utf8_support.c latex2rtf-2.3.16-alexpatch-0.2/utf8_support.c
--- latex2rtf-2.3.16/utf8_support.c 1970-01-01 03:00:00.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/utf8_support.c 2018-03-12 15:03:04.000000000 +0300
@@ -0,0 +1,101 @@
+/* utf8_support.c - LaTeX to RTF conversion program
+
+This file contains a function used to convert verbatim sections containing
+Utf-8 characters properly.
+
+Authors:
+ 2018 Alex Itkes
+*/
+
+#include "main.h"
+#include "chars.h"
+#include "encodings.h"
+
+/* Initializes a Unicode character from the first byte of its Utf-8 sequence:
+ returns that byte with the 0xF0, 0xE0 or 0xC0 bits masked off (whichever
+ threshold it reaches first), or 0 for a byte below 0xC0. The returned value
+ must then be updated with updateUtf8Sequence once per following byte of the
+ input stream; getUtf8SequenceLength returns how many such bytes there are.
+*/
+uint16_t getUtf8SequenceInitialValue (uint8_t byte)
+{
+ if (byte >= 0xF0) {
+ return byte & ~0xF0; /* keep the low four bits */
+ } else if (byte >= 0xE0) {
+ return byte & ~0xE0; /* keep the low five bits */
+ } else if (byte >= 0xC0) {
+ return byte & ~0xC0; /* keep the low six bits */
+ } else {
+ return 0; /* byte below 0xC0: nothing to start from */
+ }
+}
+
+/* Determines the length of a Utf-8 sequence based on its first byte.
+ Actually returns the length decreased by 1, i.e. the number of bytes
+ to be read later: 3, 2 or 1 for a byte >= 0xF0, 0xE0 or 0xC0, otherwise 0.
+*/
+uint16_t getUtf8SequenceLength (uint8_t byte)
+{
+ if (byte >= 0xF0) {
+ return 3;
+ } else if (byte >= 0xE0) {
+ return 2;
+ } else if (byte >= 0xC0) {
+ return 1;
+ } else {
+ return 0; /* byte below 0xC0: no further bytes are consumed */
+ }
+}
+
+/* Adds a following byte of a Utf-8 sequence to the Unicode character: shifts
+ value left by 6 bits, adds the byte with its top two bits cleared, and returns the resulting code.
+*/
+uint16_t updateUtf8Sequence (uint16_t value, uint8_t byte)
+{
+ return ((value << 6) + (byte & ~0xC0)); /* NOTE(review): the uint16_t return keeps only the low 16 bits, so codes above 0xFFFF are truncated - confirm this is acceptable */
+}
+
+/* Prints a string to RTF with escaped characters if needed. Unlike
+ putRtfStrEscaped it prints Utf8 characters properly. Does nothing when string is NULL.
+*/
+void putRtfUtf8StrEscaped(const char * string)
+{
+ char *s = (char *) string;
+ if (string == NULL) return;
+ while (*s) {
+ /* Much of the code is actually copied from the Convert () function, but
+ some of it was moved to additional functions (getUtf8SequenceLength,
+ getUtf8SequenceInitialValue and updateUtf8Sequence).
+ TODO: Could it be a good idea to call them also from Convert?
+ */
+ if ((uint8_t)(*s) >= 0x80 && (CurrentFontEncoding() == ENCODING_UTF8) && *(s + 1)) {
+ /* Handle a Utf-8 byte sequence. Try to convert it to a Unicode
+ character in the same way as the Convert function does and then
+ output it with CmdUnicodeChar.
+ */
+ uint8_t byte = *s;
+
+ uint16_t len = getUtf8SequenceLength (byte);
+ uint16_t value = getUtf8SequenceInitialValue (byte);
+ uint16_t i;
+
+ s++;
+ for (i=0; i<len; i++) {
+ if (*s) {
+ value = updateUtf8Sequence (value, *s);
+ s++;
+ } else {
+ /* If the sequence is shorter than it should be, display a warning and return without printing this character or anything after it. */
+ diagnostics(1, "An incorrect Utf-8 sequence encountered at end of string '%s', continuing anyway...", string);
+ return;
+ }
+ }
+
+ diagnostics(4,"(flag = 0x%X) char value = 0X%04X or %u (%u bytes)", (unsigned char) byte, value, value, len);
+ CmdUnicodeChar(value); /* NOTE(review): a first byte in 0x80..0xBF gives len 0 and value 0, so 0 is passed here - confirm intended */
+ } else {
+ /* Byte below 0x80, encoding other than Utf8, or a high byte that is the last one in the string: print it as a single escaped character. */
+ putRtfCharEscaped(*s++);
+ }
+ }
+}
diff -urN latex2rtf-2.3.16/utf8_support.h latex2rtf-2.3.16-alexpatch-0.2/utf8_support.h
--- latex2rtf-2.3.16/utf8_support.h 1970-01-01 03:00:00.000000000 +0300
+++ latex2rtf-2.3.16-alexpatch-0.2/utf8_support.h 2018-03-10 14:35:54.000000000 +0300
@@ -0,0 +1,10 @@
+#ifndef _UTF8_SUPPORT_H_INCLUDED
+#define _UTF8_SUPPORT_H_INCLUDED 1
+
+uint16_t getUtf8SequenceInitialValue (uint8_t byte); /* first byte of a Utf-8 sequence -> initial character value */
+uint16_t getUtf8SequenceLength (uint8_t byte); /* first byte -> number of bytes that still follow */
+uint16_t updateUtf8Sequence (uint16_t value, uint8_t byte); /* adds one following byte to value */
+/* Prints a string to RTF with escaping; Utf-8 sequences are passed to CmdUnicodeChar. */
+void putRtfUtf8StrEscaped(const char * string);
+
+#endif
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Latex2rtf-users mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/latex2rtf-users