[elinks-dev] Encodings in xbel

Witold Filipczyk Mon, 25 Aug 2008 23:53:31 -0700

Hej,
Here are two patches which slightly change the XBEL bookmarks backend.
These patches make it possible to read bookmarks in XBEL format in any
encoding supported by ELinks, and to save them in UTF-8.
Note that if the user has a codepage other than UTF-8 and uses
non-ASCII characters in bookmarks, it won't work right.


Witek

diff --git a/src/intl/charsets.c b/src/intl/charsets.c
index de853b9..03f7f2b 100644
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -29,43 +29,6 @@
 #include "util/memory.h"
 #include "util/string.h"
 
-
-/* Fix namespace clash on MacOS. */
-#define table table_elinks
-
-struct table_entry {
-       unsigned char c;
-       /* This should in principle be unicode_val_T, but because all
-        * the values currently in codepage.inc fit in 16 bits, we can
-        * as well use uint16_t and halve sizeof(struct table_entry)
-        * from 8 bytes to 4.  Should other characters ever be needed,
-        * unicode_val_T u : 24 might be a possibility, although it
-        * seems a little unportable as bitfields are in principle
-        * restricted to int, which may be 16-bit.  */
-       uint16_t u;
-};
-
-struct codepage_desc {
-       unsigned char *name;
-       unsigned char *const *aliases;
-
-       /* The Unicode mappings of codepage bytes 0x80...0xFF.
-        * (0x00...0x7F are assumed to be ASCII in all codepages.)
-        * Because all current values fit in 16 bits, we store them as
-        * uint16_t rather than unicode_val_T.  If the codepage does
-        * not use some byte, then @highhalf maps that byte to 0xFFFF,
-        * which C code converts to UCS_REPLACEMENT_CHARACTER where
-        * appropriate.  (U+FFFF is reserved and will never be
-        * assigned as a character.)  */
-       const uint16_t *highhalf;
-
-       /* If some byte in the codepage corresponds to multiple Unicode
-        * characters, then the preferred character is in @highhalf
-        * above, and the rest are listed here in @table.  This table
-        * is not used for translating from the codepage to Unicode.  */
-       const struct table_entry *table;
-};
-
 #include "intl/codepage.inc"
 #include "intl/uni_7b.inc"
 #include "intl/entity.inc"
diff --git a/src/intl/charsets.h b/src/intl/charsets.h
index d87e2ee..478c539 100644
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -74,6 +74,43 @@ struct conv_table {
        } u;
 };
 
+#define table table_elinks
+
+struct table_entry {
+       unsigned char c;
+       /* This should in principle be unicode_val_T, but because all
+        * the values currently in codepage.inc fit in 16 bits, we can
+        * as well use uint16_t and halve sizeof(struct table_entry)
+        * from 8 bytes to 4.  Should other characters ever be needed,
+        * unicode_val_T u : 24 might be a possibility, although it
+        * seems a little unportable as bitfields are in principle
+        * restricted to int, which may be 16-bit.  */
+       uint16_t u;
+};
+
+struct codepage_desc {
+       unsigned char *name;
+       unsigned char *const *aliases;
+
+       /* The Unicode mappings of codepage bytes 0x80...0xFF.
+        * (0x00...0x7F are assumed to be ASCII in all codepages.)
+        * Because all current values fit in 16 bits, we store them as
+        * uint16_t rather than unicode_val_T.  If the codepage does
+        * not use some byte, then @highhalf maps that byte to 0xFFFF,
+        * which C code converts to UCS_REPLACEMENT_CHARACTER where
+        * appropriate.  (U+FFFF is reserved and will never be
+        * assigned as a character.)  */
+       const uint16_t *highhalf;
+
+       /* If some byte in the codepage corresponds to multiple Unicode
+        * characters, then the preferred character is in @highhalf
+        * above, and the rest are listed here in @table.  This table
+        * is not used for translating from the codepage to Unicode.  */
+       const struct table_entry *table;
+};
+
+extern const struct codepage_desc codepages[];
+
 enum convert_string_mode {
        CSM_DEFAULT, /* Convert any char. */
        CSM_QUERY, /* Special handling of '&' and '=' chars. */

diff --git a/src/bookmarks/backend/xbel.c b/src/bookmarks/backend/xbel.c
index 432d3ba..1aa3ba6 100644
--- a/src/bookmarks/backend/xbel.c
+++ b/src/bookmarks/backend/xbel.c
@@ -82,6 +82,23 @@ static struct tree_node *current_node = NULL;
  * different format. */
 static int readok = 1;
 
+static int XMLCALL
+unknown_encoding(void *data, const char *name, XML_Encoding *info)
+{
+       int index = get_cp_index(name);
+       int i;
+
+       if (index < 0) return XML_STATUS_ERROR;
+       for (i = 0; i < 128; i++) {
+               info->map[i] = i;
+       }
+       for (; i < 256; i++) {
+               info->map[i] = codepages[index].highhalf[i - 128];
+       }
+       return XML_STATUS_OK;
+}
+
+
 static void
 read_bookmarks_xbel(FILE *f)
 {
@@ -100,6 +117,7 @@ read_bookmarks_xbel(FILE *f)
 
        XML_SetElementHandler(p, on_element_open, on_element_close);
        XML_SetCharacterDataHandler(p, on_text);
+       XML_SetUnknownEncodingHandler(p, unknown_encoding, NULL);
 
        while (!done && !err) {
                size_t len = fread(in_buffer, 1, BUFSIZ, f);
@@ -168,6 +186,7 @@ indentation(struct secure_save_info *ssi, int num)
                secure_fputs(ssi, "    ");
 }
 
+#if 0
 /* FIXME This is totally broken, we should use the Unicode value in
  *       numeric entities.
  *       Additionally it is slow, not elegant, incomplete and
@@ -205,6 +224,7 @@ print_xml_entities(struct secure_save_info *ssi, const unsigned char *str)
 #undef accept_char
 
 }
+#endif
 
 static void
 write_bookmarks_list(struct secure_save_info *ssi,
@@ -225,7 +245,7 @@ write_bookmarks_list(struct secure_save_info *ssi,
 
                        indentation(ssi, n + 2);
                        secure_fputs(ssi, "<title>");
-                       print_xml_entities(ssi, bm->title);
+                       secure_fputs(ssi, bm->title);
                        secure_fputs(ssi, "</title>\n");
 
                        if (!list_empty(bm->child))
@@ -237,12 +257,12 @@ write_bookmarks_list(struct secure_save_info *ssi,
                } else if (bm->box_item->type == BI_LEAF) {
 
                        secure_fputs(ssi, "<bookmark href=\"");
-                       print_xml_entities(ssi, bm->url);
+                       secure_fputs(ssi, bm->url);
                        secure_fputs(ssi, "\">\n");
 
                        indentation(ssi, n + 2);
                        secure_fputs(ssi, "<title>");
-                       print_xml_entities(ssi, bm->title);
+                       secure_fputs(ssi, bm->title);
                        secure_fputs(ssi, "</title>\n");
 
                        indentation(ssi, n + 1);

_______________________________________________
elinks-dev mailing list
[email protected]
http://linuxfromscratch.org/mailman/listinfo/elinks-dev

[elinks-dev] Encodings in xbel

Reply via email to