Hej,
Here are two patches that slightly change the XBEL bookmarks backend.
These patches let ELinks read bookmarks in XBEL format in any encoding
it supports, and save them in UTF-8.
Note, however, that if the user has a codepage other than UTF-8 and uses
non-ASCII characters in bookmarks, it won't work right.
Witek
diff --git a/src/intl/charsets.c b/src/intl/charsets.c
index de853b9..03f7f2b 100644
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -29,43 +29,6 @@
#include "util/memory.h"
#include "util/string.h"
-
-/* Fix namespace clash on MacOS. */
-#define table table_elinks
-
-struct table_entry {
- unsigned char c;
- /* This should in principle be unicode_val_T, but because all
- * the values currently in codepage.inc fit in 16 bits, we can
- * as well use uint16_t and halve sizeof(struct table_entry)
- * from 8 bytes to 4. Should other characters ever be needed,
- * unicode_val_T u : 24 might be a possibility, although it
- * seems a little unportable as bitfields are in principle
- * restricted to int, which may be 16-bit. */
- uint16_t u;
-};
-
-struct codepage_desc {
- unsigned char *name;
- unsigned char *const *aliases;
-
- /* The Unicode mappings of codepage bytes 0x80...0xFF.
- * (0x00...0x7F are assumed to be ASCII in all codepages.)
- * Because all current values fit in 16 bits, we store them as
- * uint16_t rather than unicode_val_T. If the codepage does
- * not use some byte, then @highhalf maps that byte to 0xFFFF,
- * which C code converts to UCS_REPLACEMENT_CHARACTER where
- * appropriate. (U+FFFF is reserved and will never be
- * assigned as a character.) */
- const uint16_t *highhalf;
-
- /* If some byte in the codepage corresponds to multiple Unicode
- * characters, then the preferred character is in @highhalf
- * above, and the rest are listed here in @table. This table
- * is not used for translating from the codepage to Unicode. */
- const struct table_entry *table;
-};
-
#include "intl/codepage.inc"
#include "intl/uni_7b.inc"
#include "intl/entity.inc"
diff --git a/src/intl/charsets.h b/src/intl/charsets.h
index d87e2ee..478c539 100644
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -74,6 +74,43 @@ struct conv_table {
} u;
};
+/* Fix namespace clash on MacOS. */
+#define table table_elinks
+
+/* One entry of the @table array referenced by struct codepage_desc.
+ * NOTE(review): presumably @c is a codepage byte and @u is a Unicode
+ * value that should translate to it -- confirm against the code that
+ * walks the table. */
+struct table_entry {
+ unsigned char c;
+ /* This should in principle be unicode_val_T, but because all
+ * the values currently in codepage.inc fit in 16 bits, we can
+ * as well use uint16_t and halve sizeof(struct table_entry)
+ * from 8 bytes to 4. Should other characters ever be needed,
+ * unicode_val_T u : 24 might be a possibility, although it
+ * seems a little unportable as bitfields are in principle
+ * restricted to int, which may be 16-bit. */
+ uint16_t u;
+};
+
+/* Description of one codepage: its name, its aliases and the tables
+ * that relate its bytes to Unicode characters. */
+struct codepage_desc {
+ /* NOTE(review): presumably the primary name of the codepage and a
+ * list of alternative names for it; how @aliases is terminated is
+ * not visible here -- confirm in codepage.inc. */
+ unsigned char *name;
+ unsigned char *const *aliases;
+
+ /* The Unicode mappings of codepage bytes 0x80...0xFF.
+ * (0x00...0x7F are assumed to be ASCII in all codepages.)
+ * Because all current values fit in 16 bits, we store them as
+ * uint16_t rather than unicode_val_T. If the codepage does
+ * not use some byte, then @highhalf maps that byte to 0xFFFF,
+ * which C code converts to UCS_REPLACEMENT_CHARACTER where
+ * appropriate. (U+FFFF is reserved and will never be
+ * assigned as a character.) */
+ const uint16_t *highhalf;
+
+ /* If some byte in the codepage corresponds to multiple Unicode
+ * characters, then the preferred character is in @highhalf
+ * above, and the rest are listed here in @table. This table
+ * is not used for translating from the codepage to Unicode. */
+ const struct table_entry *table;
+};
+
+extern const struct codepage_desc codepages[];
+
enum convert_string_mode {
CSM_DEFAULT, /* Convert any char. */
CSM_QUERY, /* Special handling of '&' and '=' chars. */
diff --git a/src/bookmarks/backend/xbel.c b/src/bookmarks/backend/xbel.c
index 432d3ba..1aa3ba6 100644
--- a/src/bookmarks/backend/xbel.c
+++ b/src/bookmarks/backend/xbel.c
@@ -82,6 +82,23 @@ static struct tree_node *current_node = NULL;
* different format. */
static int readok = 1;
+/* Expat callback for encodings that expat does not handle itself: fills
+ * @info->map with the byte-to-Unicode table of the matching ELinks
+ * codepage.
+ *
+ * @data is the user pointer registered together with the handler; it is
+ *       not used.
+ * @name is the name of the encoding that expat asks about.
+ * @info receives one map entry for each of the 256 possible bytes.
+ *
+ * Returns XML_STATUS_OK if the map was filled in, or XML_STATUS_ERROR if
+ * get_cp_index() does not know @name. */
+static int XMLCALL
+unknown_encoding(void *data, const char *name, XML_Encoding *info)
+{
+	int index = get_cp_index(name);
+	int i;
+
+	/* NOTE(review): this assumes that any non-negative result can be
+	 * used directly as a subscript of codepages[] -- confirm that
+	 * get_cp_index() cannot return a flagged or special index here. */
+	if (index < 0) return XML_STATUS_ERROR;
+
+	/* Bytes 0x00...0x7F are assumed to be ASCII in all codepages. */
+	for (i = 0; i < 128; i++) {
+		info->map[i] = i;
+	}
+
+	for (; i < 256; i++) {
+		const uint16_t u = codepages[index].highhalf[i - 128];
+
+		/* @highhalf uses 0xFFFF to mark a byte that the codepage
+		 * does not define.  That value is a sentinel, not a
+		 * character, so tell expat that the byte is invalid (-1)
+		 * instead of handing it U+FFFF. */
+		info->map[i] = (u == 0xFFFF) ? -1 : u;
+	}
+
+	return XML_STATUS_OK;
+}
+
+
static void
read_bookmarks_xbel(FILE *f)
{
@@ -100,6 +117,7 @@ read_bookmarks_xbel(FILE *f)
XML_SetElementHandler(p, on_element_open, on_element_close);
XML_SetCharacterDataHandler(p, on_text);
+ XML_SetUnknownEncodingHandler(p, unknown_encoding, NULL);
while (!done && !err) {
size_t len = fread(in_buffer, 1, BUFSIZ, f);
@@ -168,6 +186,7 @@ indentation(struct secure_save_info *ssi, int num)
secure_fputs(ssi, " ");
}
+#if 0
/* FIXME This is totally broken, we should use the Unicode value in
* numeric entities.
* Additionally it is slow, not elegant, incomplete and
@@ -205,6 +224,7 @@ print_xml_entities(struct secure_save_info *ssi, const
unsigned char *str)
#undef accept_char
}
+#endif
static void
write_bookmarks_list(struct secure_save_info *ssi,
@@ -225,7 +245,7 @@ write_bookmarks_list(struct secure_save_info *ssi,
indentation(ssi, n + 2);
secure_fputs(ssi, "<title>");
- print_xml_entities(ssi, bm->title);
+ secure_fputs(ssi, bm->title);
secure_fputs(ssi, "</title>\n");
if (!list_empty(bm->child))
@@ -237,12 +257,12 @@ write_bookmarks_list(struct secure_save_info *ssi,
} else if (bm->box_item->type == BI_LEAF) {
secure_fputs(ssi, "<bookmark href=\"");
- print_xml_entities(ssi, bm->url);
+ secure_fputs(ssi, bm->url);
secure_fputs(ssi, "\">\n");
indentation(ssi, n + 2);
secure_fputs(ssi, "<title>");
- print_xml_entities(ssi, bm->title);
+ secure_fputs(ssi, bm->title);
secure_fputs(ssi, "</title>\n");
indentation(ssi, n + 1);
_______________________________________________
elinks-dev mailing list
[email protected]
http://linuxfromscratch.org/mailman/listinfo/elinks-dev