Tatsuo Ishii <is...@postgresql.org> writes:
> I have started looking into it. I wonder how you created the part
> of your patch:

The code I used is below.

> In the above you seem to disable the conversion from 0x96 of win1250
> to ISO-8859-2 by using the Unicode mapping files in
> src/backend/utils/mb/Unicode. But the corresponding mapping file
> (iso8859_2_to_utf8.map) does include the following entry:

>   {0x0096, 0xc296},

> How do you know 0x96 should be removed from the conversion?

Right, but there is no mapping in the win1250-utf8 files that matches
0xC296 (the UTF-8 encoding of U+0096).  The complaint over in the other
thread is precisely that we have no business translating 0x96 in WIN1250
to this character.  What WIN1250 0x96 could translate to is 0xE28093
(the UTF-8 encoding of U+2013 EN DASH, at least according to
win1250_to_utf8.map), but that Unicode character has no equivalent in
LATIN2.

AFAICS, whoever made these tables just arbitrarily decided that 0x96
in WIN1250 could be mapped to 0x96 in LATIN2, and likewise for a number
of other codes; but those are false equivalences, as you find out if
you try to perform the same conversion via other encoding conversion
paths, ie convert to UTF8 and then to the other encoding.

                        regards, tom lane

#include "c.h"
#include "mb/pg_wchar.h"

#include "src/backend/utils/mb/Unicode/iso8859_2_to_utf8.map"
#include "src/backend/utils/mb/Unicode/iso8859_5_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win1250_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win1251_to_utf8.map"
#include "src/backend/utils/mb/Unicode/win866_to_utf8.map"
#include "src/backend/utils/mb/Unicode/koi8r_to_utf8.map"
#include "src/backend/utils/mb/Unicode/koi8u_to_utf8.map"


/*
 * Describes one single-byte conversion table to generate.
 *
 * domap() searches map1 by byte code to obtain a UTF8 value, then searches
 * map2 by that UTF8 value to obtain the resulting byte code; so map1 belongs
 * to the encoding being converted from and map2 to the encoding being
 * converted to.
 */
typedef struct
{
	const pg_local_to_utf *map1;	/* to-UTF8 map searched by byte code */
	int			size1;			/* number of entries in map1 */
	const pg_local_to_utf *map2;	/* to-UTF8 map searched by UTF8 value */
	int			size2;			/* number of entries in map2 */
	const char *tabname;		/* identifier of the emitted C array */
	int			upper;			/* nonzero: print hex digits in upper case */
} pg_conv_map;

/*
 * The conversion tables to generate, one entry per conversion direction.
 *
 * CONV_MAP builds one row from a (source map, destination map) pair and
 * takes each length from the same array name it stores, so a map cannot be
 * paired with some other map's length by accident.
 */
#define CONV_MAP(src, dst, name, upper) \
	{ (src), lengthof(src), (dst), lengthof(dst), (name), (upper) }

static const pg_conv_map maps[] = {
	/* WIN1250 <-> ISO 8859-2; these tables use upper-case hex digits */
	CONV_MAP(LUmapWIN1250, LUmapISO8859_2, "win1250_2_iso88592", 1),
	CONV_MAP(LUmapISO8859_2, LUmapWIN1250, "iso88592_2_win1250", 1),

	/* ISO 8859-5 <-> KOI8R */
	CONV_MAP(LUmapISO8859_5, LUmapKOI8R, "iso2koi", 0),
	CONV_MAP(LUmapKOI8R, LUmapISO8859_5, "koi2iso", 0),

	/* WIN1251 <-> KOI8R */
	CONV_MAP(LUmapWIN1251, LUmapKOI8R, "win2koi", 0),
	CONV_MAP(LUmapKOI8R, LUmapWIN1251, "koi2win", 0),

	/* WIN866 <-> KOI8R */
	CONV_MAP(LUmapWIN866, LUmapKOI8R, "win8662koi", 0),
	CONV_MAP(LUmapKOI8R, LUmapWIN866, "koi2win866", 0),
};

#undef CONV_MAP

/*
 * Print one conversion table to stdout as C source text.
 *
 * For each byte value 0x80..0xff, the first map1 entry with that code
 * supplies a UTF8 value; if that value is nonzero, the first map2 entry with
 * the same UTF8 value supplies the output byte.  When either lookup fails the
 * output byte is 0.  Values are printed eight per line, indented by two tabs,
 * using upper- or lower-case hex digits according to info->upper.
 */
static void
domap(const pg_conv_map *info)
{
	uint32		src;

	printf("	static const unsigned char %s[] = {\n", info->tabname);

	for (src = 0x80; src <= 0xff; src++)
	{
		uint32		utf = 0;
		uint32		dst = 0;
		const char *sep;
		int			k;

		/* First lookup: byte code -> UTF8 value (first match wins). */
		k = 0;
		while (k < info->size1 && info->map1[k].code != src)
			k++;
		if (k < info->size1)
			utf = info->map1[k].utf;

		/* Second lookup: UTF8 value -> byte code; skipped when utf is 0. */
		if (utf != 0)
		{
			k = 0;
			while (k < info->size2 && info->map2[k].utf != utf)
				k++;
			if (k < info->size2)
				dst = info->map2[k].code;
		}

		/* Disabled debugging aid: report each mapping instead of a table. */
#if 0
		if (dst)
			printf("0x%02x maps to 0x%02x via U+%04X\n", src, dst, utf);
		else
			printf("0x%02x has no equivalent\n", src);
#endif

		/* Indent at the start of each row of eight values. */
		if (src % 8 == 0)
			printf("\t\t");

		printf(info->upper ? "0x%02X" : "0x%02x", dst);

		/* No comma after the last value; break the line after every eighth. */
		if (src == 0xff)
			sep = "\n";
		else if (src % 8 == 7)
			sep = ",\n";
		else
			sep = ", ";
		printf("%s", sep);
	}

	printf("\t};\n\n");
}

int
main()
{
	int i;

	for (i = 0; i < lengthof(maps); i++)
		domap(maps + i);

	return 0;
}
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to