On Thu, Dec 08, 2005 at 05:54:35PM -0500, Gregory Maxwell wrote: > No, what is needed for people who care about fixing their data is a > loadable strip_invalid_utf8() that works in older versions.. then just > select * from bar where foo != strip_invalid_utf8(foo); The function > would be useful in general, for example, if you have an application > which doesn't already have much utf8 logic, you want to use a text > field, and stripping is the behaviour you want. For example, lots of > simple web applications.
Would something like the following work? It's written in pl/pgsql and does (AFAICS) the same checking as the backend in recent releases. Except the backend only supports up to 4-byte UTF-8 whereas this function checks upto six byte. For a six byte UTF-8 character, who is wrong? In any case, people should be able to do something like: SELECT field FROM table WHERE NOT utf8_verify(field,4); To check conformance with PostgreSQL 8.1. Note, I don't have large chunks of UTF-8 to test with but it works for the characters I tried with. Tested with 7.4. Have a nice day, -- Martijn van Oosterhout <kleptog@svana.org> http://svana.org/kleptog/ > Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a > tool for doing 5% of the work and then sitting around waiting for someone > else to do the other 95% so you can sue them.
create or replace function utf8_verify(bytea,integer) returns bool as ' DECLARE str ALIAS FOR $1; maxlen ALIAS FOR $2; strlen INTEGER; i integer; j INTEGER; len integer; chr integer; wchr integer; BEGIN i := 0; strlen := length(str); WHILE i < strlen LOOP -- Check leading byte chr := get_byte(str,i); IF chr < 128 THEN -- 0x00 - 0x80 - single byte len := 1; wchr := chr; ELSIF chr < 192 THEN -- 0x80 - 0xC0 - illegal RETURN false; ELSIF chr < 224 THEN -- 0xC0 - 0xE0 - two bytes len := 2; wchr := chr - 192; ELSIF chr < 240 THEN -- 0xE0 - 0xF0 - three bytes len := 3; wchr := chr - 224; ELSIF chr < 248 THEN -- 0xF0 - 0xF8 - four bytes len := 4; wchr := chr - 240; ELSIF chr < 252 THEN -- 0xF8 - 0xFC - five bytes len := 5; wchr := chr - 248; ELSIF chr < 254 THEN -- 0xFC - 0xFE - six bytes len := 6; wchr := chr - 252; ELSE RETURN false; -- FE and FF not currently defined END IF; IF i + len > strlen THEN RETURN false; END IF; IF len > maxlen THEN RETURN false; END IF; -- Check remaining characters j := 1; WHILE len > j LOOP chr := get_byte(str, i+j); IF chr < 128 OR chr >= 192 THEN RETURN false; END IF; wchr := (wchr << 6) + (chr - 192); j := j+1; END LOOP; -- Verify shortest possible string IF len = 1 AND wchr >= 128 THEN RETURN false; ELSIF len = 2 AND (wchr < 128 OR wchr >= 2048) THEN RETURN false; ELSIF len = 3 AND (wchr < 2048 OR wchr >= 65536) THEN RETURN false; ELSIF len = 4 AND (wchr < 65536 OR wchr >= 2097152) THEN RETURN false; ELSIF len = 5 AND (wchr < 2097152 OR wchr >= 67108864) THEN RETURN false; ELSIF len = 6 AND (wchr < 67108864 OR wchr >= 2147483648) THEN RETURN false; END IF; -- RAISE NOTICE ''Checked char offset %, OK (wchr=%,len=%)'', i, wchr, len; i := i+len; END LOOP; RETURN true; END; ' language plpgsql;
pgpBVYFeuKe6z.pgp
Description: PGP signature