On Thu, Dec 08, 2005 at 05:54:35PM -0500, Gregory Maxwell wrote:
> No, what is needed for people who care about fixing their data is a
> loadable strip_invalid_utf8() that works in older versions.. then just
> select * from bar where foo != strip_invalid_utf8(foo);  The function
> would be useful in general, for example, if you have an application
> which doesn't already have much utf8 logic, you want to use a text
> field, and stripping is the behaviour you want. For example, lots of
> simple web applications.

Would something like the following work? It's written in pl/pgsql and
does (AFAICS) the same checking as the backend in recent releases.
Except the backend only supports up to 4-byte UTF-8 whereas this
function checks upto six byte. For a six byte UTF-8 character, who is
wrong?

In any case, people should be able to do something like:

SELECT field FROM table WHERE NOT utf8_verify(field,4);

To check conformance with PostgreSQL 8.1. Note, I don't have large
chunks of UTF-8 to test with but it works for the characters I tried
with. Tested with 7.4.

Have a nice day,
-- 
Martijn van Oosterhout   <kleptog@svana.org>   http://svana.org/kleptog/
> Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a
> tool for doing 5% of the work and then sitting around waiting for someone
> else to do the other 95% so you can sue them.
create or replace function utf8_verify(bytea,integer) returns bool as '
DECLARE
   str ALIAS FOR $1;
   maxlen ALIAS FOR $2;
   strlen INTEGER;
   i integer;
   j INTEGER;
   len integer;
   chr integer;
   wchr integer;
BEGIN
   i := 0;
   strlen := length(str);

   WHILE i < strlen LOOP

     -- Check leading byte
     chr := get_byte(str,i);

     IF chr < 128 THEN     -- 0x00 - 0x80   - single byte
        len := 1;
        wchr := chr;
     ELSIF chr < 192 THEN  -- 0x80 - 0xC0   - illegal
        RETURN false;
     ELSIF chr < 224 THEN  -- 0xC0 - 0xE0   - two bytes
        len := 2;
        wchr := chr - 192;
     ELSIF chr < 240 THEN  -- 0xE0 - 0xF0   - three bytes
        len := 3;
        wchr := chr - 224;
     ELSIF chr < 248 THEN  -- 0xF0 - 0xF8   - four bytes
        len := 4;
        wchr := chr - 240;
     ELSIF chr < 252 THEN  -- 0xF8 - 0xFC   - five bytes
        len := 5;
        wchr := chr - 248;
     ELSIF chr < 254 THEN  -- 0xFC - 0xFE   - six bytes
        len := 6;
        wchr := chr - 252;
     ELSE
        RETURN false;   -- FE and FF not currently defined
     END IF;

     IF i + len > strlen THEN
        RETURN false;
     END IF;

     IF len > maxlen THEN
        RETURN false;
     END IF;

     -- Check remaining characters
     j := 1;
     WHILE len > j LOOP
        chr := get_byte(str, i+j);
        IF chr < 128 OR chr >= 192 THEN
            RETURN false;
        END IF;
        wchr := (wchr << 6) + (chr - 192);
        j := j+1;
     END LOOP;

     -- Verify shortest possible string
     IF len = 1 AND wchr >= 128 THEN
        RETURN false;
     ELSIF len = 2 AND (wchr < 128 OR wchr >= 2048) THEN
        RETURN false;
     ELSIF len = 3 AND (wchr < 2048 OR wchr >= 65536) THEN
        RETURN false;
     ELSIF len = 4 AND (wchr < 65536 OR wchr >= 2097152) THEN
        RETURN false;
     ELSIF len = 5 AND (wchr < 2097152 OR wchr >= 67108864) THEN
        RETURN false;
     ELSIF len = 6 AND (wchr < 67108864 OR wchr >= 2147483648) THEN
        RETURN false;
     END IF;

--     RAISE NOTICE ''Checked char offset %, OK (wchr=%,len=%)'', i, wchr, len;

     i := i+len;
   END LOOP;

  RETURN true;
END;
' language plpgsql;

Attachment: pgpBVYFeuKe6z.pgp
Description: PGP signature

Reply via email to