On Tue, 26 Jun 2007 at 12:35, Artur Litwinowicz wrote: > O I think it is not good idea to change the code of OpenBSD by me. > Years ago I was coding in C++ (at the University but with best mark ;). Now > I am working for Oracle Corp. (PL/SQL and etc.) and I am a little out of > practice ;) with C/C++. > > Maybe someone core OpenBSD Developer will agree with me, that the extended > COLLATION in OpenBSD will be the strong point in the system functionality ? >
I had the same problem with PostgreSQL on OpenBSD a few years ago and I've written some C-language extensions (database functions). Be advised that this is a totally non-standard solution. Here you go: The idea was to make simple functions for sorting and upper/lower conversion, that would be faster than typical unicode table lookups. Since I only ever needed a single language at a time, I didn't need all the fancy unicode stuff. In "lang.h", there are lower and upper alphabet strings for each language (currently english and slovenian). You can add your own, of course. If you run "make" (you may need to edit the Makefile first), you'll get the "hash_en.so", "hash_sl.so", "upper_en.so", "upper_sl.so", "lower_en.so" and "lower_sl.so" shared libs containing postgres functions with the same names. To load them into database, use CREATE FUNCTION func_name(TEXT) RETURNS TEXT AS \ 'path/to/func_file.so', 'func_name' LANGUAGE C IMMUTABLE STRICT for each of them. The upper_XX and lower_XX functions return the upper/lowercase version of the input string. The hash_XX function replaces the input string with new string where each letter is replaced with its position in the alphabet. For example, instead of SELECT ... ORDER BY my_column you can use SELECT ... ORDER BY hash_sl(my_column) and you've got slovenian sort order. For performance, create an index on hash_sl(my_column), not my_column. This will only work on "unicode" databases. Of course, there may be bugs. They are quite likely, actually. I remember I wrote all this in a hurry. But it has worked OK for at least three projects now. All the files except for "lang.h" follow below. For "lang.h", go to "http://www.komna.com/tin/lang.h" (it's UTF-8 encoded, so I can't put it here inline). # Makefile # # Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> # # Permission to use, copy, modify, and distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # INCLUDE_DIR=/usr/local/include/postgresql/server build: hash_en.so hash_sl.so lower_en.so lower_sl.so upper_en.so upper_sl.so hash_en.so: hash.c cc -Wall -Werror -fpic -c -o hash_en.o hash.c -I$(INCLUDE_DIR) -DEN ld -Bshareable -o hash_en.so hash_en.o hash_sl.so: hash.c cc -Wall -Werror -fpic -c -o hash_sl.o hash.c -I$(INCLUDE_DIR) -DSL ld -Bshareable -o hash_sl.so hash_sl.o lower_en.so: lower.c cc -Wall -Werror -fpic -c -o lower_en.o lower.c -I$(INCLUDE_DIR) -DEN ld -Bshareable -o lower_en.so lower_en.o lower_sl.so: lower.c cc -Wall -Werror -fpic -c -o lower_sl.o lower.c -I$(INCLUDE_DIR) -DSL ld -Bshareable -o lower_sl.so lower_sl.o upper_en.so: upper.c cc -Wall -Werror -fpic -c -o upper_en.o upper.c -I$(INCLUDE_DIR) -DEN ld -Bshareable -o upper_en.so upper_en.o upper_sl.so: upper.c cc -Wall -Werror -fpic -c -o upper_sl.o upper.c -I$(INCLUDE_DIR) -DSL ld -Bshareable -o upper_sl.so upper_sl.o clean: -rm *.o *.so /* * $Id: utf8.h,v 1.1.1.1 2004/12/14 14:53:28 tin Exp $ * * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <sys/types.h> typedef u_int32_t widechar_t; #define STR_WCHAR_UNKNOWN ((widechar_t)'?') /* read a wide char from UTF-8 string *ptr and point *ptr to the next one */ widechar_t utf8get(char **ptr) { unsigned char uch; widechar_t res; uch = (unsigned char)(**ptr); (*ptr)++; if (!(uch & 0x80)) return((widechar_t)uch); if ((uch & 0xFE) == 0xC0) return(STR_WCHAR_UNKNOWN); if (((uch & 0xFF) == 0xE0) && ((**ptr & 0xE0) == 0x80)) return(STR_WCHAR_UNKNOWN); if (((uch & 0xFF) == 0xF0) && ((**ptr & 0xF0) == 0x80)) return(STR_WCHAR_UNKNOWN); if (((uch & 0xFF) == 0xF8) && ((**ptr & 0xF8) == 0x80)) return(STR_WCHAR_UNKNOWN); if (((uch & 0xFF) == 0xFC) && ((**ptr & 0xFC) == 0x80)) return(STR_WCHAR_UNKNOWN); if ((**ptr & 0xC0) != 0x80) return(STR_WCHAR_UNKNOWN); res = (widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F; if(uch & 0x20) { if((**ptr & 0xC0) != 0x80) { (*ptr)--; return(STR_WCHAR_UNKNOWN); } res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F); } else if((uch & 0xE0) != 0xC0) { (*ptr)-=2; return(STR_WCHAR_UNKNOWN); } else return(res | (((widechar_t)uch & 0x0000001F) << 6)); if(uch & 0x10) { res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F); } else if((uch & 0xF0) != 0xE0) { (*ptr)-=2; return(STR_WCHAR_UNKNOWN); } else return(res | (((widechar_t)uch & 0x0000000F) << 12)); if(uch & 0x08) { if((**ptr & 0xC0) != 0x80) { (*ptr)-=3; return(STR_WCHAR_UNKNOWN); } res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F); } else if((uch & 0xF8) != 0xF0) { (*ptr)-=3; return(STR_WCHAR_UNKNOWN); } else return(res | (((widechar_t)uch & 0x00000007) << 18)); if(uch & 0x04) { if((**ptr & 0xC0) != 0x80) { (*ptr)-=4; return(STR_WCHAR_UNKNOWN); } res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F); } else if((uch & 0xFC) != 0xF8) { (*ptr)-=4; return(STR_WCHAR_UNKNOWN); } else return(res | (((widechar_t)uch & 0x00000003) << 24)); if ((uch & 0xFE) != 0xFC) { (*ptr)-=5; return(STR_WCHAR_UNKNOWN); } return (res | (((widechar_t)uch & 0x00000001) << 30)); } /* write character ch to *ptr and point *ptr at the end */ void utf8put(char **ptr, widechar_t ch) { widechar_t bech = ch; if (bech < 0x00000080) { *((*ptr)++) = (char)bech; return; } if (bech < 0x00000800) { *((*ptr)++) = (char)(((bech >> 6) & 0x0000001F) | (0x000000C0)); *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); return; } if (bech < 0x00010000) { *((*ptr)++) = (char)(((bech >> 12) & 0x0000000F) | (0x000000E0)); *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); return; } if (bech < 0x00200000) { *((*ptr)++) = (char)(((bech >> 18) & 0x00000007) | (0x000000F0)); *((*ptr)++) = (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); return; } if (bech < 0x04000000) { *((*ptr)++) = (char)(((bech >> 24) & 0x00000003) | (0x000000F8)); *((*ptr)++) = (char)(((bech >> 18) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); return; } *((*ptr)++) = (char)(((bech >> 30) & 0x00000001) | (0x000000FC)); *((*ptr)++) = (char)(((bech >> 24) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 18) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 12) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080)); *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080)); } /* get the number of UTF-8 characters in a string */ int utf8len(char *p, char *e) { int n; n = 0; while (*p && p < e) { n++; if (!(*p & 0x80)) { p++; continue; } else if ((*p & 0xE0) == 0xC0) { p+=2; continue; } else if ((*p & 0xF0) == 0xE0) { p+=3; continue; } else if ((*p & 0xF8) == 0xF0) { p+=4; continue; } else if ((*p & 0xFC) == 0xF8) { p+=5; continue; } else if ((*p & 0xFE) == 0xFC) { p+=6; continue; } else return (n); } return (n); } /* get the number of bytes the character takes when UTF-8 encoded */ int utf8size(widechar_t ch) { if (ch < 0x00000080) return(1); if (ch < 0x00000800) return(2); if (ch < 0x00010000) return(3); if (ch < 0x00200000) return(4); if (ch < 0x04000000) return(5); return (6); } /* * $Id: hash.c,v 1.2 2007/02/15 14:16:39 tin Exp $ * * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <postgres.h> #include <fmgr.h> #include "lang.h" #include "utf8.h" #ifdef PG_MODULE_MAGIC PG_MODULE_MAGIC; #endif PG_FUNCTION_INFO_V1(HASH); Datum HASH(PG_FUNCTION_ARGS) { static widechar_t input[sizeof(ABC_U)]; static int value[sizeof(ABC_U)]; static int count = -1; text *in, *out; char *pi, *po, *limit; widechar_t wc1, wc2; unsigned int i, size, val; if (count < 0) { count = 0; val = 0; pi = ABC_L; po = ABC_U; while ((wc1 = utf8get(&pi))) { wc2 = utf8get(&po); if (wc1 == (widechar_t)',') { val++; continue; } input[count] = wc1; value[count] = val; if(wc1 != wc2) { count++; input[count] = wc2; value[count] = val; } count++; } } in = PG_GETARG_TEXT_P(0); limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; size = VARHDRSZ + (2 * utf8len((char *)VARDATA(in), limit)); out = (text *)palloc(size); VARATT_SIZEP(out) = size; pi = (char *)VARDATA(in); po = (char *)VARDATA(out); while (pi < limit) { wc1 = utf8get(&pi); for (i=0; i<count; i++) { if(input[i] == wc1) break; } if (i < count) { po[0] = 65 + (value[i] / 26); po[1] = 65 + (value[i] % 26); } else { po[0] = po[1] = 'Z'; } po += 2; } PG_RETURN_TEXT_P(out); } /* * $Id: lower.c,v 1.2 2007/02/15 14:16:39 tin Exp $ * * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <postgres.h> #include <fmgr.h> #include "lang.h" #include "utf8.h" #ifdef PG_MODULE_MAGIC PG_MODULE_MAGIC; #endif PG_FUNCTION_INFO_V1(LOWER); Datum LOWER(PG_FUNCTION_ARGS) { static widechar_t input[sizeof(ABC_U)]; static widechar_t output[sizeof(ABC_U)]; static int diff[sizeof(ABC_U)]; static int count = -1; text *in, *out; char *pi, *po, *limit; widechar_t wc1, wc2; unsigned int i, size; if (count < 0) { count = 0; pi = ABC_U; po = ABC_L; while ((wc1 = utf8get(&pi))) { wc2 = utf8get(&po); if (wc1 != wc2) { input[count] = wc1; output[count] = wc2; diff[count] = utf8size(wc2) - utf8size(wc1); count++; } } } in = PG_GETARG_TEXT_P(0); limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; size = VARSIZE(in); out = (text *)palloc(size); pi = (char *)VARDATA(in); po = (char *)VARDATA(out); while (pi < limit) { wc1 = utf8get(&pi); for (i=0; i<count; i++) { if (input[i] == wc1) break; } if (i < count) { if (diff[i]) { text *new_out; new_out = (text *)palloc(size + diff[i]); memcpy(VARDATA(new_out), VARDATA(out), size - VARHDRSZ); po = VARDATA(new_out) + (po - VARDATA(out)); pfree(out); out = new_out; } utf8put(&po, output[i]); } else utf8put(&po, wc1); } VARATT_SIZEP(out) = size; PG_RETURN_TEXT_P(out); } /* * $Id: upper.c,v 1.2 2007/02/15 14:16:39 tin Exp $ * * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <postgres.h> #include <fmgr.h> #include "lang.h" #include "utf8.h" #ifdef PG_MODULE_MAGIC PG_MODULE_MAGIC; #endif PG_FUNCTION_INFO_V1(UPPER); Datum UPPER(PG_FUNCTION_ARGS) { static widechar_t input[sizeof(ABC_U)]; static widechar_t output[sizeof(ABC_U)]; static int diff[sizeof(ABC_U)]; static int count = -1; text *in, *out; char *pi, *po, *limit; widechar_t wc1, wc2; unsigned int i, size; if (count < 0) { count = 0; pi = ABC_L; po = ABC_U; while ((wc1 = utf8get(&pi))) { wc2 = utf8get(&po); if (wc1 != wc2) { input[count] = wc1; output[count] = wc2; diff[count] = utf8size(wc2) - utf8size(wc1); count++; } } } in = PG_GETARG_TEXT_P(0); limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ; size = VARSIZE(in); out = (text *)palloc(size); pi = (char *)VARDATA(in); po = (char *)VARDATA(out); while (pi < limit) { wc1 = utf8get(&pi); for (i=0; i<count; i++) { if (input[i] == wc1) break; } if (i < count) { if (diff[i]) { text *new_out; new_out = (text *)palloc(size + diff[i]); memcpy(VARDATA(new_out), VARDATA(out), size - VARHDRSZ); po = VARDATA(new_out) + (po - VARDATA(out)); pfree(out); out = new_out; } utf8put(&po, output[i]); } else utf8put(&po, wc1); } VARATT_SIZEP(out) = size; PG_RETURN_TEXT_P(out); } -- Tin