Hello to the list.
I wrote my own hack to support UTF-8 encoding in GNU Prolog. It peeks
ahead on the stream and makes each byte of a multi-byte UTF-8
sequence appear to be of the same type as the entire 'wide' character.
It uses the classification functions declared in <wctype.h>.
This logic has a problem with pushback *BUT* it is all internal to the
scanner (the scanner just treats non-ASCII characters specially).
Invalid UTF-8 is read in the old-fashioned way (each byte is classified on its own).
Long '\XXXXX\' escape sequences (character codes that do not fit in one byte) are read/written.
I also did everything I could for atom_chars, atom_codes, etc., but I
don't know gprolog's internals very well. They all seem to work OK,
though.
There are #ifdef USE_UTF8_HACK blocks and a new --enable-utf8-hack option in configure.in (untested).
src/BipsPl/c_supp.[ch]
*Char* and *Code* functions updated
src/BipsPl/scan_supp.c
UTF8_Hack_Peek_Next_Char
src/BipsPl/stream_supp.[ch]
I had to add some fields to StmInf for the scanner to use
src/BipsPl/write_supp.c
iswprint test
src/EnginePl/atom.[ch]
UTF8_Hack_Classify_Char, Is_Valid_Code
I hope you will find all this useful.
Stamatis Mitrofanis
diff -ru gprolog-1.2.16/src/BipsPl/c_supp.c gprolog-1.2.16.patch/src/BipsPl/c_supp.c
--- gprolog-1.2.16/src/BipsPl/c_supp.c 2002-04-05 09:47:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/c_supp.c 2005-06-02 00:56:16.389371872 +0300
@@ -439,21 +439,36 @@
* RD_CHAR_CHECK *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_Char_Check(WamWord start_word)
{
WamWord word, tag_mask;
int atom;
+#ifdef USE_UTF8_HACK
+ wchar_t wc;
+#endif
DEREF(start_word, word, tag_mask);
if (tag_mask == TAG_REF_MASK)
Pl_Err_Instantiation();
atom = UnTag_ATM(word);
- if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length != 1)
+ if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length !=
+#ifdef USE_UTF8_HACK
+ mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length)
+#else
+ 1
+#endif
+ )
Pl_Err_Type(type_character, word);
+#ifdef USE_UTF8_HACK
+ wc = atom_tbl[atom].name[0];
+ mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+ return wc;
+#else
return atom_tbl[atom].name[0];
+#endif
}
@@ -463,16 +478,25 @@
* RD_CHAR *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_Char(WamWord start_word)
{
WamWord word, tag_mask;
int atom;
+#ifdef USE_UTF8_HACK
+ wchar_t wc;
+#endif
DEREF(start_word, word, tag_mask);
atom = UnTag_ATM(word);
+#ifdef USE_UTF8_HACK
+ wc = atom_tbl[atom].name[0];
+ mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+ return wc;
+#else
return atom_tbl[atom].name[0];
+#endif
}
@@ -482,11 +506,14 @@
* RD_IN_CHAR_CHECK *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_In_Char_Check(WamWord start_word)
{
WamWord word, tag_mask;
int atom;
+#ifdef USE_UTF8_HACK
+ wchar_t wc;
+#endif
DEREF(start_word, word, tag_mask);
if (tag_mask == TAG_REF_MASK)
@@ -494,10 +521,25 @@
atom = UnTag_ATM(word);
if (tag_mask != TAG_ATM_MASK ||
- (atom != atom_end_of_file && atom_tbl[atom].prop.length != 1))
+ (atom != atom_end_of_file && atom_tbl[atom].prop.length !=
+#ifdef USE_UTF8_HACK
+ mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length)
+#else
+ 1
+#endif
+ ))
Pl_Err_Type(type_in_character, word);
- return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1;
+ if (atom == atom_end_of_file)
+ return -1;
+
+#ifdef USE_UTF8_HACK
+ wc = atom_tbl[atom].name[0];
+ mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+ return wc;
+#else
+ return atom_tbl[atom].name[0];
+#endif
}
@@ -507,15 +549,28 @@
* RD_IN_CHAR *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_In_Char(WamWord start_word)
{
WamWord word, tag_mask;
int atom;
+#ifdef USE_UTF8_HACK
+ wchar_t wc;
+#endif
DEREF(start_word, word, tag_mask);
atom = UnTag_ATM(word);
- return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1;
+
+ if (atom == atom_end_of_file)
+ return -1;
+
+#ifdef USE_UTF8_HACK
+ wc = atom_tbl[atom].name[0];
+ mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+ return wc;
+#else
+ return atom_tbl[atom].name[0];
+#endif
}
@@ -525,10 +580,10 @@
* RD_CODE_CHECK *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_Code_Check(WamWord start_word)
{
- int c;
+ long c;
c = Rd_Integer_Check(start_word);
if (!Is_Valid_Code(c))
@@ -544,7 +599,7 @@
* RD_CODE *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_Code(WamWord start_word)
{
return Rd_Integer(start_word);
@@ -557,10 +612,10 @@
* RD_IN_CODE_CHECK *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_In_Code_Check(WamWord start_word)
{
- int c;
+ long c;
c = Rd_Integer_Check(start_word);
if (c != -1 && !Is_Valid_Code(c))
@@ -576,7 +631,7 @@
* RD_IN_CODE *
* *
*-------------------------------------------------------------------------*/
-int
+long
Rd_In_Code(WamWord start_word)
{
return Rd_Integer(start_word);
@@ -761,6 +816,9 @@
WamWord save_start_word;
WamWord *lst_adr;
int n = 0;
+#ifdef USE_UTF8_HACK
+ int clen;
+#endif
save_start_word = start_word;
@@ -779,8 +837,18 @@
lst_adr = UnTag_LST(word);
+#ifdef USE_UTF8_HACK
+ wctomb(NULL,0);
+ clen = wctomb(str, Rd_Char_Check(Car(lst_adr)));
+ if (clen > 0)
+ {
+ str += clen;
+ n += clen;
+ }
+#else
*str++ = Rd_Char_Check(Car(lst_adr));
n++;
+#endif
start_word = Cdr(lst_adr);
}
@@ -803,6 +871,9 @@
WamWord save_start_word;
WamWord *lst_adr;
int n = 0;
+#ifdef USE_UTF8_HACK
+ int clen;
+#endif
save_start_word = start_word;
@@ -815,8 +886,18 @@
lst_adr = UnTag_LST(word);
+#ifdef USE_UTF8_HACK
+ wctomb(NULL,0);
+ clen = wctomb(str, Rd_Char_Check(Car(lst_adr)));
+ if (clen > 0)
+ {
+ str += clen;
+ n += clen;
+ }
+#else
*str++ = Rd_Char_Check(Car(lst_adr));
n++;
+#endif
start_word = Cdr(lst_adr);
}
@@ -839,6 +920,9 @@
WamWord save_start_word;
WamWord *lst_adr;
int n = 0;
+#ifdef USE_UTF8_HACK
+ int clen;
+#endif
save_start_word = start_word;
@@ -857,8 +941,18 @@
lst_adr = UnTag_LST(word);
+#ifdef USE_UTF8_HACK
+ wctomb(NULL,0);
+ clen = wctomb(str, Rd_Code_Check(Car(lst_adr)));
+ if (clen > 0)
+ {
+ str += clen;
+ n += clen;
+ }
+#else
*str++ = Rd_Code_Check(Car(lst_adr));
n++;
+#endif
start_word = Cdr(lst_adr);
}
@@ -881,6 +975,9 @@
WamWord save_start_word;
WamWord *lst_adr;
int n = 0;
+#ifdef USE_UTF8_HACK
+ int clen;
+#endif
save_start_word = start_word;
@@ -893,8 +990,18 @@
lst_adr = UnTag_LST(word);
+#ifdef USE_UTF8_HACK
+ wctomb(NULL,0);
+ clen = wctomb(str, Rd_Code_Check(Car(lst_adr)));
+ if (clen > 0)
+ {
+ str += clen;
+ n += clen;
+ }
+#else
*str++ = Rd_Code_Check(Car(lst_adr));
n++;
+#endif
start_word = Cdr(lst_adr);
}
@@ -1836,9 +1943,27 @@
Bool
Un_Chars(char *str, WamWord start_word)
{
+ int atom;
+#ifdef USE_UTF8_HACK
+ int clen = 1;
+ int len = strlen(str);
+ for (; *str; str+=clen, len-=clen)
+ {
+ wchar_t wc;
+ if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,str,len)) > 1 && wc >= 256)
+ atom = Create_Char_Atom(wc);
+ else
+ {
+ clen = 1;
+ atom = ATOM_CHAR(*str);
+ }
+#else
for (; *str; str++)
{
- if (!Get_List(start_word) || !Unify_Atom(ATOM_CHAR(*str)))
+ atom = ATOM_CHAR(*str);
+#endif
+
+ if (!Get_List(start_word) || !Unify_Atom(atom))
return FALSE;
start_word = Unify_Variable();
@@ -1872,9 +1997,24 @@
Bool
Un_Codes(char *str, WamWord start_word)
{
+#ifdef USE_UTF8_HACK
+ int clen = 1;
+ int len = strlen(str);
+ for (; *str; str+=clen, len-=clen)
+ {
+ wchar_t c;
+ if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&c,str,len)) < 1 )
+ {
+ clen = 1;
+ c = *str;
+ }
+#else
for (; *str; str++)
{
- if (!Get_List(start_word) || !Unify_Integer(*str))
+ unsigned char c = *str;
+#endif
+
+ if (!Get_List(start_word) || !Unify_Integer(c))
return FALSE;
start_word = Unify_Variable();
diff -ru gprolog-1.2.16/src/BipsPl/c_supp.h gprolog-1.2.16.patch/src/BipsPl/c_supp.h
--- gprolog-1.2.16/src/BipsPl/c_supp.h 2002-03-19 20:24:34.000000000 +0200
+++ gprolog-1.2.16.patch/src/BipsPl/c_supp.h 2005-06-02 00:56:16.389371872 +0300
@@ -64,21 +64,21 @@
int Rd_Boolean(WamWord start_word);
-int Rd_Char_Check(WamWord start_word);
+long Rd_Char_Check(WamWord start_word);
-int Rd_Char(WamWord start_word);
+long Rd_Char(WamWord start_word);
-int Rd_In_Char_Check(WamWord start_word);
+long Rd_In_Char_Check(WamWord start_word);
-int Rd_In_Char(WamWord start_word);
+long Rd_In_Char(WamWord start_word);
-int Rd_Code_Check(WamWord start_word);
+long Rd_Code_Check(WamWord start_word);
-int Rd_Code(WamWord start_word);
+long Rd_Code(WamWord start_word);
-int Rd_In_Code_Check(WamWord start_word);
+long Rd_In_Code_Check(WamWord start_word);
-int Rd_In_Code(WamWord start_word);
+long Rd_In_Code(WamWord start_word);
int Rd_Byte_Check(WamWord start_word);
diff -ru gprolog-1.2.16/src/BipsPl/scan_supp.c gprolog-1.2.16.patch/src/BipsPl/scan_supp.c
--- gprolog-1.2.16/src/BipsPl/scan_supp.c 2002-04-05 09:47:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/scan_supp.c 2005-06-02 00:56:16.390371720 +0300
@@ -42,6 +42,9 @@
/*---------------------------------*
* Constants *
*---------------------------------*/
+#ifndef USE_UTF8_HACK
+#define wint_t int
+#endif
/*---------------------------------*
* Type Definitions *
@@ -69,8 +72,8 @@
static void Scan_Quoted(StmInf *pstm);
-static int Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0,
- Bool no_escape);
+static wint_t Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0,
+ Bool no_escape);
@@ -103,6 +106,10 @@
* READ_NEXT_CHAR *
* *
*-------------------------------------------------------------------------*/
+#ifdef USE_UTF8_HACK
+static int
+UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert);
+#endif
static int
Read_Next_Char(StmInf *pstm, Bool convert)
{
@@ -115,11 +122,62 @@
if (convert)
c = Char_Conversion(c);
- c_type = char_type[c];
+#ifdef USE_UTF8_HACK
+ if (c >= 192)
+ c_type = UTF8_Hack_Peek_Next_Char(pstm,convert);
+ else if ( pstm->char_count <= pstm->utf8_end
+ && pstm->utf8_begin < pstm->char_count )
+ c_type = pstm->utf8_ctype;
+ else
+#endif
+ c_type = char_type[c];
}
return c;
}
+#ifdef USE_UTF8_HACK
+static int
+UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert)
+{
+ int i, len;
+ unsigned char cs[6];
+ wchar_t wc;
+
+ if (!(c&0x40)) return pstm->utf8_ctype;
+
+ pstm->utf8_end = pstm->utf8_begin = pstm->char_count-1;
+
+ cs[0] = c;
+ for (i=1, c<<=1 ; i<6 && c&0x80 ; c<<=1)
+ {
+ int a = Stream_Getc(pstm);
+ if (a == EOF)
+ goto on_eof;
+ cs[i++] = a;
+ }
+ len = i;
+ while (--i)
+ Stream_Ungetc(cs[i],pstm);
+ c = cs[0];
+
+ mbtowc(NULL,NULL,0);
+ if (mbtowc(&wc, cs, len) < 2) goto bad_char;
+
+ /*if (convert)
+ wc = Char_Conversion(wc);*/
+
+ pstm->utf8_ctype = UTF8_Hack_Classify_Char(wc);
+ pstm->utf8_end = pstm->utf8_begin+len;
+ return pstm->utf8_ctype;
+
+on_eof:
+ while (--i)
+ Stream_Ungetc(cs[i],pstm);
+ c = cs[0];
+bad_char:
+ return char_type[c];
+}
+#endif
@@ -321,6 +379,7 @@
if (!integer_only && /* float if . and digit */
c == '.' && isdigit(Scan_Peek_Char(pstm, TRUE)))
goto is_a_float;
+
/* integer number */
token.type = TOKEN_INTEGER;
*p++ = '\0';
@@ -330,15 +389,16 @@
if (c == '\'') /* 0'<character> */
{
- c = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE);
- if (c == -1) /* <character> is ' */
+ wint_t wc = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE);
+
+ if (wc == -1) /* <character> is ' */
{
token.line = pstm->line_count + 1;
token.col = pstm->line_pos + 1;
err_msg = "quote character expected here";
}
- if (c == -2 || c == -3)
+ if (wc == -2 || wc == -3) /* EOF or NL or \ NL */
{
Unget_Last_Char;
@@ -347,7 +407,7 @@
err_msg = "character expected here";
}
- token.int_num = c;
+ token.int_num = wc;
return;
}
@@ -449,20 +509,25 @@
for (;;)
{
- c = Scan_Quoted_Char(pstm, convert, c0, no_escape);
- if (c == -1)
+ wint_t wc = Scan_Quoted_Char(pstm, convert, c0, no_escape);
+ if (wc == -1)
{
*s = '\0';
return;
}
- if (c == -2) /* EOF or \n */
+ if (wc == -2) /* EOF or \n */
break;
- if (c == -3) /* \ followed by \n */
+ if (wc == -3) /* \ followed by \n */
continue;
- *s++ = c;
+#ifdef USE_UTF8_HACK
+ wctomb(NULL,0);
+ s += wctomb(s,wc);
+#else
+ *s++ = wc;
+#endif
}
/* error */
*s = '\0';
@@ -501,12 +566,13 @@
* SCAN_QUOTED_CHAR *
* *
*-------------------------------------------------------------------------*/
-static int
+static wint_t
Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, Bool no_escape)
{
- int radix;
+ int shift;
char *p, *f;
- int x, i;
+ int i;
+ wint_t x;
Read_Next_Char(pstm, convert);
if (c == c0)
@@ -538,13 +604,13 @@
{
if (c == 'x')
{
- radix = 16;
+ shift = 4;
f = "0123456789abcdefABCDEF";
x = 0;
}
else
{
- radix = 8;
+ shift = 3;
f = "01234567";
x = c - '0';
}
@@ -555,7 +621,8 @@
i = p - f;
if (i >= 16)
i -= 6;
- x = x * radix + i;
+ x = (x<<shift) + i;
+
Read_Next_Char(pstm, convert);
}
@@ -565,6 +632,7 @@
token.col = pstm->line_pos;
err_msg = "invalid character code in \\constant\\ sequence";
}
+
if (c != '\\')
{
if (err_msg == NULL)
@@ -577,7 +645,7 @@
Unget_Last_Char;
}
- return (int) (unsigned char) x;
+ return (wint_t) (unsigned long) x;
}
if (err_msg == NULL)
diff -ru gprolog-1.2.16/src/BipsPl/stream_supp.c gprolog-1.2.16.patch/src/BipsPl/stream_supp.c
--- gprolog-1.2.16/src/BipsPl/stream_supp.c 2002-09-19 14:00:36.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.c 2005-06-02 00:56:16.390371720 +0300
@@ -401,6 +401,12 @@
pstm->line_count = 0;
pstm->line_pos = 0;
PB_Init(pstm->pb_line_pos);
+
+#ifdef USE_UTF8_HACK
+ pstm->utf8_ctype = 0;
+ pstm->utf8_begin = 0;
+ pstm->utf8_end = 0;
+#endif
}
diff -ru gprolog-1.2.16/src/BipsPl/stream_supp.h gprolog-1.2.16.patch/src/BipsPl/stream_supp.h
--- gprolog-1.2.16/src/BipsPl/stream_supp.h 2002-05-07 20:45:48.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.h 2005-06-02 00:56:16.390371720 +0300
@@ -145,6 +145,11 @@
int line_count; /* line read count */
int line_pos; /* line position */
PbStk pb_line_pos; /* line position push back stack */
+#ifdef USE_UTF8_HACK /* -------- UTF-8 hack ---------- */
+ int utf8_ctype; /* type of multibyte character||0 */
+ int utf8_begin; /* beginning of sequence position */
+ int utf8_end; /* end of sequence position */
+#endif
}
StmInf;
diff -ru gprolog-1.2.16/src/BipsPl/write_supp.c gprolog-1.2.16.patch/src/BipsPl/write_supp.c
--- gprolog-1.2.16/src/BipsPl/write_supp.c 2002-04-09 11:26:42.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/write_supp.c 2005-06-02 00:56:16.390371720 +0300
@@ -503,7 +503,14 @@
if (prop.needs_scan)
{
+#ifdef USE_UTF8_HACK
+ int len = prop.length;
+ int clen;
+ wchar_t wc;
+ for (p = atom_tbl[atom].name; *p; p++, len--)
+#else
for (p = atom_tbl[atom].name; *p; p++)
+#endif
if ((q = (char *) strchr(escape_char, *p)))
{
Out_Char('\\');
@@ -514,6 +521,23 @@
Out_Char(*p);
Out_Char(*p);
}
+#ifdef USE_UTF8_HACK
+ else if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,p,len)) >= 2 )
+ {
+ if (!iswprint(wc))
+ {
+ sprintf(str, "\\x%lx\\", wc);
+ Out_String(str);
+ p += clen-1;
+ }
+ else
+ {
+ while (--clen)
+ Out_Char(*p++);
+ Out_Char(*p);
+ }
+ }
+#endif
else if (!isprint(*p))
{
sprintf(str, "\\x%x\\", (unsigned) (unsigned char) *p);
diff -ru gprolog-1.2.16/src/configure.in gprolog-1.2.16.patch/src/configure.in
--- gprolog-1.2.16/src/configure.in 2002-09-19 13:57:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/configure.in 2005-06-02 00:56:16.388372024 +0300
@@ -45,6 +45,7 @@
USE_GUI_CONSOLE=yes
USE_SOCKETS=yes
USE_FD_SOLVER=yes
+USE_UTF8_HACK=yes
DLL_W32GUICONS=w32guicons
LIB_LINEDIT=liblinedit
@@ -201,6 +202,12 @@
*) USE_FD_SOLVER=yes;;
esac])
+AC_ARG_ENABLE(utf8-hack, [ --enable-utf8-hack recognise types of UTF-8 characters],
+ [case "$enableval" in
+ yes) AC_DEFINE(USE_UTF8_HACK) USE_UTF8_HACK=yes;;
+ *) USE_UTF8_HACK=no;;
+ esac])
+
# ***********************
diff -ru gprolog-1.2.16/src/EnginePl/atom.c gprolog-1.2.16.patch/src/EnginePl/atom.c
--- gprolog-1.2.16/src/EnginePl/atom.c 2002-03-19 20:24:36.000000000 +0200
+++ gprolog-1.2.16.patch/src/EnginePl/atom.c 2005-06-02 00:56:16.390371720 +0300
@@ -29,6 +29,9 @@
#include <string.h>
#include <locale.h>
#include <ctype.h>
+#ifdef USE_UTF8_HACK
+#include <wctype.h>
+#endif
#define ATOM_FILE
@@ -258,10 +261,13 @@
AtomInf *patom;
AtomProp prop;
char *p;
- int c_type;
+ int c_type, first_c_type;
int lg;
Bool identifier;
Bool graphic;
+#ifdef USE_UTF8_HACK
+ int clen, len;
+#endif
patom = Locate_Atom(name);
@@ -279,9 +285,28 @@
identifier = graphic = (*name != '\0');
+ prop.length = lg = strlen(name);
+
+#ifdef USE_UTF8_HACK
+ len = lg;
+ for (p = name; *p; p += clen, len -= clen)
+ {
+ wchar_t wc;
+ if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc, p, len)) < 2)
+ {
+ clen = 1;
+ c_type = char_type[(unsigned char) *p];
+ }
+ else
+ c_type = UTF8_Hack_Classify_Char(wc);
+#else
for (p = name; *p; p++)
{
c_type = char_type[(unsigned char) *p];
+#endif
+
+ if (p == name)
+ first_c_type = c_type;
if ((c_type & (UL | CL | SL | DI)) == 0)
identifier = FALSE;
@@ -293,14 +318,12 @@
prop.needs_scan = TRUE;
}
- prop.length = lg = p - name;
-
#ifndef NO_USE_LINEDIT
if (lg > 1 && identifier)
LE_Compl_Add_Word(name, lg);
#endif
- if (char_type[(unsigned char) *name] != SL) /* small letter */
+ if (first_c_type != SL) /* small letter */
identifier = FALSE;
@@ -318,7 +341,7 @@
goto finish;
}
- if (lg == 1 && char_type[(unsigned char) *name] == SC)
+ if (lg == 1 && first_c_type == SC)
{
prop.type = SOLO_ATOM;
prop.needs_quote = (*name == ',');
@@ -340,9 +363,40 @@
+#ifdef USE_UTF8_HACK
+/*-------------------------------------------------------------------------*
+ * CREATE_CHAR_ATOM *
+ * *
+ *-------------------------------------------------------------------------*/
+int Create_Char_Atom(unsigned long wc)
+{
+ char c[7];
+ int len = wctomb(c, wc);
+ c[len] = 0;
+ return Create_Allocate_Atom(c);
+}
/*-------------------------------------------------------------------------*
- * CREATE_ATOM *
+ * UTF8_HACK_CLASSIFY_CHAR *
+ * *
+ *-------------------------------------------------------------------------*/
+int
+UTF8_Hack_Classify_Char(unsigned long wc)
+{
+ if (wc < 256) return char_type[wc];
+ if (iswupper(wc)) return CL;
+ if (iswlower(wc)) return SL;
+ if (iswpunct(wc)) return GR;
+ if (iswcntrl(wc) || iswspace(wc)) return LA;
+ return EX;
+}
+#endif
+
+
+
+
+/*-------------------------------------------------------------------------*
+ * CREATE_ATOM_TAGGED *
* *
* Called by compiled prolog code. *
*-------------------------------------------------------------------------*/
diff -ru gprolog-1.2.16/src/EnginePl/atom.h gprolog-1.2.16.patch/src/EnginePl/atom.h
--- gprolog-1.2.16/src/EnginePl/atom.h 2002-03-19 20:24:36.000000000 +0200
+++ gprolog-1.2.16.patch/src/EnginePl/atom.h 2005-06-02 00:56:16.390371720 +0300
@@ -68,7 +68,11 @@
+#ifndef USE_UTF8_HACK
#define Is_Valid_Code(c) ((unsigned) (c)-1 <256-1) /* 1<= c <256 */
+#else
+#define Is_Valid_Code(c) ((signed long)(c) > 0) /* UTF-8 is 31 bits */
+#endif
#define Is_Valid_Byte(c) ((unsigned) (c) <256) /* 0=< c <256 */
#define Is_Valid_Atom(a) ((a)>=0 && (a)<MAX_ATOM && \
atom_tbl[(a)].name!=NULL)
@@ -169,6 +173,11 @@
int Create_Atom(char *name) FC;
+#ifdef USE_UTF8_HACK
+int Create_Char_Atom(unsigned long wc) FC;
+int UTF8_Hack_Classify_Char(unsigned long c) FC;
+#endif
+
WamWord Create_Atom_Tagged(char *name) FC;
int Find_Atom(char *name) FC;
_______________________________________________
Users-prolog mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/users-prolog