UTF8 support

stamit Fri, 03 Jun 2005 11:29:31 -0700

Hello to the list.

I wrote my own hack to support UTF-8 encoding in GNU Prolog. It peeks
ahead on the stream and makes each byte of a multi-byte UTF-8 sequence
appear to be of the same type as the entire 'wide' character.
Wide characters are classified with the functions declared in <wctype.h>.


This logic has a problem with pushback *BUT* it is all internal to the
scanner (the scanner just treats non-ASCII characters specially).

Invalid UTF-8 is read in the old-fashioned way, i.e. byte by byte.

Long '\xXXXX\' hexadecimal escape sequences are read/written.

I also did everything I could for atom_chars, atom_codes etc. but I
don't know gprolog's internals very well. They all seem to work OK
though.

The changes are wrapped in #ifdef USE_UTF8_HACK blocks, and there is a
new --enable-utf8-hack option in configure.in (untested).


src/BipsPl/c_supp.[ch]
        *Char* and *Code* functions updated
src/BipsPl/scan_supp.c
        UTF8_Hack_Peek_Next_Char
src/BipsPl/stream_supp.[ch]
        I had to add some fields to StmInf for the scanner to use
src/BipsPl/write_supp.c
        iswprint test
src/EnginePl/atom.[ch]
        UTF8_Hack_Classify_Char, Is_Valid_Code


I hope you will find all this useful.


Stamatis Mitrofanis

diff -ru gprolog-1.2.16/src/BipsPl/c_supp.c gprolog-1.2.16.patch/src/BipsPl/c_supp.c
--- gprolog-1.2.16/src/BipsPl/c_supp.c	2002-04-05 09:47:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/c_supp.c	2005-06-02 00:56:16.389371872 +0300
@@ -439,21 +439,36 @@
  * RD_CHAR_CHECK                                                           *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_Char_Check(WamWord start_word)
 {
   WamWord word, tag_mask;
   int atom;
+#ifdef USE_UTF8_HACK
+  wchar_t wc;
+#endif
 
   DEREF(start_word, word, tag_mask);
   if (tag_mask == TAG_REF_MASK)
     Pl_Err_Instantiation();
 
   atom = UnTag_ATM(word);
-  if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length != 1)
+  if (tag_mask != TAG_ATM_MASK || atom_tbl[atom].prop.length !=
+#ifdef USE_UTF8_HACK
+								mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length)
+#else
+								1
+#endif
+								)
     Pl_Err_Type(type_character, word);
 
+#ifdef USE_UTF8_HACK
+  wc = atom_tbl[atom].name[0];
+  mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+  return wc;
+#else
   return atom_tbl[atom].name[0];
+#endif
 }
 
 
@@ -463,16 +478,25 @@
  * RD_CHAR                                                                 *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_Char(WamWord start_word)
 {
   WamWord word, tag_mask;
   int atom;
+#ifdef USE_UTF8_HACK
+  wchar_t wc;
+#endif
 
   DEREF(start_word, word, tag_mask);
   atom = UnTag_ATM(word);
 
+#ifdef USE_UTF8_HACK
+  wc = atom_tbl[atom].name[0];
+  mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+  return wc;
+#else
   return atom_tbl[atom].name[0];
+#endif
 }
 
 
@@ -482,11 +506,14 @@
  * RD_IN_CHAR_CHECK                                                        *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_In_Char_Check(WamWord start_word)
 {
   WamWord word, tag_mask;
   int atom;
+#ifdef USE_UTF8_HACK
+  wchar_t wc;
+#endif
 
   DEREF(start_word, word, tag_mask);
   if (tag_mask == TAG_REF_MASK)
@@ -494,10 +521,25 @@
 
   atom = UnTag_ATM(word);
   if (tag_mask != TAG_ATM_MASK || 
-      (atom != atom_end_of_file && atom_tbl[atom].prop.length != 1))
+      (atom != atom_end_of_file && atom_tbl[atom].prop.length !=
+#ifdef USE_UTF8_HACK
+								mblen(atom_tbl[atom].name, atom_tbl[atom].prop.length)
+#else
+								1
+#endif
+								))
     Pl_Err_Type(type_in_character, word);
 
-  return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1;
+  if (atom == atom_end_of_file)
+    return -1;
+
+#ifdef USE_UTF8_HACK
+  wc = atom_tbl[atom].name[0];
+  mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+  return wc;
+#else
+  return atom_tbl[atom].name[0];
+#endif
 }
 
 
@@ -507,15 +549,28 @@
  * RD_IN_CHAR                                                              *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_In_Char(WamWord start_word)
 {
   WamWord word, tag_mask;
   int atom;
+#ifdef USE_UTF8_HACK
+  wchar_t wc;
+#endif
 
   DEREF(start_word, word, tag_mask);
   atom = UnTag_ATM(word);
-  return (atom != atom_end_of_file) ? atom_tbl[atom].name[0] : -1;
+
+  if (atom == atom_end_of_file)
+    return -1;
+
+#ifdef USE_UTF8_HACK
+  wc = atom_tbl[atom].name[0];
+  mbtowc(&wc, atom_tbl[atom].name, atom_tbl[atom].prop.length);
+  return wc;
+#else
+  return atom_tbl[atom].name[0];
+#endif
 }
 
 
@@ -525,10 +580,10 @@
  * RD_CODE_CHECK                                                           *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_Code_Check(WamWord start_word)
 {
-  int c;
+  long c;
 
   c = Rd_Integer_Check(start_word);
   if (!Is_Valid_Code(c))
@@ -544,7 +599,7 @@
  * RD_CODE                                                                 *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_Code(WamWord start_word)
 {
   return Rd_Integer(start_word);
@@ -557,10 +612,10 @@
  * RD_IN_CODE_CHECK                                                        *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_In_Code_Check(WamWord start_word)
 {
-  int c;
+  long c;
 
   c = Rd_Integer_Check(start_word);
   if (c != -1 && !Is_Valid_Code(c))
@@ -576,7 +631,7 @@
  * RD_IN_CODE                                                              *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-int
+long
 Rd_In_Code(WamWord start_word)
 {
   return Rd_Integer(start_word);
@@ -761,6 +816,9 @@
   WamWord save_start_word;
   WamWord *lst_adr;
   int n = 0;
+#ifdef USE_UTF8_HACK
+  int clen;
+#endif
 
   save_start_word = start_word;
 
@@ -779,8 +837,18 @@
 
       lst_adr = UnTag_LST(word);
 
+#ifdef USE_UTF8_HACK
+      wctomb(NULL,0);
+      clen = wctomb(str, Rd_Char_Check(Car(lst_adr)));
+      if (clen > 0)
+	{
+	  str += clen;
+	  n += clen;
+	}
+#else
       *str++ = Rd_Char_Check(Car(lst_adr));
       n++;
+#endif
 
       start_word = Cdr(lst_adr);
     }
@@ -803,6 +871,9 @@
   WamWord save_start_word;
   WamWord *lst_adr;
   int n = 0;
+#ifdef USE_UTF8_HACK
+  int clen;
+#endif
 
   save_start_word = start_word;
 
@@ -815,8 +886,18 @@
 
       lst_adr = UnTag_LST(word);
 
+#ifdef USE_UTF8_HACK
+      wctomb(NULL,0);
+      clen = wctomb(str, Rd_Char_Check(Car(lst_adr)));
+      if (clen > 0)
+	{
+	  str += clen;
+	  n += clen;
+	}
+#else
       *str++ = Rd_Char_Check(Car(lst_adr));
       n++;
+#endif
 
       start_word = Cdr(lst_adr);
     }
@@ -839,6 +920,9 @@
   WamWord save_start_word;
   WamWord *lst_adr;
   int n = 0;
+#ifdef USE_UTF8_HACK
+  int clen;
+#endif
 
   save_start_word = start_word;
 
@@ -857,8 +941,18 @@
 
       lst_adr = UnTag_LST(word);
 
+#ifdef USE_UTF8_HACK
+      wctomb(NULL,0);
+      clen = wctomb(str, Rd_Code_Check(Car(lst_adr)));
+      if (clen > 0)
+	{
+	  str += clen;
+	  n += clen;
+	}
+#else
       *str++ = Rd_Code_Check(Car(lst_adr));
       n++;
+#endif
 
       start_word = Cdr(lst_adr);
     }
@@ -881,6 +975,9 @@
   WamWord save_start_word;
   WamWord *lst_adr;
   int n = 0;
+#ifdef USE_UTF8_HACK
+  int clen;
+#endif
 
   save_start_word = start_word;
 
@@ -893,8 +990,18 @@
 
       lst_adr = UnTag_LST(word);
 
+#ifdef USE_UTF8_HACK
+      wctomb(NULL,0);
+      clen = wctomb(str, Rd_Code_Check(Car(lst_adr)));
+      if (clen > 0)
+	{
+	  str += clen;
+	  n += clen;
+	}
+#else
       *str++ = Rd_Code_Check(Car(lst_adr));
       n++;
+#endif
 
       start_word = Cdr(lst_adr);
     }
@@ -1836,9 +1943,27 @@
 Bool
 Un_Chars(char *str, WamWord start_word)
 {
+  int atom;
+#ifdef USE_UTF8_HACK
+  int clen = 1;
+  int len = strlen(str);
+  for (; *str; str+=clen, len-=clen)
+    {
+      wchar_t wc;
+      if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,str,len)) > 1 && wc >= 256)
+	atom = Create_Char_Atom(wc);
+      else
+	{
+	  clen = 1;
+	  atom = ATOM_CHAR(*str);
+	}
+#else
   for (; *str; str++)
     {
-      if (!Get_List(start_word) || !Unify_Atom(ATOM_CHAR(*str)))
+      atom = ATOM_CHAR(*str);
+#endif
+
+      if (!Get_List(start_word) || !Unify_Atom(atom))
 	return FALSE;
 
       start_word = Unify_Variable();
@@ -1872,9 +1997,24 @@
 Bool
 Un_Codes(char *str, WamWord start_word)
 {
+#ifdef USE_UTF8_HACK
+  int clen = 1;
+  int len = strlen(str);
+  for (; *str; str+=clen, len-=clen)
+    {
+      wchar_t c;
+      if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&c,str,len)) < 1 )
+	{
+	  clen = 1;
+	  c = *str;
+	}
+#else
   for (; *str; str++)
     {
-      if (!Get_List(start_word) || !Unify_Integer(*str))
+      unsigned char c = *str;
+#endif
+
+      if (!Get_List(start_word) || !Unify_Integer(c))
 	return FALSE;
 
       start_word = Unify_Variable();
diff -ru gprolog-1.2.16/src/BipsPl/c_supp.h gprolog-1.2.16.patch/src/BipsPl/c_supp.h
--- gprolog-1.2.16/src/BipsPl/c_supp.h	2002-03-19 20:24:34.000000000 +0200
+++ gprolog-1.2.16.patch/src/BipsPl/c_supp.h	2005-06-02 00:56:16.389371872 +0300
@@ -64,21 +64,21 @@
 
 int Rd_Boolean(WamWord start_word);
 
-int Rd_Char_Check(WamWord start_word);
+long Rd_Char_Check(WamWord start_word);
 
-int Rd_Char(WamWord start_word);
+long Rd_Char(WamWord start_word);
 
-int Rd_In_Char_Check(WamWord start_word);
+long Rd_In_Char_Check(WamWord start_word);
 
-int Rd_In_Char(WamWord start_word);
+long Rd_In_Char(WamWord start_word);
 
-int Rd_Code_Check(WamWord start_word);
+long Rd_Code_Check(WamWord start_word);
 
-int Rd_Code(WamWord start_word);
+long Rd_Code(WamWord start_word);
 
-int Rd_In_Code_Check(WamWord start_word);
+long Rd_In_Code_Check(WamWord start_word);
 
-int Rd_In_Code(WamWord start_word);
+long Rd_In_Code(WamWord start_word);
 
 int Rd_Byte_Check(WamWord start_word);
 
diff -ru gprolog-1.2.16/src/BipsPl/scan_supp.c gprolog-1.2.16.patch/src/BipsPl/scan_supp.c
--- gprolog-1.2.16/src/BipsPl/scan_supp.c	2002-04-05 09:47:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/scan_supp.c	2005-06-02 00:56:16.390371720 +0300
@@ -42,6 +42,9 @@
 /*---------------------------------*
  * Constants                       *
  *---------------------------------*/
+#ifndef USE_UTF8_HACK
+#define wint_t int
+#endif
 
 /*---------------------------------*
  * Type Definitions                *
@@ -69,8 +72,8 @@
 
 static void Scan_Quoted(StmInf *pstm);
 
-static int Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, 
-			    Bool no_escape);
+static wint_t Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, 
+                               Bool no_escape);
 
 
 
@@ -103,6 +106,10 @@
  * READ_NEXT_CHAR                                                          *
  *                                                                         *
  *-------------------------------------------------------------------------*/
+#ifdef USE_UTF8_HACK
+static int
+UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert);
+#endif
 static int
 Read_Next_Char(StmInf *pstm, Bool convert)
 {
@@ -115,11 +122,62 @@
       if (convert)
 	c = Char_Conversion(c);
 
-      c_type = char_type[c];
+#ifdef USE_UTF8_HACK
+      if (c >= 192)
+	c_type = UTF8_Hack_Peek_Next_Char(pstm,convert);
+      else if ( pstm->char_count <= pstm->utf8_end
+	        && pstm->utf8_begin < pstm->char_count )
+	c_type = pstm->utf8_ctype;
+      else
+#endif
+	c_type = char_type[c];
     }
 
   return c;
 }
+#ifdef USE_UTF8_HACK
+static int
+UTF8_Hack_Peek_Next_Char(StmInf *pstm, Bool convert)
+{
+	int i, len;
+	unsigned char cs[6];
+	wchar_t wc;
+
+	if (!(c&0x40)) return pstm->utf8_ctype;
+
+	pstm->utf8_end = pstm->utf8_begin = pstm->char_count-1;
+
+	cs[0] = c;
+	for (i=1, c<<=1 ; i<6 && c&0x80 ; c<<=1)
+	{
+		int a = Stream_Getc(pstm);
+		if (a == EOF)
+			goto on_eof;
+		cs[i++] = a;
+	}
+	len = i;
+	while (--i)
+		Stream_Ungetc(cs[i],pstm);
+	c = cs[0];
+
+	mbtowc(NULL,NULL,0);
+	if (mbtowc(&wc, cs, len) < 2) goto bad_char;
+
+	/*if (convert)
+		wc = Char_Conversion(wc);*/
+
+	pstm->utf8_ctype = UTF8_Hack_Classify_Char(wc);
+	pstm->utf8_end = pstm->utf8_begin+len;
+	return pstm->utf8_ctype;
+
+on_eof:
+	while (--i)
+		Stream_Ungetc(cs[i],pstm);
+	c = cs[0];
+bad_char:
+	return char_type[c];
+}
+#endif
 
 
 
@@ -321,6 +379,7 @@
   if (!integer_only &&		/* float if . and digit */
       c == '.' && isdigit(Scan_Peek_Char(pstm, TRUE)))
     goto is_a_float;
+
   /* integer number */
   token.type = TOKEN_INTEGER;
   *p++ = '\0';
@@ -330,15 +389,16 @@
 
   if (c == '\'')		/* 0'<character> */
     {
-      c = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE);
-      if (c == -1)		/* <character> is ' */
+      wint_t wc = Scan_Quoted_Char(pstm, TRUE, '\'', FALSE);
+
+      if (wc == -1)		/* <character> is ' */
 	{
 	  token.line = pstm->line_count + 1;
 	  token.col = pstm->line_pos + 1;
 	  err_msg = "quote character expected here";
 	}
 
-      if (c == -2 || c == -3)
+      if (wc == -2 || wc == -3)	/* EOF or NL or \ NL  */
 	{
 	  Unget_Last_Char;
 
@@ -347,7 +407,7 @@
 	  err_msg = "character expected here";
 	}
 
-      token.int_num = c;
+      token.int_num = wc;
       return;
     }
 
@@ -449,20 +509,25 @@
 
   for (;;)
     {
-      c = Scan_Quoted_Char(pstm, convert, c0, no_escape);
-      if (c == -1)
+      wint_t wc = Scan_Quoted_Char(pstm, convert, c0, no_escape);
+      if (wc == -1)
 	{
 	  *s = '\0';
 	  return;
 	}
 
-      if (c == -2)		/* EOF or \n */
+      if (wc == -2)		/* EOF or \n */
 	break;
 
-      if (c == -3)		/* \ followed by \n */
+      if (wc == -3)		/* \ followed by \n */
 	continue;
 
-      *s++ = c;
+#ifdef USE_UTF8_HACK
+      wctomb(NULL,0);
+      s += wctomb(s,wc);
+#else
+      *s++ = wc;
+#endif
     }
   /* error */
   *s = '\0';
@@ -501,12 +566,13 @@
  * SCAN_QUOTED_CHAR                                                        *
  *                                                                         *
  *-------------------------------------------------------------------------*/
-static int
+static wint_t
 Scan_Quoted_Char(StmInf *pstm, Bool convert, int c0, Bool no_escape)
 {
-  int radix;
+  int shift;
   char *p, *f;
-  int x, i;
+  int i;
+  wint_t x;
 
   Read_Next_Char(pstm, convert);
   if (c == c0)
@@ -538,13 +604,13 @@
     {
       if (c == 'x')
 	{
-	  radix = 16;
+	  shift = 4;
 	  f = "0123456789abcdefABCDEF";
 	  x = 0;
 	}
       else
 	{
-	  radix = 8;
+	  shift = 3;
 	  f = "01234567";
 	  x = c - '0';
 	}
@@ -555,7 +621,8 @@
 	  i = p - f;
 	  if (i >= 16)
 	    i -= 6;
-	  x = x * radix + i;
+	  x = (x<<shift) + i;
+
 	  Read_Next_Char(pstm, convert);
 	}
 
@@ -565,6 +632,7 @@
 	  token.col = pstm->line_pos;
 	  err_msg = "invalid character code in \\constant\\ sequence";
 	}
+
       if (c != '\\')
 	{
 	  if (err_msg == NULL)
@@ -577,7 +645,7 @@
 	  Unget_Last_Char;
 	}
 
-      return (int) (unsigned char) x;
+      return (wint_t) (unsigned long) x;
     }
 
   if (err_msg == NULL)
diff -ru gprolog-1.2.16/src/BipsPl/stream_supp.c gprolog-1.2.16.patch/src/BipsPl/stream_supp.c
--- gprolog-1.2.16/src/BipsPl/stream_supp.c	2002-09-19 14:00:36.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.c	2005-06-02 00:56:16.390371720 +0300
@@ -401,6 +401,12 @@
   pstm->line_count = 0;
   pstm->line_pos = 0;
   PB_Init(pstm->pb_line_pos);
+
+#ifdef USE_UTF8_HACK
+  pstm->utf8_ctype = 0;
+  pstm->utf8_begin = 0;
+  pstm->utf8_end = 0;
+#endif
 }
 
 
diff -ru gprolog-1.2.16/src/BipsPl/stream_supp.h gprolog-1.2.16.patch/src/BipsPl/stream_supp.h
--- gprolog-1.2.16/src/BipsPl/stream_supp.h	2002-05-07 20:45:48.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/stream_supp.h	2005-06-02 00:56:16.390371720 +0300
@@ -145,6 +145,11 @@
   int line_count;		/* line read count                */
   int line_pos;			/* line position                  */
   PbStk pb_line_pos;		/* line position push back stack  */
+#ifdef USE_UTF8_HACK            /* -------- UTF-8 hack ---------- */
+  int utf8_ctype;		/* type of multibyte character||0 */
+  int utf8_begin;		/* beginning of sequence position */
+  int utf8_end;			/* end of sequence position       */
+#endif
 }
 StmInf;
 
diff -ru gprolog-1.2.16/src/BipsPl/write_supp.c gprolog-1.2.16.patch/src/BipsPl/write_supp.c
--- gprolog-1.2.16/src/BipsPl/write_supp.c	2002-04-09 11:26:42.000000000 +0300
+++ gprolog-1.2.16.patch/src/BipsPl/write_supp.c	2005-06-02 00:56:16.390371720 +0300
@@ -503,7 +503,14 @@
 
       if (prop.needs_scan)
 	{
+#ifdef USE_UTF8_HACK
+	  int len = prop.length;
+	  int clen;
+	  wchar_t wc;
+	  for (p = atom_tbl[atom].name; *p; p++, len--)
+#else
 	  for (p = atom_tbl[atom].name; *p; p++)
+#endif
 	    if ((q = (char *) strchr(escape_char, *p)))
 	      {
 		Out_Char('\\');
@@ -514,6 +521,23 @@
 		Out_Char(*p);
 		Out_Char(*p);
 	      }
+#ifdef USE_UTF8_HACK
+	    else if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc,p,len)) >= 2 )
+	      {
+		if (!iswprint(wc))
+		  {
+		    sprintf(str, "\\x%lx\\", wc);
+		    Out_String(str);
+		    p += clen-1;
+		  }
+		else
+		  {
+		    while (--clen)
+		      Out_Char(*p++);
+		    Out_Char(*p);
+		  }
+	      }
+#endif
 	    else if (!isprint(*p))
 	      {
 		sprintf(str, "\\x%x\\", (unsigned) (unsigned char) *p);
diff -ru gprolog-1.2.16/src/configure.in gprolog-1.2.16.patch/src/configure.in
--- gprolog-1.2.16/src/configure.in	2002-09-19 13:57:32.000000000 +0300
+++ gprolog-1.2.16.patch/src/configure.in	2005-06-02 00:56:16.388372024 +0300
@@ -45,6 +45,7 @@
 USE_GUI_CONSOLE=yes
 USE_SOCKETS=yes
 USE_FD_SOLVER=yes
+USE_UTF8_HACK=yes
 
 DLL_W32GUICONS=w32guicons
 LIB_LINEDIT=liblinedit
@@ -201,6 +202,12 @@
 		   *)  USE_FD_SOLVER=yes;;
 	       esac])
 
+AC_ARG_ENABLE(utf8-hack, [  --enable-utf8-hack      recognise types of UTF-8 characters],
+              [case "$enableval" in
+		   yes) AC_DEFINE(USE_UTF8_HACK) USE_UTF8_HACK=yes;;
+		   *)   USE_UTF8_HACK=no;;
+	       esac])
+
 
 
 # ***********************
diff -ru gprolog-1.2.16/src/EnginePl/atom.c gprolog-1.2.16.patch/src/EnginePl/atom.c
--- gprolog-1.2.16/src/EnginePl/atom.c	2002-03-19 20:24:36.000000000 +0200
+++ gprolog-1.2.16.patch/src/EnginePl/atom.c	2005-06-02 00:56:16.390371720 +0300
@@ -29,6 +29,9 @@
 #include <string.h>
 #include <locale.h>
 #include <ctype.h>
+#ifdef USE_UTF8_HACK
+#include <wctype.h>
+#endif
 
 #define ATOM_FILE
 
@@ -258,10 +261,13 @@
   AtomInf *patom;
   AtomProp prop;
   char *p;
-  int c_type;
+  int c_type, first_c_type;
   int lg;
   Bool identifier;
   Bool graphic;
+#ifdef USE_UTF8_HACK
+  int clen, len;
+#endif
 
 
   patom = Locate_Atom(name);
@@ -279,9 +285,28 @@
 
   identifier = graphic = (*name != '\0');
 
+  prop.length = lg = strlen(name);
+
+#ifdef USE_UTF8_HACK
+  len = lg;
+  for (p = name; *p; p += clen, len -= clen)
+    {
+      wchar_t wc;
+      if ( (mbtowc(NULL,NULL,0), clen = mbtowc(&wc, p, len)) < 2)
+	{
+	  clen = 1;
+	  c_type = char_type[(unsigned char) *p];
+	}
+      else
+        c_type = UTF8_Hack_Classify_Char(wc);
+#else
   for (p = name; *p; p++)
     {
       c_type = char_type[(unsigned char) *p];
+#endif
+
+      if (p == name)
+	 first_c_type = c_type;
 
       if ((c_type & (UL | CL | SL | DI)) == 0)
 	identifier = FALSE;
@@ -293,14 +318,12 @@
 	prop.needs_scan = TRUE;
     }
 
-  prop.length = lg = p - name;
-
 #ifndef NO_USE_LINEDIT
   if (lg > 1 && identifier)
     LE_Compl_Add_Word(name, lg);
 #endif
 
-  if (char_type[(unsigned char) *name] != SL)	/* small letter */
+  if (first_c_type != SL)	/* small letter */
     identifier = FALSE;
 
 
@@ -318,7 +341,7 @@
       goto finish;
     }
 
-  if (lg == 1 && char_type[(unsigned char) *name] == SC)
+  if (lg == 1 && first_c_type == SC)
     {
       prop.type = SOLO_ATOM;
       prop.needs_quote = (*name == ',');
@@ -340,9 +363,40 @@
 
 
 
+#ifdef USE_UTF8_HACK
+/*-------------------------------------------------------------------------*
+ * CREATE_CHAR_ATOM                                                        *
+ *                                                                         *
+ *-------------------------------------------------------------------------*/
+int Create_Char_Atom(unsigned long wc)
+{
+	char c[7];
+	int len = wctomb(c, wc);
+	c[len] = 0;
+	return Create_Allocate_Atom(c);
+}
 
 /*-------------------------------------------------------------------------*
- * CREATE_ATOM                                                             *
+ * UTF8_HACK_CLASSIFY_CHAR                                                 *
+ *                                                                         *
+ *-------------------------------------------------------------------------*/
+int
+UTF8_Hack_Classify_Char(unsigned long wc)
+{
+	if (wc < 256) return char_type[wc];
+	if (iswupper(wc)) return CL;
+	if (iswlower(wc)) return SL;
+	if (iswpunct(wc)) return GR;
+	if (iswcntrl(wc) || iswspace(wc)) return LA;
+	return EX;
+}
+#endif
+
+
+
+
+/*-------------------------------------------------------------------------*
+ * CREATE_ATOM_TAGGED                                                      *
  *                                                                         *
  * Called by compiled prolog code.                                         *
  *-------------------------------------------------------------------------*/
diff -ru gprolog-1.2.16/src/EnginePl/atom.h gprolog-1.2.16.patch/src/EnginePl/atom.h
--- gprolog-1.2.16/src/EnginePl/atom.h	2002-03-19 20:24:36.000000000 +0200
+++ gprolog-1.2.16.patch/src/EnginePl/atom.h	2005-06-02 00:56:16.390371720 +0300
@@ -68,7 +68,11 @@
 
 
 
+#ifndef USE_UTF8_HACK
 #define Is_Valid_Code(c)           ((unsigned) (c)-1 <256-1)	/* 1<= c <256 */
+#else
+#define Is_Valid_Code(c)           ((signed long)(c) > 0)	/* UTF-8 is 31 bits */
+#endif
 #define Is_Valid_Byte(c)           ((unsigned) (c) <256)	/* 0=< c <256 */
 #define Is_Valid_Atom(a)           ((a)>=0 && (a)<MAX_ATOM && \
                                     atom_tbl[(a)].name!=NULL)
@@ -169,6 +173,11 @@
 
 int Create_Atom(char *name) FC;
 
+#ifdef USE_UTF8_HACK
+int Create_Char_Atom(unsigned long wc) FC;
+int UTF8_Hack_Classify_Char(unsigned long c) FC;
+#endif
+
 WamWord Create_Atom_Tagged(char *name) FC;
 
 int Find_Atom(char *name) FC;

_______________________________________________
Users-prolog mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/users-prolog

UTF8 support

Reply via email to