Bug#230695: xjdic: Doesn't work with UTF-8

Frédéric Brière Sun, 10 Jan 2016 08:24:30 -0800

On Wed, Aug 08, 2007 at 01:57:34PM -0400, Frédéric Brière wrote:
> I was thinking of giving this issue a try when a little Googling turned
> up a very thorough patch from Phillip Vandry that makes xjdic
> locale-aware: <http://tzone.org/~vandry/xjdic/xjdic-24.locale.patch>.


There's actually a bug in the original patch which can trigger a
segfault on amd64 (due to the size difference between int and size_t).

Here's a corrected version, refreshed to apply to 24-10; I'm also
including a stand-alone patch for the bugfix itself.

diff --git a/xjdfrontend.c b/xjdfrontend.c
index 58cc95f..c4135a3 100644
--- a/xjdfrontend.c
+++ b/xjdfrontend.c
@@ -30,6 +30,15 @@
 #include <signal.h>
 #include <errno.h>
 #include <unistd.h>
+
+#define HAVE_LOCALE
+#ifdef HAVE_LOCALE
+#include <locale.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <wchar.h>
+#endif
+
 #include "xjdic.h"
 
 /*    Paul Burchard supplied a patch to provide BSD compatibility for xjdic
@@ -100,7 +109,13 @@ int ShiftJIS = FALSE,NoSJIS=FALSE;
 unsigned char instr[256],radkanj[250][2];
 int radnos[250];
 unsigned char kanatab[NRKANA*2][7];
-int Omode = 0,Smode = 0,Dmode = 0,AKanaMode;
+#ifdef HAVE_LOCALE
+int Omode = 3;			/* default output mode: the locale's character set (KOut case 3) */
+int new_input_mode = 1;		/* nonzero: read input with locale_GetKBStr; cleared by -O */
+#else
+int Omode = 0;			/* output mode used when locale support is compiled out */
+#endif
+int Smode = 0,Dmode = 0,AKanaMode;
 int DRow,DCol,MaxY=MAXLINES,MaxX=MAXCOLS-1,KFlushRes,nok;
 unsigned long hittab[NOHITS];
 int verblen,DispHit,ksp,hitind,FirstKanj = 0,prieng = FALSE,Extopen=FALSE,NoSkip;
@@ -178,6 +193,13 @@ int RVACTIVE = TRUE;
 int DicNum;
 long DicLoc;
 
+#define GETKBSTR_SPACE_AFTER_PROMPT 1	/* locale_GetKBStr flags: space after the redrawn prompt */
+#define GETKBSTR_ALLOW_HELP 2		/* '?' shows the help text */
+#define GETKBSTR_ALLOW_ROMAJI 4		/* '@' and '#' hand the input over to DoRomaji */
+#define GETKBSTR_CTRLD 8		/* ^D makes locale_GetKBStr return 4 */
+#define GETKBSTR_CTRLZ 16		/* ^Z makes locale_GetKBStr return 26 */
+#define GETKBSTR_ROMAJI_DEFAULT 32	/* letters start romaji input until 'l' or 'L' is typed */
+
 /*====== Prototypes========================================================*/
 
 FILE  *xfopen(char *file_name, char *file_mode, int *xfilelen);
@@ -627,6 +649,104 @@ void jis2sjis(unsigned char *p1,unsigned char *p2) /* courtesy of Ken Lunde */
     *p2 = c2 + cellOffset;
 }
 
+#ifdef HAVE_LOCALE
+/*
+ * Locale support:
+ *
+ * Because xjdic uses EUC internally extensively, we only convert
+ * from EUC to the locale character set on output and convert to
+ * EUC on input. iconv is used for this. The input code (see
+ * locale_GetKBStr) keeps the entire input string in a wide
+ * character array and converts it to EUC-JP before returning it.
+ * -pkv Tue Sep 23 15:45:59 EDT 2003
+ */
+/* Returns the codeset name of the LC_CTYPE locale, which is taken from the
+   environment on the first call. The name is copied into a static buffer,
+   as POSIX lets later nl_langinfo()/setlocale() calls overwrite the original. */
+static char *get_locale_charset(void)
+{
+static int locale_initted = 0;
+static char locale_charset[64];
+	if (!locale_initted) {
+		setlocale(LC_CTYPE, "");
+		snprintf(locale_charset, sizeof(locale_charset), "%s", nl_langinfo(CODESET));
+		locale_initted = 1;
+	}
+	return locale_charset;
+}
+
+/*======locale_output  (convert EUC to current locale's charset) =======*/
+/* Writes one EUC-JP character, passed as its first "length" bytes out of
+   c1, c2 and c3, to stdout in the locale's character set. When the locale
+   already uses EUC-JP, or the conversion cannot be set up, the bytes are
+   written unchanged, and every later call does the same. */
+void locale_output(unsigned int length, unsigned char c1, unsigned char c2, unsigned char c3)
+{
+static int passthrough = 0;	/* set once raw EUC-JP output has been chosen */
+static int descr_open = 0;
+static iconv_t descr;
+char *charset;
+char euc[4];
+char converted[64]; /* this better be big enough */
+char *src, *dst;
+size_t srcleft, dstleft;
+
+	euc[0] = c1;
+	euc[1] = c2;
+	euc[2] = c3;
+
+	/* Work out how to write, unless raw output has been chosen already.
+	   When no conversion is required we pretend that conversion failed:
+	   the data is then passed straight through, which may be more efficient
+	   than calling iconv with an identity descriptor. */
+	if (!passthrough) {
+		charset = get_locale_charset();
+		if (!charset) {
+			fprintf(stderr, "locale does not specify a target charset, using EUC-JP!\n");
+			passthrough = 1;
+		} else if (strcmp(charset, "EUC-JP") == 0) {
+			passthrough = 1;
+		} else if (!descr_open) {
+			descr = iconv_open(charset, "EUC-JP");
+			if (descr == (iconv_t)-1) {
+				fprintf(stderr, "conversion from EUC-JP to %s not supported. using EUC-JP.\n", charset);
+				passthrough = 1;
+			} else {
+				descr_open = 1;
+			}
+		}
+	}
+
+	if (passthrough) {
+		fwrite(euc, 1, length, stdout);
+		return;
+	}
+
+	/* The caller is supposed to provide a valid, complete multibyte sequence
+	   as input, so errors about invalid input are ignored. An output buffer
+	   that is too small is not checked for either: that case simply fails. */
+	src = euc;
+	srcleft = length;
+	dst = converted;
+	dstleft = sizeof(converted);
+
+	iconv(descr, &src, &srcleft, &dst, &dstleft);
+	fwrite(converted, 1, dst - converted, stdout);
+
+	/* This function might not be called again to output the next character,
+	   so put the output back into the initial state. This is wasteful for
+	   character sets that use shift sequences to enter and exit Kanji mode
+	   (ISO-2022), but xjdic already has that problem, and it has no effect
+	   if the character encoding is something like UTF-8. */
+	srcleft = 0;
+	dst = converted;
+	dstleft = sizeof(converted);
+	iconv(descr, NULL, &srcleft, &dst, &dstleft);
+	if (dst != converted)
+		fwrite(converted, 1, dst - converted, stdout);
+}
+#endif /* HAVE_LOCALE */
+
 /*====KEOS===End of screen processing for KFlush==================*/
 int KEOS (unsigned char *msg)
 {
@@ -654,6 +774,7 @@ int KEOS (unsigned char *msg)
 int KFlush(unsigned char *msg)
 {
 	unsigned char *kptr,ktemp[512];
+	unsigned char *p;
 	int retf,it,j;
 	int Test;
 
@@ -677,6 +798,20 @@ int KFlush(unsigned char *msg)
 			strcpy(ktemp,ktemp+1);
 		}
 		it = strlen(ktemp);
+
+		/* Look for instances of RVon and RVoff inside the string. */
+		/* These do not consume any columns -pkv */
+		p = ktemp;
+		while (p && (*p) && (p = strstr(p, RVon))) {
+			p += strlen(RVon);
+			it -= strlen(RVon);
+		}
+		p = ktemp;
+		while (p && (*p) && (p = strstr(p, RVoff))) {
+			p += strlen(RVoff);
+			it -= strlen(RVoff);
+		}
+
 		if (DCol+it < Test)
 		{
 			DCol = DCol+it+1;
@@ -690,7 +825,9 @@ int KFlush(unsigned char *msg)
 			if (!retf) return (FALSE);
 		}
 		KOut(ktemp);
-		if (DCol <= MAXCOLS) KOut(" ");
+		/* if (DCol <= MAXCOLS) KOut(" "); */
+		/* -pkv */
+		if (DCol <= MaxX) KOut(" ");
 		kptr = (unsigned char *)strtok(NULL," ");
 	}
 	KOut("\n");
@@ -757,6 +894,19 @@ void KOut(unsigned char *sout)
 			printf("%c%c",c1,c2);
 			i++;
 			break;
+
+#ifdef HAVE_LOCALE
+		case 3 : /* locale's character set */
+			if (c1 == 0x8f)
+			{
+				locale_output(3, c1, c2, sout[i+2]);
+				i+=2;
+				break;
+			}
+			locale_output(2, c1, c2, 0);
+			i++;
+			break;
+#endif /* HAVE_LOCALE */
 		}
 	}
 }
@@ -1949,7 +2099,7 @@ void DoJIS()
 
 /*=====  GetKBStr=== Collect ASCII or JIS string from keyboard=========*/
 
-void GetKBStr(unsigned char *prompt)
+void legacy_GetKBStr(unsigned char *prompt)
 {
 	int ShowIt,escf,bit8,i;
 	unsigned char c;
@@ -2019,6 +2169,200 @@ void GetKBStr(unsigned char *prompt)
 	printf("\n\r");
 }
 
+#ifdef HAVE_LOCALE
+/*=====  locale_GetKBStr === Collect a string in the locale's charset =======*/
+/* Reads bytes with getcharxx() and decodes them in the locale's character
+   set until newline, carriage return or NUL; the line is then stored in
+   fbuff as EUC-JP (through iconv unless the locale already uses EUC-JP).
+   prompt   - redrawn after a backspace or a line kill
+   specials - typing any of these characters returns it immediately
+   flags    - GETKBSTR_* bits that enable the extra keys handled below
+   Returns the special character typed, 4 or 26 for an enabled ^D or ^Z,
+   and 0 in every other case. */
+char locale_GetKBStr(unsigned char *prompt, const wchar_t *specials, int flags)
+{
+char c;
+char *source_charset;
+iconv_t descr;
+int length = 0;
+int i;
+mbstate_t instate;
+size_t result;
+int use_iconv;
+char *convert_buffer;
+wchar_t wbuf[512];
+char *inbuf_p, *outbuf_p;
+size_t inbytesleft, outbytesleft;
+
+	fbuff[0] = 0;
+
+	memset(&instate, 0, sizeof(instate));
+
+	/* the following calls setlocale() if it has not been done already */
+	source_charset = get_locale_charset();
+
+	for (;;) {
+		/* See if we can get a character */
+		c = getcharxx();
+		result = mbrtowc(&(wbuf[length]), &c, 1, &instate);
+		if (result == (size_t)-1) {
+			/* illegal byte sequence: reset the state and skip the byte */
+			memset(&instate, 0, sizeof(instate));
+			continue;
+		}
+		if (result == (size_t)-2) continue;	/* character not complete yet */
+		if (result == 0) break;			/* got NUL character */
+
+		if (wcschr(specials, wbuf[length])) {
+			/* XXX this is not a proper cast. It is a bug that I
+			   depend on this working */
+			return (char)(wbuf[length]);
+		} else if ((wbuf[length] == L'\n') || (wbuf[length] == L'\r')) {
+			break;
+		} else if ((wbuf[length] == L'\004') && (flags & GETKBSTR_CTRLD)) {
+			return 4;
+		} else if ((wbuf[length] == L'\032') && (flags & GETKBSTR_CTRLZ)) {
+			return 26;
+		} else if ((wbuf[length] == L'?') && (flags & GETKBSTR_ALLOW_HELP)) {
+			DRow = 0;
+			for (i = 0; strcmp(Help[i], "$$$")!=0;i++) {
+				strcpy(KLine, Help[i]);
+				if (!KFlush("Continue Help Display? (y/n)")) break;
+			}
+			return 0;
+		} else if (((wbuf[length] == L'@') || (wbuf[length] == L'#')) &&
+			(flags & GETKBSTR_ALLOW_ROMAJI)) {
+			DoRomaji((wbuf[length] == L'@') ? '@' : '#');
+			GetEUC(fbuff);
+			return 0;
+		} else if ((wbuf[length] == L'\010') || (wbuf[length] == L'\177')) {
+			/* backspace */
+			if (length) length--;
+			wbuf[length] = L'\0';
+
+			printf("\r%s%s%s%s%ls  ", RVon, prompt, RVoff, (flags &
+				GETKBSTR_SPACE_AFTER_PROMPT) ?
+				" " : "", wbuf);
+			fflush(stdout);
+			printf("\r%s%s%s%s%ls", RVon, prompt, RVoff, (flags &
+				GETKBSTR_SPACE_AFTER_PROMPT) ? " " : "", wbuf);
+			fflush(stdout);
+		} else if (wbuf[length] == L'\025') {
+			/* line kill */
+			/* send more erase sequences than we have to, to
+			   make sure wide characters get erased even on buggy
+			   terminals */
+			while (length--) printf("\b\b  \b\b");
+			length = 0;
+			printf("\r%s%s%s%s", RVon, prompt, RVoff, (flags &
+				GETKBSTR_SPACE_AFTER_PROMPT) ? " " : "");
+		} else if ((flags & GETKBSTR_ROMAJI_DEFAULT) &&
+			(((wbuf[length] >= L'a') && (wbuf[length] <= L'z')) ||
+			((wbuf[length] >= L'A') && (wbuf[length] <= L'Z')))) {
+				/* romaji mode by default, and character is a letter */
+				if ((wbuf[length] == L'L') || (wbuf[length] == L'l')) {
+					/* back to normal mode */
+					flags &= ~GETKBSTR_ROMAJI_DEFAULT;
+				} else {
+					ungetc((char)(wbuf[length]), stdin); /* XXX */
+					DoRomaji('@');
+					GetEUC(fbuff);
+					return 0;
+				}
+		} else if (iswprint(wbuf[length]) &&
+			(length < (int)(sizeof(wbuf)/sizeof(wbuf[0])) - 1)) {
+			/* echo and keep the character; once wbuf is full, further
+			   input is dropped instead of overflowing the buffer */
+			wbuf[length+1] = L'\0';
+			printf("%ls", wbuf+(length++));
+		}
+	}
+
+	/* Scratch space for one multibyte character. It is allocated before the
+	   iconv descriptor is opened so that a failure here leaks nothing. */
+	convert_buffer = malloc(MB_CUR_MAX+1);
+	if (!convert_buffer) {
+		strcpy(fbuff, "");	/* oops memory */
+		return 0;
+	}
+
+	use_iconv = 0;
+	if (source_charset && (strcmp(source_charset, "EUC-JP"))) {
+		descr = iconv_open("EUC-JP", source_charset);
+		if (descr != (iconv_t)-1) use_iconv = 1;
+	}
+
+	memset(&instate, 0, sizeof(instate));
+
+	outbuf_p = fbuff;
+	outbytesleft = sizeof(fbuff)-1;
+
+	/* Re-encode each wide character as multibyte, then as EUC-JP into fbuff */
+	for (i = 0; i < length; i++) {
+		result = wcrtomb(convert_buffer, wbuf[i], &instate);
+		if (result == (size_t)-1) break;	/* not representable: stop here */
+
+		if (use_iconv) {
+			inbuf_p = convert_buffer;
+			inbytesleft = result;
+			if (iconv(descr, &inbuf_p, &inbytesleft, &outbuf_p, &outbytesleft) == (size_t)-1)
+				break;
+		} else {
+			if (outbytesleft < result) break;
+			memcpy(outbuf_p, convert_buffer, result);
+			outbuf_p += result;
+			outbytesleft -= result;
+		}
+	}
+	*outbuf_p = 0;
+
+	if (use_iconv) iconv_close(descr);
+	free(convert_buffer);
+
+	return 0;
+}
+
+/* convert_to_euc: store in out (a buffer of outlen bytes) the EUC-JP form of
+   the locale-encoded string in, or a plain copy when no conversion is needed
+   or available. Always NUL-terminates; what does not fit or convert is cut. */
+void convert_to_euc(char *in, char *out, size_t outlen)
+{
+iconv_t descr;
+int use_iconv = 0;
+char *source_charset;
+char *inbuf_p;
+size_t inbytesleft;
+
+	if (!outlen) return;	/* no room even for the terminating NUL */
+	source_charset = get_locale_charset();
+	if (source_charset && (strcmp(source_charset, "EUC-JP"))) {
+		descr = iconv_open("EUC-JP", source_charset);
+		if (descr != (iconv_t)-1) use_iconv = 1;
+	}
+	if (!use_iconv) {
+		snprintf(out, outlen, "%s", in);	/* snprintf always NUL-terminates */
+		return;
+	}
+	inbuf_p = in;
+	inbytesleft = strlen(in);
+	outlen--;	/* keep one byte back for the terminating NUL */
+	iconv(descr, &inbuf_p, &inbytesleft, &out, &outlen);
+	*out = 0;
+	iconv_close(descr);
+}
+#endif /* HAVE_LOCALE */
+
+void GetKBStr(unsigned char *prompt)
+{
+#ifdef HAVE_LOCALE
+	/* locale-aware reader, unless -O selected the legacy code */
+	if (new_input_mode) locale_GetKBStr(prompt, L"", 0);
+	else
+#endif
+	legacy_GetKBStr(prompt);
+	fputs("\r\n", stdout);	/* finish the input line on the terminal */
+}
+
 /*=====  OneShot === Collect and set single filter=============*/
 
 void OneShot()
@@ -2690,6 +3034,13 @@ main(int argc,char **argv)
 					Omode = 1;
 					printf("Output mode set to EUC\n");
 				}
+#ifdef HAVE_LOCALE
+				if (strtmp[0] == 'l')
+				{
+					Omode = 3;
+					printf("Output mode set to locale dependant\n");
+				}
+#endif
 				continue;
 			}
 #ifdef XJDCLSERV
@@ -2802,6 +3153,16 @@ main(int argc,char **argv)
 				NoSJIS = TRUE;
 				printf("EUC (No Shift-JIS) operation enforced\n");
 			}
+			if ((xap[0] == '-') && (xap[1] == 'O'))
+			{
+#ifdef HAVE_LOCALE
+				new_input_mode = 0;
+				if (Omode == 3) Omode = 0;
+				printf("Legacy input/output mode selected (no locale support)\n");
+#else
+				printf("Locale support not compiled; -O ignored\n");
+#endif
+			}
 			if ((xap[0] == '-') && (xap[1] == 'v'))
 			{
 				Jverb = FALSE;
@@ -2853,7 +3214,7 @@ exit(0);
 	{
 		GetWinSize(); /* Just in case the screen has changed  */
 		sprintf(kbprompt,"%sXJDIC [%d:%s] SEARCH KEY:%s ",RVon,CurrDic,DicName(CurrDic),RVoff);
-		sprintf(kbprompt2,"XJDIC [%d:%s] SEARCH KEY: ",CurrDic,DicName(CurrDic));
+		sprintf(kbprompt2,"XJDIC [%d:%s] SEARCH KEY:",CurrDic,DicName(CurrDic));
 		if (GDmode)
 		{
 			sprintf(kbprompt,"%sXJDIC [GLOBAL] SEARCH KEY:%s ",RVon,RVoff);
@@ -2862,6 +3223,9 @@ exit(0);
 		printf("\n\r%s",kbprompt);
 		c = 0;
 		cmdmode = FALSE;
+#ifdef HAVE_LOCALE
+if (!new_input_mode) {
+#endif
 		strf = FALSE;
 		escf = FALSE;
 		bit8 = FALSE;
@@ -2970,8 +3334,33 @@ exit(0);
 			if ((instr[i] == 'B')&&(instr[i-1] == '(')&&(instr[i-2] == 0x1b)) break;
 		}
 		fseek(stdin,0L,SEEK_END); /*kill any leftovers*/
+		GetEUC(fbuff);
+#ifdef HAVE_LOCALE
+} else {	/* if (!new_input_mode) */
+		/* new locale based code */
+		if (!clipmode) {
+			c = locale_GetKBStr(kbprompt2, L"!{}$%*&^=/-:\'+\\;][|_`",
+				GETKBSTR_SPACE_AFTER_PROMPT|GETKBSTR_ALLOW_HELP|
+				((KImode == 0) ? GETKBSTR_ROMAJI_DEFAULT : 0)|
+				GETKBSTR_ALLOW_ROMAJI|GETKBSTR_CTRLD|GETKBSTR_CTRLZ);
+
+			if (c > 0) cmdmode = TRUE;
+			if (c == 4) {
+				cbreakoff();
+				exit(0);
+			} else if (c == 26) {
+				cbreakoff();
+				printf("\nSuspending XJDIC. Type `fg' to resume.\n");
+				pid = getpid();
+				kill(pid,sig);
+				cbreakon();
+				cmdmode = FALSE;
+			}
+		}
+}
+#endif /* HAVE_LOCALE */
 		/* "bye" is the end of the run			*/
-		if ((instr[2] == 'e')&&(instr[1] == 'y')&&(instr[0] == 'b')) 
+		if ((fbuff[2] == 'e')&&(fbuff[1] == 'y')&&(fbuff[0] == 'b')) 
 		{
 			cbreakoff();
 			exit(0);
@@ -2986,7 +3375,7 @@ exit(0);
 				clipmode = TRUE;
 				continue;
 			}
-			if (c == '}')   /* matching {  */
+			if /* { */ (c == '}')
 			{
 				printf("\r                                      \r");
 				RVtoggle();
@@ -3192,6 +3581,7 @@ exit(0);
 					DoKANJI();
 					break;
 			}
+			continue;
 		}
  		if (clipmode)
  		{
@@ -3210,12 +3600,6 @@ exit(0);
  				fgets(clipstring1,50,fclip);
  				fclose(fclip);
  				if (clipstring1[strlen(clipstring1)-1] < 32) clipstring1[strlen(clipstring1)-1] = 0;
- 				if (strcmp(clipstring1,"quit") == 0)
- 				{
- 					clipmode = FALSE;
- 					printf("\nLeaving Clipboard mode\n");
- 					break;
- 				}
  				if (strcmp(clipstring1,clipstring2) == 0)
  				{
  					continue;
@@ -3224,13 +3608,30 @@ exit(0);
  				{
  					strcpy(clipstring2,clipstring1);
  					strcpy(instr,clipstring1);
+#ifdef HAVE_LOCALE
+					if (new_input_mode)
+						convert_to_euc(instr, fbuff, sizeof(fbuff));
+					else
+#endif
+						GetEUC(fbuff);
+ 					if (strcmp(fbuff, "quit") == 0) {
+ 						clipmode = FALSE;
+ 						printf("\nLeaving Clipboard mode\n");
+						fbuff[0] = 0;
+ 						break;
+ 					}
  					break;
  				}
 			}
 		}
-		if(strlen(instr) < 2) continue;
-		GetEUC(fbuff);
-		if (escf) KOut(fbuff);
+#ifdef HAVE_LOCALE
+		if (!new_input_mode) {
+#endif
+			if (escf) KOut(fbuff);
+#ifdef HAVE_LOCALE
+		}
+#endif
+		if(strlen(fbuff) < 2) continue;
 		snprintf(tempout,sizeof(tempout),"\nSearching for: %s%s%s\n",RVon,fbuff,RVoff);
 		KOut(tempout);
 		Dmode = 0;
diff --git a/xjdic.1 b/xjdic.1
index 3a812ac..4de9f15 100644
--- a/xjdic.1
+++ b/xjdic.1
@@ -115,11 +115,22 @@ specify a dictionary file to use (up to 9 may be specified.)
 specify a kanji data file to use.
 
 .At
-.B -j j/e/s 
+.B -j j/e/s/l 
 [CL,SA]
 .Ap
 Specify the output coding for Japanese text (j=JIS, e=EUC, s=Shift-JIS)
 
+l=Locale based output. Output will be according to the character set
+specified by the current system locale.
+
+.At
+.B -O
+[CL,SA]
+.Ap
+Request the old input code. Also selects -j j (which controls output) unless
+overridden. The old code does not respect the current locale but it does
+EUC/JIS detection on input.
+
 .At
 .B -P port_no 
 [CL,SV]

commit 47f2736e9114fc1ac9e7074181374b56ee29f20c
Author: Frédéric Brière <fbri...@fbriere.net>
Date:   Tue Oct 27 11:30:11 2015 -0400

    iconv() requires size_t, not int

diff --git a/xjdfrontend.c b/xjdfrontend.c
index 3b3e86d..200a065 100644
--- a/xjdfrontend.c
+++ b/xjdfrontend.c
@@ -2162,7 +2162,7 @@ int use_iconv;
 char *convert_buffer;
 wchar_t wbuf[512];
 char *inbuf_p, *outbuf_p;
-int inbytesleft, outbytesleft;
+size_t inbytesleft, outbytesleft;
 
 	fbuff[0] = 0;
 
@@ -2301,13 +2301,13 @@ int inbytesleft, outbytesleft;
 }
 
 /* output in fbuff */
-void convert_to_euc(char *in, char *out, int outlen)
+void convert_to_euc(char *in, char *out, size_t outlen)
 {
 iconv_t descr;
 int use_iconv;
 char *source_charset;
 char *inbuf_p;
-int inbytesleft;
+size_t inbytesleft;
 
 	source_charset = get_locale_charset();
 	if (source_charset && (strcmp(source_charset, "EUC-JP"))) {

Bug#230695: xjdic: Doesn't work with UTF-8

Reply via email to