On Wed, Aug 08, 2007 at 01:57:34PM -0400, Frédéric Brière wrote:
> I was thinking of giving this issue a try when a little Googling turned
> up a very thorough patch from Phillip Vandry that makes xjdic
> locale-aware: <http://tzone.org/~vandry/xjdic/xjdic-24.locale.patch>.
There's actually a bug in the original patch which can trigger a segfault on amd64: it declares the byte counters passed to iconv() as int, but iconv() requires size_t, and the two types differ in size on that platform. Here's a corrected version, refreshed to apply to 24-10; I'm also including a stand-alone patch for the bugfix itself.
diff --git a/xjdfrontend.c b/xjdfrontend.c index 58cc95f..c4135a3 100644 --- a/xjdfrontend.c +++ b/xjdfrontend.c @@ -30,6 +30,15 @@ #include <signal.h> #include <errno.h> #include <unistd.h> + +#define HAVE_LOCALE +#ifdef HAVE_LOCALE +#include <locale.h> +#include <iconv.h> +#include <langinfo.h> +#include <wchar.h> +#endif + #include "xjdic.h" /* Paul Burchard supplied a patch to provide BSD compatibility for xjdic @@ -100,7 +109,13 @@ int ShiftJIS = FALSE,NoSJIS=FALSE; unsigned char instr[256],radkanj[250][2]; int radnos[250]; unsigned char kanatab[NRKANA*2][7]; -int Omode = 0,Smode = 0,Dmode = 0,AKanaMode; +#ifdef HAVE_LOCALE +int Omode = 3; +int new_input_mode = 1; +#else +int Omode = 0; +#endif +int Smode = 0,Dmode = 0,AKanaMode; int DRow,DCol,MaxY=MAXLINES,MaxX=MAXCOLS-1,KFlushRes,nok; unsigned long hittab[NOHITS]; int verblen,DispHit,ksp,hitind,FirstKanj = 0,prieng = FALSE,Extopen=FALSE,NoSkip; @@ -178,6 +193,13 @@ int RVACTIVE = TRUE; int DicNum; long DicLoc; +#define GETKBSTR_SPACE_AFTER_PROMPT 1 +#define GETKBSTR_ALLOW_HELP 2 +#define GETKBSTR_ALLOW_ROMAJI 4 +#define GETKBSTR_CTRLD 8 +#define GETKBSTR_CTRLZ 16 +#define GETKBSTR_ROMAJI_DEFAULT 32 + /*====== Prototypes========================================================*/ FILE *xfopen(char *file_name, char *file_mode, int *xfilelen); @@ -627,6 +649,104 @@ void jis2sjis(unsigned char *p1,unsigned char *p2) /* courtesy of Ken Lunde */ *p2 = c2 + cellOffset; } +#ifdef HAVE_LOCALE +/* + * Locale support: + * + * Because xjdic uses EUC internally extensively, we only convert + * from EUC to the locale character set on output and convert to + * EUC on input. iconv is used for this. The input code (see + * locale_GetKBStr) keeps the entire input string in a wide + * character array and convert it to EUC-JP before returning it. 
+ * -pkv Tue Sep 23 15:45:59 EDT 2003 + */ +static char *get_locale_charset(void) +{ +static int locale_initted = 0; +static char *locale_charset = NULL; + + if (!locale_initted) { + setlocale(LC_CTYPE, ""); + locale_initted = 1; + } + if (!locale_charset) + locale_charset = nl_langinfo(CODESET); + + return locale_charset; +} + +/*======locale_output (convert EUC to current locale's charset) =======*/ +void locale_output(unsigned int length, unsigned char c1, unsigned char c2, unsigned char c3) +{ +static int iconv_initted = 0; +static int conversion_failed = 0; +static iconv_t descr; +char *target_charset; +char inbuf[4]; +char outbuf[64]; /* this better be big enough */ +char *inbuf_p, *outbuf_p; +size_t inbytesleft, outbytesleft; + + inbuf[0] = c1; + inbuf[1] = c2; + inbuf[2] = c3; + if (conversion_failed) { + fwrite(&(inbuf[0]), 1, length, stdout); + return; + } + if (!(target_charset = get_locale_charset())) { + fprintf(stderr, "locale does not specify a target charset, using EUC-JP!\n"); + conversion_failed = 1; + locale_output(length, c1, c2, c3); + return; + } + + if (!strcmp(target_charset, "EUC-JP")) { + /* if no conversion is required, then pretent conversion failed. This + will cause the data to be passed straight through and may be more + efficient than calling iconv with an identity descriptor */ + conversion_failed = 1; + locale_output(length, c1, c2, c3); + return; + } + if (!iconv_initted) { + descr = iconv_open(target_charset, "EUC-JP"); + if (descr == (iconv_t)-1) { + fprintf(stderr, "conversion from EUC-JP to %s not supported. using EUC-JP.\n", target_charset); + conversion_failed = 1; + locale_output(length, c1, c2, c3); + return; + } + iconv_initted = 1; + } + + inbuf_p = &(inbuf[0]); + outbuf_p = &(outbuf[0]); + inbytesleft = length; + outbytesleft = sizeof(outbuf); + /* The caller is supposed to provide a valid, complete multibyte sequence as */ + /* input so we will ignore errors concerning invalid input. 
And if the output */ + /* buffer is not big enough, let's just fail, so don't check for that either */ + + iconv(descr, &inbuf_p, &inbytesleft, &outbuf_p, &outbytesleft); + + fwrite(&(outbuf[0]), 1, outbuf_p - &(outbuf[0]), stdout); + + /* This function might not be called again to output the next character + so put the output back into the initial state. This is wasteful for + character sets that need to use shift sequences to enter and exit + Kanji mode (ISO-2022) but xjdic already has that problem, and besides, + this has no effect if the character encoding is something like UTF-8 */ + + inbytesleft = 0; + outbytesleft = sizeof(outbuf); + outbuf_p = &(outbuf[0]); + iconv(descr, NULL, &inbytesleft, &outbuf_p, &outbytesleft); + if (outbuf_p - &(outbuf[0])) + fwrite(&(outbuf[0]), 1, outbuf_p - &(outbuf[0]), stdout); +} +#endif /* HAVE_LOCALE */ + /*====KEOS===End of screen processing for KFlush==================*/ int KEOS (unsigned char *msg) { @@ -654,6 +774,7 @@ int KEOS (unsigned char *msg) int KFlush(unsigned char *msg) { unsigned char *kptr,ktemp[512]; + unsigned char *p; int retf,it,j; int Test; @@ -677,6 +798,20 @@ int KFlush(unsigned char *msg) strcpy(ktemp,ktemp+1); } it = strlen(ktemp); + + /* Look for instances of RVon and RVoff inside the string. 
*/ + /* These do not consume any columns -pkv */ + p = ktemp; + while (p && (*p) && (p = strstr(p, RVon))) { + p += strlen(RVon); + it -= strlen(RVon); + } + p = ktemp; + while (p && (*p) && (p = strstr(p, RVoff))) { + p += strlen(RVoff); + it -= strlen(RVoff); + } + if (DCol+it < Test) { DCol = DCol+it+1; @@ -690,7 +825,9 @@ int KFlush(unsigned char *msg) if (!retf) return (FALSE); } KOut(ktemp); - if (DCol <= MAXCOLS) KOut(" "); + /* if (DCol <= MAXCOLS) KOut(" "); */ + /* -pkv */ + if (DCol <= MaxX) KOut(" "); kptr = (unsigned char *)strtok(NULL," "); } KOut("\n"); @@ -757,6 +894,19 @@ void KOut(unsigned char *sout) printf("%c%c",c1,c2); i++; break; + +#ifdef HAVE_LOCALE + case 3 : /* locale's character set */ + if (c1 == 0x8f) + { + locale_output(3, c1, c2, sout[i+2]); + i+=2; + break; + } + locale_output(2, c1, c2, 0); + i++; + break; +#endif /* HAVE_LOCALE */ } } } @@ -1949,7 +2099,7 @@ void DoJIS() /*===== GetKBStr=== Collect ASCII or JIS string from keyboard=========*/ -void GetKBStr(unsigned char *prompt) +void legacy_GetKBStr(unsigned char *prompt) { int ShowIt,escf,bit8,i; unsigned char c; @@ -2019,6 +2169,200 @@ void GetKBStr(unsigned char *prompt) printf("\n\r"); } +#ifdef HAVE_LOCALE +char locale_GetKBStr(unsigned char *prompt, const wchar_t *specials, int flags) +{ +char c; +char *source_charset; +iconv_t descr; +int length = 0; +int done = 0; +int i; +mbstate_t instate; +size_t result; +int use_iconv; +char *convert_buffer; +wchar_t wbuf[512]; +char *inbuf_p, *outbuf_p; +size_t inbytesleft, outbytesleft; + + fbuff[0] = 0; + + memset(&instate, 0, sizeof(instate)); + + /* the following called setlocale() if it has not been done already */ + source_charset = get_locale_charset(); + + while (!done) { + /* See if we can get a character */ + c = getcharxx(); + result = mbrtowc(&(wbuf[length]), &c, 1, &instate); + if (result == -1) { + /* illegal byte sequence */ + memset(&instate, 0, sizeof(instate)); /* reset state */ + /* skip byte */ + continue; + } 
else if (result == -2) { + continue; + } else if (result == 0) { + /* Got NULL character */ + done = 1; + break; + } else { + if (wcschr(specials, wbuf[length])) { + /* XXX this is not a proper cast. It is a bug that I + depend on this working */ + return (char)(wbuf[length]); + } else if ((wbuf[length] == L'\n') || (wbuf[length] == L'\r')) { + done = 1; + break; + } else if ((wbuf[length] == L'\004') && (flags & GETKBSTR_CTRLD)) { + return 4; + } else if ((wbuf[length] == L'\032') && (flags & GETKBSTR_CTRLZ)) { + return 26; + } else if ((wbuf[length] == L'?') && (flags & GETKBSTR_ALLOW_HELP)) { + DRow = 0; + for (i = 0; strcmp(Help[i], "$$$")!=0;i++) { + strcpy(KLine, Help[i]); + if (!KFlush("Continue Help Display? (y/n)")) break; + } + return 0; + } else if ((wbuf[length] == L'@') && (flags & GETKBSTR_ALLOW_ROMAJI)) { + DoRomaji('@'); + GetEUC(fbuff); + return 0; + } else if ((wbuf[length] == L'#') && (flags & GETKBSTR_ALLOW_ROMAJI)) { + DoRomaji('#'); + GetEUC(fbuff); + return 0; + } else if ((wbuf[length] == L'\010') || (wbuf[length] == L'\177')) { + /* backspace */ + if (length) length--; + wbuf[length] = L'\0'; + + printf("\r%s%s%s%s%ls ", RVon, prompt, RVoff, (flags & + GETKBSTR_SPACE_AFTER_PROMPT) ? + " " : "", wbuf); + fflush(stdout); + printf("\r%s%s%s%s%ls", RVon, prompt, RVoff, (flags & + GETKBSTR_SPACE_AFTER_PROMPT) ? " " : "", wbuf); + fflush(stdout); + } else if (wbuf[length] == L'\025') { + /* line kill */ + /* send more erase sequences than we have to, to + make sure wide characters get erased even on buggy + terminals */ + while (length--) printf("\b\b \b\b"); + length = 0; + printf("\r%s%s%s%s", RVon, prompt, RVoff, (flags & + GETKBSTR_SPACE_AFTER_PROMPT) ? 
" " : ""); + } else if ((flags & GETKBSTR_ROMAJI_DEFAULT) && + (((wbuf[length] >= L'a') && (wbuf[length] <= L'z')) || + ((wbuf[length] >= L'A') && (wbuf[length] <= L'Z')))) { + /* romaji mode by default, and character is a letter */ + if ((wbuf[length] == L'L') || (wbuf[length] == L'l')) { + /* back to normal mode */ + flags &= ~GETKBSTR_ROMAJI_DEFAULT; + } else { + ungetc((char)(wbuf[length]), stdin); /* XXX */ + DoRomaji('@'); + GetEUC(fbuff); + return 0; + } + } else if (iswprint(wbuf[length])) { + wbuf[length+1] = L'\0'; + printf("%ls", wbuf+(length++)); + } + } + } + + if (source_charset && (strcmp(source_charset, "EUC-JP"))) { + descr = iconv_open("EUC-JP", source_charset); + if (descr == (iconv_t)-1) use_iconv = 0; + else use_iconv = 1; + } else { + use_iconv = 0; + } + + convert_buffer = malloc(MB_CUR_MAX+1); + if (!convert_buffer) { + strcpy(fbuff, ""); /* oops memory */ + return 0; + } + + memset(&instate, 0, sizeof(instate)); + + outbuf_p = fbuff; + outbytesleft = sizeof(fbuff)-1; + + done = 0; + for (i = 0; i < length; i++) { + result = wcrtomb(convert_buffer, wbuf[i], &instate); + + if (use_iconv) { + inbuf_p = convert_buffer; + inbytesleft = result; + if (iconv(descr, &inbuf_p, &inbytesleft, &outbuf_p, &outbytesleft) == -1) + break; + } else { + if (!outbytesleft) break; + if (outbytesleft >= result) { + memcpy(outbuf_p, convert_buffer, result); + outbuf_p += result; + outbytesleft -= result; + } else { + break; + } + } + } + *outbuf_p = 0; + + if (use_iconv) iconv_close(descr); + + return 0; +} + +/* output in fbuff */ +void convert_to_euc(char *in, char *out, size_t outlen) +{ +iconv_t descr; +int use_iconv; +char *source_charset; +char *inbuf_p; +size_t inbytesleft; + + source_charset = get_locale_charset(); + if (source_charset && (strcmp(source_charset, "EUC-JP"))) { + descr = iconv_open("EUC-JP", source_charset); + if (descr == (iconv_t)-1) use_iconv = 0; + else use_iconv = 1; + } else { + use_iconv = 0; + } + if (!use_iconv) { + strncpy(out, 
in, outlen); + return; + } + + inbytesleft = strlen(in); + iconv(descr, &in, &inbytesleft, &out, &outlen); + *out = 0; + iconv_close(descr); + return; +} +#endif /* HAVE_LOCALE */ + +void GetKBStr(unsigned char *prompt) +{ +#ifdef HAVE_LOCALE + if (new_input_mode) + locale_GetKBStr(prompt, L"", 0); + else +#endif + legacy_GetKBStr(prompt); + printf("\r\n"); +} + /*===== OneShot === Collect and set single filter=============*/ void OneShot() @@ -2690,6 +3034,13 @@ main(int argc,char **argv) Omode = 1; printf("Output mode set to EUC\n"); } +#ifdef HAVE_LOCALE + if (strtmp[0] == 'l') + { + Omode = 3; + printf("Output mode set to locale dependant\n"); + } +#endif continue; } #ifdef XJDCLSERV @@ -2802,6 +3153,16 @@ main(int argc,char **argv) NoSJIS = TRUE; printf("EUC (No Shift-JIS) operation enforced\n"); } + if ((xap[0] == '-') && (xap[1] == 'O')) + { +#ifdef HAVE_LOCALE + new_input_mode = 0; + if (Omode == 3) Omode = 0; + printf("Legacy input/output mode selected (no locale support)\n"); +#else + printf("Locale support not compiled; -O ignored\n"); +#endif + } if ((xap[0] == '-') && (xap[1] == 'v')) { Jverb = FALSE; @@ -2853,7 +3214,7 @@ exit(0); { GetWinSize(); /* Just in case the screen has changed */ sprintf(kbprompt,"%sXJDIC [%d:%s] SEARCH KEY:%s ",RVon,CurrDic,DicName(CurrDic),RVoff); - sprintf(kbprompt2,"XJDIC [%d:%s] SEARCH KEY: ",CurrDic,DicName(CurrDic)); + sprintf(kbprompt2,"XJDIC [%d:%s] SEARCH KEY:",CurrDic,DicName(CurrDic)); if (GDmode) { sprintf(kbprompt,"%sXJDIC [GLOBAL] SEARCH KEY:%s ",RVon,RVoff); @@ -2862,6 +3223,9 @@ exit(0); printf("\n\r%s",kbprompt); c = 0; cmdmode = FALSE; +#ifdef HAVE_LOCALE +if (!new_input_mode) { +#endif strf = FALSE; escf = FALSE; bit8 = FALSE; @@ -2970,8 +3334,33 @@ exit(0); if ((instr[i] == 'B')&&(instr[i-1] == '(')&&(instr[i-2] == 0x1b)) break; } fseek(stdin,0L,SEEK_END); /*kill any leftovers*/ + GetEUC(fbuff); +#ifdef HAVE_LOCALE +} else { /* if (!new_input_mode) */ + /* new locale based code */ + if (!clipmode) { + c = 
locale_GetKBStr(kbprompt2, L"!{}$%*&^=/-:\'+\\;][|_`", + GETKBSTR_SPACE_AFTER_PROMPT|GETKBSTR_ALLOW_HELP| + ((KImode == 0) ? GETKBSTR_ROMAJI_DEFAULT : 0)| + GETKBSTR_ALLOW_ROMAJI|GETKBSTR_CTRLD|GETKBSTR_CTRLZ); + + if (c > 0) cmdmode = TRUE; + if (c == 4) { + cbreakoff(); + exit(0); + } else if (c == 26) { + cbreakoff(); + printf("\nSuspending XJDIC. Type `fg' to resume.\n"); + pid = getpid(); + kill(pid,sig); + cbreakon(); + cmdmode = FALSE; + } + } +} +#endif /* HAVE_LOCALE */ /* "bye" is the end of the run */ - if ((instr[2] == 'e')&&(instr[1] == 'y')&&(instr[0] == 'b')) + if ((fbuff[2] == 'e')&&(fbuff[1] == 'y')&&(fbuff[0] == 'b')) { cbreakoff(); exit(0); @@ -2986,7 +3375,7 @@ exit(0); clipmode = TRUE; continue; } - if (c == '}') /* matching { */ + if /* { */ (c == '}') { printf("\r \r"); RVtoggle(); @@ -3192,6 +3581,7 @@ exit(0); DoKANJI(); break; } + continue; } if (clipmode) { @@ -3210,12 +3600,6 @@ exit(0); fgets(clipstring1,50,fclip); fclose(fclip); if (clipstring1[strlen(clipstring1)-1] < 32) clipstring1[strlen(clipstring1)-1] = 0; - if (strcmp(clipstring1,"quit") == 0) - { - clipmode = FALSE; - printf("\nLeaving Clipboard mode\n"); - break; - } if (strcmp(clipstring1,clipstring2) == 0) { continue; @@ -3224,13 +3608,30 @@ exit(0); { strcpy(clipstring2,clipstring1); strcpy(instr,clipstring1); +#ifdef HAVE_LOCALE + if (new_input_mode) + convert_to_euc(instr, fbuff, sizeof(fbuff)); + else +#endif + GetEUC(fbuff); + if (strcmp(fbuff, "quit") == 0) { + clipmode = FALSE; + printf("\nLeaving Clipboard mode\n"); + fbuff[0] = 0; + break; + } break; } } } - if(strlen(instr) < 2) continue; - GetEUC(fbuff); - if (escf) KOut(fbuff); +#ifdef HAVE_LOCALE + if (!new_input_mode) { +#endif + if (escf) KOut(fbuff); +#ifdef HAVE_LOCALE + } +#endif + if(strlen(fbuff) < 2) continue; snprintf(tempout,sizeof(tempout),"\nSearching for: %s%s%s\n",RVon,fbuff,RVoff); KOut(tempout); Dmode = 0; diff --git a/xjdic.1 b/xjdic.1 index 3a812ac..4de9f15 100644 --- a/xjdic.1 +++ b/xjdic.1 @@ 
-115,11 +115,22 @@ specify a dictionary file to use (up to 9 may be specified.) specify a kanji data file to use. .At -.B -j j/e/s +.B -j j/e/s/l [CL,SA] .Ap Specify the output coding for Japanese text (j=JIS, e=EUC, s=Shift-JIS) +l=Locale based output. Output will be according to the character set +specified by the current system locale. + +.At +.B -O +[CL,SA] +.Ap +Request the old input code. Also selects -j j (which controls output) unless +overridden. The old code does not respect the current locale but it does +EUC/JIS detection on input. + .At .B -P port_no [CL,SV]
commit 47f2736e9114fc1ac9e7074181374b56ee29f20c Author: Frédéric Brière <fbri...@fbriere.net> Date: Tue Oct 27 11:30:11 2015 -0400 iconv() requires size_t, not int diff --git a/xjdfrontend.c b/xjdfrontend.c index 3b3e86d..200a065 100644 --- a/xjdfrontend.c +++ b/xjdfrontend.c @@ -2162,7 +2162,7 @@ int use_iconv; char *convert_buffer; wchar_t wbuf[512]; char *inbuf_p, *outbuf_p; -int inbytesleft, outbytesleft; +size_t inbytesleft, outbytesleft; fbuff[0] = 0; @@ -2301,13 +2301,13 @@ int inbytesleft, outbytesleft; } /* output in fbuff */ -void convert_to_euc(char *in, char *out, int outlen) +void convert_to_euc(char *in, char *out, size_t outlen) { iconv_t descr; int use_iconv; char *source_charset; char *inbuf_p; -int inbytesleft; +size_t inbytesleft; source_charset = get_locale_charset(); if (source_charset && (strcmp(source_charset, "EUC-JP"))) {