April Chin wrote: > > April Chin wrote: > > > > ksh93 already has support for I18N and for multibyte character > > > > handling. If there is need for change, it should be minimal > > > > since currently all error message translation goes through a single > > > > interface. > > > > > > > > The multibyte character handling uses the POSIX mb*() interface. > > > I believe in ksh93 this may not be working in all respects. > > > An i18n engineer and I tried a few manual tests on ksh93 with > > > multibyte characters, and ksh93 did not appear to recognize some of > > > them which may have included an ASCII byte within the multibyte character. > > > > April - did this problem occur only in interactive terminal mode or even > > when a script writes the japanese/ASCII text mixture (e.g. % cat > > "ksh93_echo_japanese.ksh93" | ksh # ) ? > > The test was done via a ksh93 script. > On a system with the Japanese locales installed, I set up > an appropriate locale: > > % setenv LANG ja_JP.PCK > > I input a file (sjis.dat, attached below) containing multibyte > characters, including ones with an ASCII byte, to a ksh93 script which > reads and echoes each line, and then each word in that line. > > % cat sjis.dat | test.sh > > where test.sh contains: > #!/usr/bin/ksh93 > read a > echo $a > > for b in $a ; > do > echo $b; > done > > Doing the same test with the current Solaris ksh, instead of ksh93, > output all the characters as expected. ksh93 was able to process > 2 out of 3 multibyte characters which contained an ASCII component.
See below (and the attached patch... once you see the fix it is quite obvious what is going wrong... :-) ) ... > > Linux may suffer from a similar problem, please read > > https://mailman.research.att.com/pipermail/ast-users/2006q1/000838.html > > and > > https://mailman.research.att.com/pipermail/ast-users/2006q1/000839.html > > > > BTW: Which terminal emulator did you use ? Gnome terminal, kconsole, > > dtterm or xterm ? > > The i18n engineer provided me with a terminal emulator for which > I could turn on sjis mode, the newer mode which will > accept multibyte characters which may have an ASCII component. > It sounds like the Linux problem is related to the terminal emulator. No, the terminal emulator (KDE "konsole") was not the problem (its developers tested that explicitly) - it seems to be a problem in ksh93r itself. Attached is a patch ("ksh93-shift_ijs.diff.txt" ; this patch applies against ksh93r) written by Werner Fink/SuSE which fixes the problem for both SuSE Linux and Solaris 10 (I tested this with en_US.UTF-8 and Japanese text; for ja_JP.PCK I have to wait until next week when the student who knows this locale can run some quick tests herself...). Could you please test whether the problems on your side are now fixed ? The only remaining question now (assuming the patch works): Can we somehow make an automated test script for this kind of failure (I'd like to see this integrated into the ksh93 test suite) ? ---- Bye, Roland -- __ . . __ (o.\ \/ /.o) roland.mainz at nrubsig.org \__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer /O /==\ O\ TEL +49 641 7950090 (;O/ \/ \O;) -------------- next part -------------- --- src/cmd/ksh93/sh/lex.c +++ src/cmd/ksh93/sh/lex.c 2006-04-06 15:58:08.000000000 +0000 @@ -293,11 +293,14 @@ { switch(*len = mbsize(_Fcin.fcptr)) { - case -1: /* bogus multiByte char - parse as bytes? */ - case 0: /* NULL byte */ + case -1: /* bogus multiByte char - parse as bytes? 
*/ + case 0: /* NULL byte */ + *len = 1; case 1: - lexState = state[curChar=fcget()]; - break; + if ((curChar = fcget()) == 0) + curChar = fcfill(); + lexState = state[curChar]; + break; default: /* * None of the state tables contain entries @@ -1596,6 +1599,36 @@ { if(n!=S_NL) { +#if SHOPT_MULTIBYTE + if(mbwide()) + { + do + { + ssize_t len; + switch((len = mbsize(_Fcin.fcptr))) + { + case -1: /* bogus multiByte char - parse as bytes? */ + case 0: /* NULL byte */ + case 1: + n = state[fcget()]; + break; + default: + /* + * None of the state tables contain + * entries for multibyte characters, + * however, they should be treated + * the same as any other alph + * character. Therefore, we'll use + * the state of the 'a' character. + */ + mbchar(_Fcin.fcptr); + n = state['a']; + } + } + while(n == 0); + } + else +#endif /* SHOPT_MULTIBYTE */ /* skip over regular characters */ while((n=state[fcget()])==0); } --- src/cmd/ksh93/sh/macro.c +++ src/cmd/ksh93/sh/macro.c 2006-04-06 16:02:40.000000000 +0000 @@ -266,7 +266,38 @@ cp = fcseek(0); while(1) { +#if SHOPT_MULTIBYTE + if(mbwide()) + { + do + { + ssize_t len; + switch((len = mbsize(cp))) + { + case -1: /* bogus multiByte char - parse as bytes? */ + case 0: /* NULL byte */ + case 1: + n = state[*(unsigned char*)cp++]; + break; + default: + /* + * None of the state tables contain + * entries for multibyte characters, + * however, they should be treated + * the same as any other alph + * character. Therefore, we'll use + * the state of the 'a' character. 
+ */ + cp += len; + n = state['a']; + } + } + while(n == 0); + } + else +#endif /* SHOPT_MULTIBYTE */ while((n=state[*(unsigned char*)cp++])==0); + if(n==S_NL || n==S_QUOTE || n==S_RBRA) continue; if(c=(cp-1)-fcseek(0)) @@ -395,8 +426,42 @@ cp++; while(1) { - while((n=state[*(unsigned char*)cp++])==0); - c = (cp-1) - first; +#if SHOPT_MULTIBYTE + if (mbwide()) + { + ssize_t len; + do + { + switch((len = mbsize(cp))) + { + case -1: /* bogus multiByte char - parse as bytes? */ + case 0: /* NULL byte */ + len = 1; + case 1: + n = state[*(unsigned char*)cp++]; + break; + default: + /* + * None of the state tables contain entries + * for multibyte characters. However, they + * should be treated the same as any other + * alpha character, so we'll use the state + * which would normally be assigned to the + * 'a' character. + */ + cp += len; + n = state['a']; + } + } + while(n == 0); + c = (cp-len) - first; + } + else +#endif /* SHOPT_MULTIBYTE */ + { + while((n=state[*(unsigned char*)cp++])==0); + c = (cp-1) - first; + } switch(n) { case S_ESC: --- src/lib/libcmd/Mamfile +++ src/lib/libcmd/Mamfile 2006-04-06 16:08:42.000000000 +0000 @@ -444,7 +444,7 @@ prev cat.c meta cat.o %.c>%.o cat.c cat prev cat.c -exec - ${CC} ${mam_cc_FLAGS} ${CCFLAGS} -I. -I${PACKAGE_ast_INCLUDE} -DERROR_CATALOG=\""libcmd"\" -DUSAGE_LICENSE=\""[-author?Glenn Fowler <gsf at research.att.com>][-author?David Korn <dgk at research.att.com>][-copyright?Copyright (c) 1992-2006 AT&T Knowledge Ventures][-license?http://www.opensource.org/licenses/cpl1.0.txt][--catalog?libcmd]"\" -D_PACKAGE_ast -D_BLD_cmd -c cat.c +exec - ${CC} ${mam_cc_FLAGS} ${CCFLAGS} -I. 
-I${PACKAGE_ast_INCLUDE} -DERROR_CATALOG=\""libcmd"\" -DUSAGE_LICENSE=\""[-author?Glenn Fowler <gsf at research.att.com>][-author?David Korn <dgk at research.att.com>][-copyright?Copyright (c) 1992-2006 AT&T Knowledge Ventures][-license?http://www.opensource.org/licenses/cpl1.0.txt][--catalog?libcmd]"\" -D_PACKAGE_ast -D_BLD_cmd -DSHOPT_MULTIBYTE -c cat.c done cat.o generated make chgrp.o prev chgrp.c --- src/lib/libcmd/cat.c +++ src/lib/libcmd/cat.c 2006-04-06 16:09:45.000000000 +0000 @@ -133,8 +133,39 @@ while (endbuff) { cpold = cp; - /* skip over ASCII characters */ + /* skip over ASCII and multi byte characters */ +#if SHOPT_MULTIBYTE + if(mbwide()) + { + do + { + ssize_t len; + switch((len = mbsize(cp))) + { + case -1: /* bogus multiByte char - parse as bytes? */ + case 0: /* NULL byte */ + case 1: + n = states[*cp++]; + break; + default: + /* + * None of the state tables contain + * entries for multibyte characters, + * however, they should be treated + * the same as any other alph + * character. Therefore, we'll use + * the state of the 'a' character. + */ + cp += len; + n = states['a']; + } + } + while(n == 0); + } + else +#endif /* SHOPT_MULTIBYTE */ while ((n = states[*cp++]) == 0); + if (n==T_ENDBUF) { if (cp>endbuff)
