Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
> > Does "document_charset" come from the charset defined within the web > > page, e.g., a meta element? document_charset comes from 1. HTTP header: Content-Type: text/html; charset=UTF-8 if header does not contain charset, 2. HTML: meta element, or if charset is not defined in HTML, 3. BOM (Byte Order Mark) for local file or cache 4. lynx -assume_local_charset= for local file 5. lynx.cfg ASSUME_LOCAL_CHARSET for local file 6. 'Assumed document character set' in Options Menu 7. lynx -assume_charset= 8. lynx.cfg ASSUME_CHARSET 9. auto detect EUC-JP, Shift_JIS (no UTF-8 support) by USE_TH_JP_AUTO_DETECT (userdefs.h) on display_charset=EUC-JP or Shift_JIS * on display_charset=UTF-8 and HTTP: 1,2, 6,7,8 * on display_charset=UTF-8 and local file: 2, 3,4,5, 6,7,8 * on display_charset=EUC-JP or Shift_JIS and HTTP: 1,2, 9 * on display_charset=EUC-JP or Shift_JIS and local file: 2, 3, 9
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
On 27/03/2022 15:07, Henry wrote: Does "document_charset" come from the charset defined within the web page, e.g., a meta element? It should come from the actual HTTP headers, although the meta element should not conflict and I'm not sure what the current policy is if it does conflict.
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
Thank you very much. (It is no wonder why I could not write a patch!) Does "document_charset" come from the charset defined within the web page, e.g., a meta element? Henry 2022年3月26日(土) 13:16 KIHARA Hideto : > > > Does Lynx still add a space wherever there is a line break in the > > text? > > Yes. My Japanese line wrap patch does not change this behavior. > > > is rendered as JJ[space]JJ[space]JJ, > > but I hope for JJ, i.e., continuous text without spaces. > > I wrote an additional patch to avoid adding spaces > when joining lines after Japanese characters. > > But the patch has some issues > if display_charset is not UTF-8 (EUC-JP or Shift_JIS): > (1) Space is still added for some Kanji characters > if document_charset is Shift_JIS. > (2) Space is still added for all Kanji characters > if document_charset is ISO-2022-JP. > > display_charset | document_charset | > | UTF-8 | EUC-JP | Shift_JIS | ISO-2022-JP | > |---||---|-| > UTF-8 | Good | Good | Good|Good | > EUC-JP | Good | Good | (1) |(2) | > Shift_JIS | (not tested. probably same as EUC-JP display_charset) >
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
> I wrote an additional patch to avoid adding spaces > when joining lines after Japanese characters. Attached test html files are converted to text/plain. Original test html files are here: http://www1.interq.or.jp/~deton/lynx-jajoinspaces/
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
> Does Lynx still add a space wherever there is a line break in the > text? Yes. My Japanese line wrap patch does not change this behavior. > is rendered as JJ[space]JJ[space]JJ, > but I hope for JJ, i.e., continuous text without spaces. I wrote an additional patch to avoid adding spaces when joining lines after Japanese characters. But the patch has some issues if display_charset is not UTF-8 (EUC-JP or Shift_JIS): (1) Space is still added for some Kanji characters if document_charset is Shift_JIS. (2) Space is still added for all Kanji characters if document_charset is ISO-20220-JP. display_charset | document_charset | | UTF-8 | EUC-JP | Shift_JIS | ISO-2022-JP | |---||---|-| UTF-8 | Good | Good | Good|Good | EUC-JP | Good | Good | (1) |(2) | Shift_JIS | (not tested. probably same as EUC-JP display_charset) diff --git a/src/GridText.c b/src/GridText.c index 04e9a4a..d9a1665 100644 --- a/src/GridText.c +++ b/src/GridText.c @@ -453,7 +453,11 @@ struct _HText { HTList *hidden_links; /* Content-less links ... */ int hiddenlinkflag; /* ... and how to treat them */ BOOL no_cache; /* Always refresh? */ +#ifdef EXP_JAPANESE_SPACES +char LastChars[7]; /* utf-8 buffer */ +#else char LastChar; /* For absorbing white space */ +#endif /* For Internal use: */ HTStyle *style; /* Current style */ @@ -1134,7 +1138,11 @@ HText *HText_new(HTParentAnchor *anchor) anchor->post_data) ? 
YES : NO); +#ifdef EXP_JAPANESE_SPACES +memset(self->LastChars, 0, sizeof(self->LastChars)); +#else self->LastChar = '\0'; +#endif #ifndef USE_PRETTYSRC if (HTOutputFormat == WWW_SOURCE) @@ -2867,7 +2875,7 @@ static void split_line(HText *text, unsigned split) #ifdef EXP_WCWIDTH_SUPPORT utfxtracells_on_this_line = 0; #endif -text->LastChar = ' '; +HText_setLastChar(text, ' '); #ifdef DEBUG_APPCH CTRACE((tfp, "GridText: split_line(%p,%d) called\n", text, split)); @@ -4648,7 +4656,20 @@ void HText_setLastChar(HText *text, int ch) if (!text) return; +#ifdef EXP_JAPANESE_SPACES +if (IS_UTF_EXTRA(ch) && IS_UTF_FIRST(text->LastChars[0])) { + int i; + for (i = 1; text->LastChars[i] != '\0' && i < sizeof(text->LastChars) - 1; i++) + ; + text->LastChars[i] = (char) ch; + text->LastChars[i + 1] = '\0'; + return; +} +memset(text->LastChars, 0, sizeof(text->LastChars)); +text->LastChars[0] = (char) ch; +#else text->LastChar = (char) ch; +#endif } /* Get LastChar element in the text object. @@ -4659,8 +4680,37 @@ char HText_getLastChar(HText *text) if (!text) return ('\0'); +#ifdef EXP_JAPANESE_SPACES +if (IS_UTF_FIRST(text->LastChars[0])) { + int i; + for (i = 1; text->LastChars[i] != '\0' && i < sizeof(text->LastChars); i++) + ; + return ((char) text->LastChars[i - 1]); +} +return ((char) text->LastChars[0]); +#else return ((char) text->LastChar); +#endif +} + +#ifdef EXP_JAPANESE_SPACES +BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text) +{ +if (!text) + return YES; + +if (IS_UTF_FIRST(text->LastChars[0]) && isUTF8CJChar(text->LastChars)) + return NO; +if ((HTCJK == CHINESE || HTCJK == JAPANESE) && is8bits(text->LastChars[0])) { + /* TODO: support 2nd byte of some SJIS kanji (!is8bits && IS_SJIS_LO) */ + return NO; +} +if (text->LastChars[0] != ' ') + return YES; +return NO; } +#endif + /* Simple table handling - private * --- @@ -5204,7 +5254,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position) && (text->source ? 
!psrcview_no_anchor_numbering : 1) #endif && links_are_numbered()) { - char saved_lastchar = text->LastChar; + char saved_lastchar = HText_getLastChar(text); int saved_linenum = text->Lines; HTAnchor *link_dest; char *link_text; @@ -5222,7 +5272,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position) HText_appendText(text, marker); } if (saved_linenum && text->Lines && saved_lastchar != ' ') - text->LastChar = ']'; /* if marker not after space caused split */ + HText_setLastChar(text, ']'); /* if marker not after space caused split */ if (save_position) { a->line_num = text->Lines; a->line_pos = (short) text->last_line->size; @@ -14973,6 +15023,14 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short { /* Can split after almost any CJ char (Korean uses space) */ /* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */ +if (isUTF8CJChar(s)) + text->permissible_split = pos; +} +#endif /* EXP_WCWIDTH_SUPPORT */ + +#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES) +BOOL isUTF8CJChar(const char *s) +{ UCode_t u = UCGetUniFromUtf8String(&s); if (u >= 0x4e00 && u
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
Henry dixit: >is rendered as JJ[space]JJ[space]JJ, You probably mean JJ[newline][space]JJ[…], right? >but I hope for JJ, i.e., continuous text without spaces. Try the -nomargins option. Good luck, //mirabilos -- Gestern Nacht ist mein IRC-Netzwerk explodiert. Ich hatte nicht damit gerechnet, darum bin ich blutverschmiert… wer konnte ahnen, daß SIE so reagier’n… gestern Nacht ist mein IRC-Netzwerk explodiert~~~ (as of 2021-06-15 The MirOS Project temporarily reconvenes on OFTC)
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
Yes, thank you very much indeed! Does Lynx still add a space wherever there is a line break in the text? Because of my visual handicap I usually compose web pages with lines that are 30 or so Japanese characters in length. At every line break, Lynx used to (still does?) add a space. For example (where J is a Japanese character): JJ JJ JJ is rendered as JJ[space]JJ[space]JJ, but I hope for JJ, i.e., continuous text without spaces. I tried to write a patch maybe 20 years ago, but I don't know enough C to do the job. Regards, 2022年3月20日(日) 5:17 Thomas Dickey : > > On Sat, Mar 19, 2022 at 03:33:08PM +0900, KIHARA Hideto wrote: > > Attached patch improves Japanese line wrap > > for UTF-8 display character set. > > thanks :-) > > -- > Thomas E. Dickey > https://invisible-island.net > ftp://ftp.invisible-island.net
Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal
On Sat, Mar 19, 2022 at 03:33:08PM +0900, KIHARA Hideto wrote: > Attached patch improves Japanese line wrap > for UTF-8 display character set. thanks :-) -- Thomas E. Dickey https://invisible-island.net ftp://ftp.invisible-island.net signature.asc Description: PGP signature
[Lynx-dev] Japanese line wrap patch for UTF-8 terminal
Attached patch improves Japanese line wrap for UTF-8 display character set. * Issue: Lynx sometimes breaks lines near the beginning of a line for Japanese texts and UTF-8 display character set. These line breaks hinder smooth reading. * Example: Rails 7(Japanese_text_with_no_spaces)... .. is displayed as: Rails 7(Japanese_text_with_no_spaces). expected result is: Rails 7(Japanese_text_with_no_spaces)... .. screen captures: http://www1.interq.or.jp/~deton/lynx-jawrapline/ * Cause: Lynx breaks lines at space, but Japanese texts usually have no spaces. In Japanese, line breaks can usually occur before and after almost any Japanese characters, not just spaces. * Patch: This patch permits line breaks after any Japanese character. (enabled by --enable-wcwidth-support configuration and only called on last byte of multibyte UTF-8 sequence) Note that lynx already has similar code for EUC-JP display character set. --- src/GridText.c.orig 2021-12-29 15:28:45.256049180 +0900 +++ src/GridText.c 2022-02-19 16:56:57.749568192 +0900 @@ -605,6 +605,7 @@ static int utfxtra_on_this_line = 0;/* #ifdef EXP_WCWIDTH_SUPPORT static int utfxtracells_on_this_line = 0; /* num of UTF-8 extra cells in line */ static int utfextracells(const char *s); +static void permit_split_after_CJchar(HText *text, const char *s, unsigned short pos); #endif #ifdef WIDEC_CURSES # ifdef EXP_WCWIDTH_SUPPORT/* TODO: support for !WIDEC_CURSES */ @@ -4165,8 +4166,10 @@ void HText_appendCharacter(HText *text, utff--; utf_xlen = UTF_XLEN(line->data[utff]); - if (line->size - utff == utf_xlen + 1) /* have last byte */ + if (line->size - utff == utf_xlen + 1) { /* have last byte */ utfxtracells_on_this_line += utfextracells(&(line->data[utff])); + permit_split_after_CJchar(text, &(line->data[utff]), line->size); + } } #endif return; @@ -14965,4 +14968,19 @@ static int utfextracells(const char *s) } return result; } + +static void permit_split_after_CJchar(HText *text, const char *s, unsigned short pos) +{ +/* Can 
split after almost any CJ char (Korean uses space) */ +/* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */ +UCode_t u = UCGetUniFromUtf8String(&s); +if (u >= 0x4e00 && u <= 0x9fff || /* CJK Unified Ideographs */ + u >= 0x3000 && u <= 0x30ff || /* CJK Symbols and Punctuation, Hiragana, Katakana */ + u >= 0xff00 && u <= 0xffef || /* Halfwidth and Fullwidth Forms. Fullwidth ?! are often used */ + /* rare characters */ + u >= 0x3400 && u <= 0x4dbf || /* CJK Unified Ideographs Extension A */ + u >= 0xf900 && u <= 0xfaff || /* CJK Compatibility Ideographs */ + u >= 0x20000 && u <= 0x3ffff) /* {Supplementary,Tertiary} Ideographic Plane */ + text->permissible_split = pos; +} #endif