Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-31 Thread KIHARA Hideto
> > Does "document_charset" come from charset defined within <head> of web
> > page, e.g., <meta charset="UTF-8">?

document_charset comes from
  1. HTTP header: Content-Type: text/html; charset=UTF-8

if header does not contain charset,
  2. HTML: <meta charset="UTF-8">
 or <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

if charset is not defined in HTML,
  3. BOM (Byte Order Mark) for local file or cache
  4. lynx -assume_local_charset= for local file
  5. lynx.cfg ASSUME_LOCAL_CHARSET for local file

  6. 'Assumed document character set' in Options Menu
  7. lynx -assume_charset=
  8. lynx.cfg ASSUME_CHARSET

  9. auto detect EUC-JP, Shift_JIS (no UTF-8 support)
 by USE_TH_JP_AUTO_DETECT (userdefs.h) 
 on display_charset=EUC-JP or Shift_JIS

* on display_charset=UTF-8 and HTTP: 1,2, 6,7,8
* on display_charset=UTF-8 and local file: 2, 3,4,5, 6,7,8
* on display_charset=EUC-JP or Shift_JIS and HTTP: 1,2, 9
* on display_charset=EUC-JP or Shift_JIS and local file: 2, 3, 9




Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-27 Thread David Woolley

On 27/03/2022 15:07, Henry wrote:

Does "document_charset" come from charset defined within <head> of web
page, e.g., <meta charset="UTF-8">?


It should come from the actual HTTP headers, although the meta element 
should not conflict and I'm not sure what the current policy is if it 
does conflict.






Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-27 Thread Henry
Thank you very much.  (It is no wonder why I could not write patch!)

Does "document_charset" come from charset defined within <head> of web
page, e.g., <meta charset="UTF-8">?

Henry

2022年3月26日(土) 13:16 KIHARA Hideto :
>
> > Does Lynx still add a space wherever there is a line break in the
> > text?
>
> Yes. My Japanese line wrap patch does not change this behavior.
>
> > is rendered as JJ[space]JJ[space]JJ,
> > but I hope for JJJJJJ, i.e., continuous text without spaces.
>
> I wrote an additional patch to avoid adding spaces
> when joining lines after Japanese characters.
>
> But the patch has some issues
> if display_charset is not UTF-8 (EUC-JP or Shift_JIS):
>   (1) Space is still added for some Kanji characters
>   if document_charset is Shift_JIS.
>   (2) Space is still added for all Kanji characters
>   if document_charset is ISO-2022-JP.
>
> display_charset |           document_charset               |
>                 | UTF-8 | EUC-JP | Shift_JIS | ISO-2022-JP |
> ----------------|-------|--------|-----------|-------------|
> UTF-8           | Good  |  Good  |   Good    |    Good     |
> EUC-JP          | Good  |  Good  |   (1)     |    (2)      |
> Shift_JIS       | (not tested. probably same as EUC-JP display_charset)
>



Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-25 Thread KIHARA Hideto
> I wrote an additional patch to avoid adding spaces
> when joining lines after Japanese characters.

Attached test html files are converted to text/plain.
Original test html files are here:
http://www1.interq.or.jp/~deton/lynx-jajoinspaces/




Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-25 Thread KIHARA Hideto
> Does Lynx still add a space wherever there is a line break in the
> text?

Yes. My Japanese line wrap patch does not change this behavior.

> is rendered as JJ[space]JJ[space]JJ,
> but I hope for JJJJJJ, i.e., continuous text without spaces.

I wrote an additional patch to avoid adding spaces
when joining lines after Japanese characters.

But the patch has some issues
if display_charset is not UTF-8 (EUC-JP or Shift_JIS):
  (1) Space is still added for some Kanji characters
  if document_charset is Shift_JIS.
  (2) Space is still added for all Kanji characters
  if document_charset is ISO-2022-JP.

display_charset |           document_charset               |
                | UTF-8 | EUC-JP | Shift_JIS | ISO-2022-JP |
----------------|-------|--------|-----------|-------------|
UTF-8           | Good  |  Good  |   Good    |    Good     |
EUC-JP          | Good  |  Good  |   (1)     |    (2)      |
Shift_JIS       | (not tested. probably same as EUC-JP display_charset)

diff --git a/src/GridText.c b/src/GridText.c
index 04e9a4a..d9a1665 100644
--- a/src/GridText.c
+++ b/src/GridText.c
@@ -453,7 +453,11 @@ struct _HText {
 HTList *hidden_links;	/* Content-less links ... */
 int hiddenlinkflag;		/*  ... and how to treat them */
 BOOL no_cache;		/* Always refresh? */
+#ifdef EXP_JAPANESE_SPACES
+char LastChars[7];		/* utf-8 buffer */
+#else
 char LastChar;		/* For absorbing white space */
+#endif
 
 /* For Internal use: */
 HTStyle *style;		/* Current style */
@@ -1134,7 +1138,11 @@ HText *HText_new(HTParentAnchor *anchor)
  anchor->post_data)
 ? YES
 : NO);
+#ifdef EXP_JAPANESE_SPACES
+memset(self->LastChars, 0, sizeof(self->LastChars));
+#else
 self->LastChar = '\0';
+#endif
 
 #ifndef USE_PRETTYSRC
 if (HTOutputFormat == WWW_SOURCE)
@@ -2867,7 +2875,7 @@ static void split_line(HText *text, unsigned split)
 #ifdef EXP_WCWIDTH_SUPPORT
 utfxtracells_on_this_line = 0;
 #endif
-text->LastChar = ' ';
+HText_setLastChar(text, ' ');
 
 #ifdef DEBUG_APPCH
 CTRACE((tfp, "GridText: split_line(%p,%d) called\n", text, split));
@@ -4648,7 +4656,20 @@ void HText_setLastChar(HText *text, int ch)
 if (!text)
 	return;
 
+#ifdef EXP_JAPANESE_SPACES
+if (IS_UTF_EXTRA(ch) && IS_UTF_FIRST(text->LastChars[0])) {
+	int i;
+	for (i = 1; text->LastChars[i] != '\0' && i < sizeof(text->LastChars) - 1; i++)
+	;
+	text->LastChars[i] = (char) ch;
+	text->LastChars[i + 1] = '\0';
+	return;
+}
+memset(text->LastChars, 0, sizeof(text->LastChars));
+text->LastChars[0] = (char) ch;
+#else
 text->LastChar = (char) ch;
+#endif
 }
 
 /*	Get LastChar element in the text object.
@@ -4659,8 +4680,37 @@ char HText_getLastChar(HText *text)
 if (!text)
 	return ('\0');
 
+#ifdef EXP_JAPANESE_SPACES
+if (IS_UTF_FIRST(text->LastChars[0])) {
+	int i;
+	for (i = 1; text->LastChars[i] != '\0' && i < sizeof(text->LastChars); i++)
+	;
+	return ((char) text->LastChars[i - 1]);
+}
+return ((char) text->LastChars[0]);
+#else
 return ((char) text->LastChar);
+#endif
+}
+
+#ifdef EXP_JAPANESE_SPACES
+BOOL HText_checkLastChar_needSpaceOnJoinLines(HText *text)
+{
+if (!text)
+	return YES;
+
+if (IS_UTF_FIRST(text->LastChars[0]) && isUTF8CJChar(text->LastChars))
+	return NO;
+if ((HTCJK == CHINESE || HTCJK == JAPANESE) && is8bits(text->LastChars[0])) {
+	/* TODO: support 2nd byte of some SJIS kanji (!is8bits && IS_SJIS_LO) */
+	return NO;
+}
+if (text->LastChars[0] != ' ')
+	return YES;
+return NO;
 }
+#endif
+
 
 /*		Simple table handling - private
  *		---
@@ -5204,7 +5254,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
 	&& (text->source ? !psrcview_no_anchor_numbering : 1)
 #endif
 	&& links_are_numbered()) {
-	char saved_lastchar = text->LastChar;
+	char saved_lastchar = HText_getLastChar(text);
 	int saved_linenum = text->Lines;
 	HTAnchor *link_dest;
 	char *link_text;
@@ -5222,7 +5272,7 @@ static void add_link_number(HText *text, TextAnchor *a, int save_position)
 	HText_appendText(text, marker);
 	}
 	if (saved_linenum && text->Lines && saved_lastchar != ' ')
-	text->LastChar = ']';	/* if marker not after space caused split */
+	HText_setLastChar(text, ']');	/* if marker not after space caused split */
 	if (save_position) {
 	a->line_num = text->Lines;
 	a->line_pos = (short) text->last_line->size;
@@ -14973,6 +15023,14 @@ static void permit_split_after_CJchar(HText *text, const char *s, unsigned short
 {
 /* Can split after almost any CJ char (Korean uses space) */
 /* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */
+if (isUTF8CJChar(s))
+	text->permissible_split = pos;
+}
+#endif /* EXP_WCWIDTH_SUPPORT */
+
+#if defined(EXP_WCWIDTH_SUPPORT) || defined(EXP_JAPANESE_SPACES)
+BOOL isUTF8CJChar(const char *s)
+{
 UCode_t u = UCGetUniFromUtf8String(&s);
 if (u >= 0x4e00 && u

Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-21 Thread Thorsten Glaser
Henry dixit:

>is rendered as JJ[space]JJ[space]JJ,

You probably mean JJ[newline][space]JJ[…], right?

>but I hope for JJJJJJ, i.e., continuous text without spaces.

Try the -nomargins option.

Good luck,
//mirabilos
-- 
Gestern Nacht ist mein IRC-Netzwerk explodiert. Ich hatte nicht damit
gerechnet, darum bin ich blutverschmiert… wer konnte ahnen, daß SIE so
reagier’n… gestern Nacht ist mein IRC-Netzwerk explodiert~~~
(as of 2021-06-15 The MirOS Project temporarily reconvenes on OFTC)



Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-21 Thread Henry
Yes, thank you very much indeed!

Does Lynx still add a space wherever there is a line break in the
text?  Because of my visual handicap I usually compose web pages with
lines that are 30 or so Japanese characters in length.  At every line
break, Lynx used to (still does?) add a space.

For example (where J is Japanese character):
JJ
JJ
JJ
is rendered as JJ[space]JJ[space]JJ,
but I hope for JJJJJJ, i.e., continuous text without spaces.

I tried to write a patch maybe 20 years ago, but I don't know enough C
to do the job.

Regards,



2022年3月20日(日) 5:17 Thomas Dickey :
>
> On Sat, Mar 19, 2022 at 03:33:08PM +0900, KIHARA Hideto wrote:
> > Attached patch improves Japanese line wrap
> > for UTF-8 display character set.
>
> thanks :-)
>
> --
> Thomas E. Dickey 
> https://invisible-island.net
> ftp://ftp.invisible-island.net



Re: [Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-19 Thread Thomas Dickey
On Sat, Mar 19, 2022 at 03:33:08PM +0900, KIHARA Hideto wrote:
> Attached patch improves Japanese line wrap
> for UTF-8 display character set.

thanks :-)

-- 
Thomas E. Dickey 
https://invisible-island.net
ftp://ftp.invisible-island.net


signature.asc
Description: PGP signature


[Lynx-dev] Japanese line wrap patch for UTF-8 terminal

2022-03-18 Thread KIHARA Hideto
Attached patch improves Japanese line wrap
for UTF-8 display character set.

* Issue:
Lynx sometimes breaks lines near the beginning of a line
for Japanese texts and UTF-8 display character set.
These line breaks hinder smooth reading.

* Example:


Rails 7(Japanese_text_with_no_spaces)...
..


is displayed as:

Rails
7(Japanese_text_with_no_spaces).

expected result is:

Rails 7(Japanese_text_with_no_spaces)...
..

screen captures:
http://www1.interq.or.jp/~deton/lynx-jawrapline/

* Cause:
Lynx breaks lines at space, but Japanese texts usually have no spaces.

In Japanese, line breaks can usually occur
before and after almost any Japanese characters, not just spaces.

* Patch:
This patch permits line breaks after any Japanese character.
(enabled by --enable-wcwidth-support configuration
and only called on last byte of multibyte UTF-8 sequence)

Note that lynx already has similar code for EUC-JP display character set.

--- src/GridText.c.orig 2021-12-29 15:28:45.256049180 +0900
+++ src/GridText.c  2022-02-19 16:56:57.749568192 +0900
@@ -605,6 +605,7 @@ static int utfxtra_on_this_line = 0;/*
 #ifdef EXP_WCWIDTH_SUPPORT
 static int utfxtracells_on_this_line = 0;  /* num of UTF-8 extra cells in 
line */
 static int utfextracells(const char *s);
+static void permit_split_after_CJchar(HText *text, const char *s, unsigned 
short pos);
 #endif
 #ifdef WIDEC_CURSES
 # ifdef EXP_WCWIDTH_SUPPORT/* TODO: support for !WIDEC_CURSES */
@@ -4165,8 +4166,10 @@ void HText_appendCharacter(HText *text,
utff--;
utf_xlen = UTF_XLEN(line->data[utff]);
 
-   if (line->size - utff == utf_xlen + 1)  /* have last byte */
+   if (line->size - utff == utf_xlen + 1) { /* have last byte */
utfxtracells_on_this_line += 
utfextracells(&(line->data[utff]));
+   permit_split_after_CJchar(text, &(line->data[utff]), 
line->size);
+   }
}
 #endif
return;
@@ -14965,4 +14968,19 @@ static int utfextracells(const char *s)
 }
 return result;
 }
+
+static void permit_split_after_CJchar(HText *text, const char *s, unsigned 
short pos)
+{
+/* Can split after almost any CJ char (Korean uses space) */
+/* TODO: UAX#14 Unicode Line Breaking Algorithm (use ICU4C?) */
+UCode_t u = UCGetUniFromUtf8String(&s);
+if (u >= 0x4e00 && u <= 0x9fff || /* CJK Unified Ideographs */
+   u >= 0x3000 && u <= 0x30ff || /* CJK Symbols and Punctuation, Hiragana, 
Katakana */
+   u >= 0xff00 && u <= 0xffef || /* Halfwidth and Fullwidth Forms. 
Fullwidth ?! are often used */
+   /* rare characters */
+   u >= 0x3400 && u <= 0x4dbf || /* CJK Unified Ideographs Extension A */
+   u >= 0xf900 && u <= 0xfaff || /* CJK Compatibility Ideographs */
+   u >= 0x20000 && u <= 0x3ffff) /* {Supplementary,Tertiary} Ideographic 
Plane */
+   text->permissible_split = pos;
+}
 #endif