Roman reported that the HTML parser eats too much whitespace: a sequence like
<b> text </b> must be parsed preserving at most one leading and one trailing
space (currently all of it is discarded). As this is a matter of interpretation
and not a programming bug, this patch may break existing HTML parser tests that
require the whitespace surrounding a word to be discarded completely. These
tests should be corrected later.

2006-09-03  Audrius Meskauskas  <[EMAIL PROTECTED]>

   * gnu/javax/swing/text/html/parser/HTML_401F.java (defineElements):
   Disallow H1 - H6 in the paragraphs.
   * gnu/javax/swing/text/html/parser/support/textPreProcessor.java
   (preprocess): Leave at most one leading and/or trailing space.
   * javax/swing/text/html/HTMLDocument.java (HTMLReader.handleText):
   Do not add any text after closing the HTML tag.

### Eclipse Workspace Patch 1.0
#P classpath
Index: javax/swing/text/html/HTMLDocument.java
===================================================================
RCS file: /sources/classpath/classpath/javax/swing/text/html/HTMLDocument.java,v
retrieving revision 1.40
diff -u -r1.40 HTMLDocument.java
--- javax/swing/text/html/HTMLDocument.java	25 Aug 2006 11:40:44 -0000	1.40
+++ javax/swing/text/html/HTMLDocument.java	3 Sep 2006 20:31:04 -0000
@@ -1181,7 +1181,7 @@
      */
     public void handleText(char[] data, int pos)
     {
-      if (data != null && data.length > 0)
+      if (shouldInsert() && data != null && data.length > 0)
         addContent(data, 0, data.length);
     }
     
Index: gnu/javax/swing/text/html/parser/HTML_401F.java
===================================================================
RCS file: /sources/classpath/classpath/gnu/javax/swing/text/html/parser/HTML_401F.java,v
retrieving revision 1.4
diff -u -r1.4 HTML_401F.java
--- gnu/javax/swing/text/html/parser/HTML_401F.java	16 Jul 2006 15:03:08 -0000	1.4
+++ gnu/javax/swing/text/html/parser/HTML_401F.java	3 Sep 2006 20:31:01 -0000
@@ -2445,8 +2445,10 @@
         attr(VALUE, null, null, 0, IMPLIED)
       }
     );
+      
+      // Headers in the paragraph are not allowed.
       defElement(P, 0, false, true, new ContentModel( 0,
-       new noTagModel(P), null),
+       new noTagModel(new String[] { P, H1, H2, H3, H4, H5, H6 }), null),
       NONE
       ,
       new String[] {
Index: gnu/javax/swing/text/html/parser/support/textPreProcessor.java
===================================================================
RCS file: /sources/classpath/classpath/gnu/javax/swing/text/html/parser/support/textPreProcessor.java,v
retrieving revision 1.2
diff -u -r1.2 textPreProcessor.java
--- gnu/javax/swing/text/html/parser/support/textPreProcessor.java	2 Jul 2005 20:32:15 -0000	1.2
+++ gnu/javax/swing/text/html/parser/support/textPreProcessor.java	3 Sep 2006 20:31:01 -0000
@@ -42,17 +42,17 @@
 
 /**
  * Pre - processes text in text parts of the html document.
- * Not thread - safe.
+ *
  * @author Audrius Meskauskas, Lithuania ([EMAIL PROTECTED])
  */
 public class textPreProcessor
 {
   /**
-   * Pre - process non-preformatted text.
-   * \t, \r and \n mutate into spaces, then multiple spaces mutate
-   * into single one, all whitespace around tags is consumed.
-   * The content of the passed buffer is destroyed.
-   * @param text A text to pre-process.
+   * Pre - process non-preformatted text. \t, \r and \n mutate into spaces, then
+   * multiple spaces mutate into single one; at most one leading and one
+   * trailing space is kept. The content of the passed buffer is destroyed.
+   * 
+   * @param a_text A text to pre-process.
    */
   public char[] preprocess(StringBuffer a_text)
   {
@@ -64,17 +64,22 @@
     int a = 0;
     int b = text.length - 1;
 
+    // Remove leading/trailing whitespace, leaving at most one character
     try
       {
-        while (Constants.bWHITESPACE.get(text [ a ]))
+        while (Constants.bWHITESPACE.get(text[a])
+               && Constants.bWHITESPACE.get(text[a + 1]))
           a++;
-        while (Constants.bWHITESPACE.get(text [ b ]))
+
+        while (b > a && Constants.bWHITESPACE.get(text[b])
+               && Constants.bWHITESPACE.get(text[b - 1]))
           b--;
       }
     catch (ArrayIndexOutOfBoundsException sx)
       {
-        // A text fragment, consisting from line breaks only.
-        return null;
+        // A text fragment, consisting from spaces and line breaks only,
+        // mutates into single space.
+        return new char[] { ' ' };
       }
 
     a_text.setLength(0);
@@ -83,10 +88,9 @@
     boolean spaceNow;
     char c;
 
-    chars: 
-    for (int i = a; i <= b; i++)
+    chars: for (int i = a; i <= b; i++)
       {
-        c = text [ i ];
+        c = text[i];
         spaceNow = Constants.bWHITESPACE.get(c);
         if (spacesWere && spaceNow)
           continue chars;

Reply via email to