Re: ReducedHTMLParser issues

Martin Marinschek Tue, 01 Nov 2005 02:52:20 -0800

Simon,

I don't seem to be able to apply your patch again - I get the error
'unknown line type was found in line 12'.


Can you create the patch again and attach it to our old JIRA issue? I have
reopened it for this purpose.

regards,

Martin

On 11/1/05, Simon Kitching <[EMAIL PROTECTED]> wrote:
> Martin Marinschek wrote:
> > Don't stress yourself - it's just the nightly build, so not too big of a 
> > problem.
>
> Thanks, but it's hopefully done anyway.
>
> changes:
>   * Handle DOCTYPE and Processing Instruction commands in input HTML
>   * Track line# of input for error messages
>   * Remove some debugging printlns
>
> I can also provide a patch soon to format the code to the MyFaces
> convention rather than the Sun convention if you wish. Sorry, my Eclipse
> is set up to format stuff that way automatically and I forgot to
> reformat before posting.
>
> Regards,
>
> Simon
>
>
> Index: ReducedHTMLParser.java
> ===================================================================
> --- ReducedHTMLParser.java      (revision 329922)
> +++ ReducedHTMLParser.java      (working copy)
> @@ -49,6 +49,7 @@
>      private static final int STATE_IN_TAG = 2;
>
>      private int offset;
> +    private int lineNumber;
>      private CharSequence seq;
>      private CallbackListener listener;
>
> @@ -75,15 +76,32 @@
>          return offset >= seq.length();
>      }
>
> +    int getCurrentLineNumber() {
> +        return lineNumber;
> +    }
> +
>      /**
>       * Advance the current parse position over any whitespace characters.
>       */
>      void consumeWhitespace() {
> +        boolean crSeen = false;
> +
>          while (offset < seq.length()) {
>              char c = seq.charAt(offset);
>              if (!Character.isWhitespace(c)) {
>                  break;
>              }
> +
> +            // Track line number for error messages.
> +            if (c == '\r') {
> +                ++lineNumber;
> +                crSeen = true;
> +            } else if ((c == '\n') && !crSeen) {
> +                ++lineNumber;
> +            } else {
> +                crSeen = false;
> +            }
> +
>              ++offset;
>          }
>      }
> @@ -193,6 +211,10 @@
>          // TODO: should we consider a string to be terminated by a newline?
>          // that would help with runaway strings but I think that multiline
>          // strings *are* allowed...
> +        //
> +        // TODO: detect newlines within strings and increment lineNumber.
> +        // This isn't so important, though; they aren't common and being a
> +        // few lines out in an error message isn't serious either.
>          StringBuffer stringBuf = new StringBuffer();
>          boolean escaping = false;
>          while (!isFinished()) {
> @@ -248,6 +270,8 @@
>       * @param s is a set of characters that should not be discarded.
>       */
>      void consumeExcept(String s) {
> +        boolean crSeen = false;
> +
>          while (offset < seq.length()) {
>              char c = seq.charAt(offset);
>              if (s.indexOf(c) >= 0) {
> @@ -255,6 +279,16 @@
>                  return;
>              }
>
> +            // Track line number for error messages.
> +            if (c == '\r') {
> +                ++lineNumber;
> +                crSeen = true;
> +            } else if ((c == '\n') && !crSeen) {
> +                ++lineNumber;
> +            } else {
> +                crSeen = false;
> +            }
> +
>              ++offset;
>          }
>      }
> @@ -269,6 +303,7 @@
>          int currentTagStart = -1;
>          String currentTagName = null;
>
> +        lineNumber = 1;
>          offset = 0;
>          while (offset < seq.length())
>          {
> @@ -282,6 +317,10 @@
>                  if (consumeMatch("<!--")) {
>                      // VERIFY: can "< ! --" start a comment?
>                      state = STATE_IN_COMMENT;
> +                } else if (consumeMatch("<!")) {
> +                    // xml processing instruction or <!DOCTYPE> tag
> +                    // we don't need to actually do anything here
> +                    log.debug("PI found at line " + getCurrentLineNumber());
>                  } else if (consumeMatch("</")) {
>                      // VERIFY: is "< / foo >" a valid end-tag?
>
> @@ -306,10 +345,17 @@
>                      // the current info until the end of this tag.
>                      currentTagStart = offset - 1;
>                      currentTagName = consumeElementName();
> -                    state = STATE_IN_TAG;
> +                    if (currentTagName == null) {
> +                        log.warn("Invalid HTML; bare lessthan sign found at 
> line "
> +                            + getCurrentLineNumber());
> +                        // remain in STATE_READY; this isn't really the 
> start of
> +                        // an xml element.
> +                    } else {
> +                        state = STATE_IN_TAG;
> +                    }
>                  } else {
>                      // should never get here
> -                    throw new Error("Internal error");
> +                    throw new Error("Internal error at line " + 
> getCurrentLineNumber());
>                  }
>
>                  continue;
> @@ -378,7 +424,6 @@
>       */
>      void openedTag(int startOffset, int endOffset, String tagName) {
>          log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" 
> + tagName);
> -        System.out.println("Found open tag at " + startOffset + ":" + 
> endOffset + ":" + tagName);
>
>          if ("head".equalsIgnoreCase(tagName)) {
>              listener.openedStartTag(startOffset, HEAD_TAG);
> @@ -394,7 +439,6 @@
>
>      void closedTag(int startOffset, int endOffset, String tagName) {
>          log.debug("Found close tag at " + startOffset + ":" + endOffset + 
> ":" + tagName);
> -        System.out.println("Found close tag at " + startOffset + ":" + 
> endOffset + ":" + tagName);
>
>          if ("head".equalsIgnoreCase(tagName)) {
>              listener.openedEndTag(startOffset, HEAD_TAG);
>
>
> Index: ReducedHTMLParserTest.java
> ===================================================================
> --- ReducedHTMLParserTest.java  (revision 329925)
> +++ ReducedHTMLParserTest.java  (working copy)
> @@ -322,8 +322,19 @@
>          parser.consumeExcept("z");
>      }
>
> +    // test parsing completes when invalid tag found.
> +    public void testParseBadTag() {
> +        String s = "xxxx \n\n <# \n\n";
> +        CallbackListener listener = new ParseCallbackListener();
> +        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
> +
> +        parser.parse();
> +        assertTrue(parser.isFinished());
> +    }
> +
>      // test the full parse method
>      public void testParse() {
> +        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
>          String s1 = "<html><head>";
>          String s2 = "\n<!-- a comment --><title>foo</title>";
>          String s3 = "</head>";
> @@ -338,6 +349,7 @@
>          String s8 = "</body> </html>";
>
>          StringBuffer buf = new StringBuffer();
> +        buf.append(s0);
>          buf.append(s1);
>          buf.append(s2);
>          buf.append(s3);
> @@ -354,13 +366,13 @@
>
>          // check that listener has correctly computed the offset to the char 
> just
>          // before the </head> tag starts.
> -        int afterHeadPos = s1.length();
> +        int afterHeadPos = s0.length() + s1.length();
>          assertEquals("Pos after <head> tag ", afterHeadPos, 
> listener.headerInsertPosition);
>
> -        int beforeBodyPos = s1.length() + s2.length() + s3.length();
> +        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
>          assertEquals("Pos before <body> tag", beforeBodyPos, 
> listener.beforeBodyPosition);
>
> -        int afterBodyPos = s1.length() + s2.length() + s3.length() + 
> s4.length();
> +        int afterBodyPos = beforeBodyPos + s4.length();
>          assertEquals("Pos after <body> tag", afterBodyPos, 
> listener.bodyInsertPosition);
>      }
>  }
>
>
>


--

http://www.irian.at
Your JSF powerhouse -
JSF Trainings in English and German

Re: ReducedHTMLParser issues

Reply via email to