Hi, I have some JSP pages that are indexed locally and contain "<% ... %>" tags. I am not familiar with JavaCC. How can I filter these JSP parts out of the page? Could you add this feature as well? That would be great for me!
Regards, Stephan > -----Original Message----- > From: Daniel Calvo [mailto:[EMAIL PROTECTED]] > Sent: Friday, February 15, 2002 10:42 PM > To: Lucene Developers List > Subject: HTMLParser > > > Hi, > > I was playing with HTMLParser.jj and made some changes you > might be interested in. What I did was start handling <META> > tags (added > new methods: getAuthor, getKeywords and getMetadata and > changed getSummary to check if there's any metadata item with > name=="description"). I'm also filtering out any text inside > <STYLE>...</STYLE> (like <SCRIPT> is being handled). > I've performed some tests and I belive I didn't break anything ;-) > > The patch is as follows > > Best regards, > > --Daniel > > Index: HTMLParser.jj > =================================================================== > RCS file: > /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo /html/HTMLParser.jj,v > retrieving revision 1.1 > diff -u -r1.1 HTMLParser.jj > --- HTMLParser.jj 26 Jan 2002 15:01:31 -0000 1.1 > +++ HTMLParser.jj 15 Feb 2002 20:39:49 -0000 > @@ -66,6 +66,8 @@ > package org.apache.lucene.demo.html; > > import java.io.*; > +import java.util.Map; > +import java.util.HashMap; > > public class HTMLParser { > public static int SUMMARY_LENGTH = 200; > @@ -76,11 +78,13 @@ > boolean titleComplete = false; > boolean inTitle = false; > boolean inScript = false; > + boolean inStyle = false; > boolean afterTag = false; > boolean afterSpace = false; > String eol = System.getProperty("line.separator"); > PipedReader pipeIn = null; > PipedWriter pipeOut; > + HashMap metadata = new HashMap(7); > > public HTMLParser(File file) throws FileNotFoundException { > this(new FileInputStream(file)); > @@ -109,15 +113,60 @@ > wait(10); > } > } > - if (summary.length() > SUMMARY_LENGTH) > - summary.setLength(SUMMARY_LENGTH); > + // look in metadata > + String description = (String) metadata.get("description"); > + if (description != null) > + return description; > + else { > + if 
(summary.length() > SUMMARY_LENGTH) > + summary.setLength(SUMMARY_LENGTH); > + > + String sum = summary.toString().trim(); > + String tit = getTitle(); > + if (sum.startsWith(tit)) > + return sum.substring(tit.length()); > + else > + return sum; > + } > + } > + > + public String getAuthor() throws IOException, > InterruptedException { > + if (pipeIn == null) > + getReader(); // spawn parsing thread > + while (true) { > + synchronized(this) { > + if (summary.length() > 0) // assume that all metadata > + break; // has already been collected > + wait(10); > + } > + } > + return (String)metadata.get("author"); > + } > + > + public String getKeywords() throws IOException, > InterruptedException { > + if (pipeIn == null) > + getReader(); // spawn parsing thread > + while (true) { > + synchronized(this) { > + if (summary.length() > 0) // assume that all metadata > + break; // has already been collected > + wait(10); > + } > + } > + return (String)metadata.get("keywords"); > + } > > - String sum = summary.toString().trim(); > - String tit = getTitle(); > - if (sum.startsWith(tit)) > - return sum.substring(tit.length()); > - else > - return sum; > + public Map getMetadata() throws IOException, InterruptedException { > + if (pipeIn == null) > + getReader(); // spawn parsing thread > + while (true) { > + synchronized(this) { > + if (summary.length() > 0) // assume that all metadata > + break; // has already been collected > + wait(10); > + } > + } > + return metadata; > } > > public Reader getReader() throws IOException { > @@ -144,7 +193,7 @@ > } > > void addText(String text) throws IOException { > - if (inScript) > + if (inScript || inStyle) > return; > if (inTitle) > title.append(text); > @@ -165,7 +214,7 @@ > } > > void addSpace() throws IOException { > - if (inScript) > + if (inScript || inStyle) > return; > if (!afterSpace) { > if (inTitle) > @@ -216,23 +265,38 @@ > { > Token t1, t2; > boolean inImg = false; > + boolean inMeta = false; > + String name = null; > + String 
content = null; > } > { > t1=<TagName> { > - inTitle = t1.image.equalsIgnoreCase("<title"); // keep > track if in <TITLE> > - inImg = t1.image.equalsIgnoreCase("<img"); // > keep track if in <IMG> > - if (inScript) { // keep track > if in <SCRIPT> > + inTitle = t1.image.equalsIgnoreCase("<title"); // > keep track if in <TITLE> > + inImg = t1.image.equalsIgnoreCase("<img"); // > keep track if in <IMG> > + inMeta = t1.image.equalsIgnoreCase("<meta"); // > keep track if in <META> > + if (inScript) { // > keep track if in <SCRIPT> > inScript = !t1.image.equalsIgnoreCase("</script"); > } else { > inScript = t1.image.equalsIgnoreCase("<script"); > } > + if (inStyle) { // > keep track if in <STYLE> > + inStyle = !t1.image.equalsIgnoreCase("</style"); > + } else { > + inStyle = t1.image.equalsIgnoreCase("<style"); > + } > } > (t1=<ArgName> > (<ArgEquals> > - (t2=ArgValue() // save ALT > text in IMG tag > + (t2=ArgValue() > { > if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null) > - addText("[" + t2.image + "]"); > + addText("[" + t2.image + "]"); // save ALT > text in IMG tag > + if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null) > + name = t2.image.toLowerCase(); // save name > in META tag > + if (inMeta && t1.image.equalsIgnoreCase("content") && > t2 != null) > + content = t2.image; // save > content in META tag > + if (inMeta && name != null && content != null) > + metadata.put(name, content); // save metadata > } > )? > )? > > > -- > To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]> -- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>
