burton 2004/09/03 12:46:47 Modified: feedparser TODO build.xml feedparser/src/java/org/apache/commons/feedparser FeedFilter.java FeedParser.java Main.java RSSFeedParser.java feedparser/src/java/org/apache/commons/feedparser/locate AnchorParser.java ResourceExpander.java feedparser/src/java/org/apache/commons/feedparser/test TestFeedFilter.java TestFeedParser.java feedparser/src/java/org/apache/commons/feedparser/tools XMLCleanser.java XMLEncodingParser.java Log: don't use links if they are null Revision Changes Path 1.12 +1 -2 jakarta-commons-sandbox/feedparser/TODO Index: TODO =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/TODO,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- TODO 31 Aug 2004 21:01:37 -0000 1.11 +++ TODO 3 Sep 2004 19:46:47 -0000 1.12 @@ -1,4 +1,5 @@ +- BUG: what happens when I put a comment after a UTF-16 BOM?! - Support Base64 Atom values and the ability to enable them. @@ -6,9 +7,7 @@ - Do we support multiple content items in Atom? - - We do not support multipart/alternative in the feedparser. - - Do we support atom:summary at ALL?! I don't think so... 1.7 +1 -0 jakarta-commons-sandbox/feedparser/build.xml Index: build.xml =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- build.xml 2 Sep 2004 00:36:25 -0000 1.6 +++ build.xml 3 Sep 2004 19:46:47 -0000 1.7 @@ -121,6 +121,7 @@ <formatter type="plain" usefile="false"/> + <test name="org.apache.commons.feedparser.test.TestFeedFilter"/> <test name="org.apache.commons.feedparser.test.TestProbeLocator"/> <test name="org.apache.commons.feedparser.test.TestAtom"/> <test name="org.apache.commons.feedparser.test.TestFeedParserUTF8"/> 1.4 +37 -12 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java Index: FeedFilter.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- FeedFilter.java 2 Sep 2004 01:19:55 -0000 1.3 +++ FeedFilter.java 3 Sep 2004 19:46:47 -0000 1.4 @@ -32,40 +32,65 @@ private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" ); - public static byte[] parse( byte[] bytes ) { + public static byte[] parse( byte[] bytes ) + throws Exception { - String content = new String( bytes ); + return parse( bytes, "UTF-8" ); + + } + + public static byte[] parse( byte[] bytes, String encoding ) + throws Exception { + + String content = new String( bytes, encoding ); + + return parse( content, encoding ); + + } + + public static byte[] parse( String content, String encoding ) + throws Exception { //remove leading prolog... - content = doRemoveLeadingProlog( content ); + content = doRemoveLeadingProlog( content, encoding ); content = doDecodeEntities( content ); - return content.getBytes(); - - } + return content.getBytes( encoding ); + } + /** * Removing prolog whitespace, comments, and other garbage from the * beginning of a feed. * * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a> */ - private static String doRemoveLeadingProlog( String content ) { + private static String doRemoveLeadingProlog( String content, String encoding ) { + + //if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because + //it triggers a UTF-16 parse. + if ( "UTF-16".equals( encoding ) || + "UTF-32".equals( encoding ) ) + return content; + //move to the beginning of the first element or comment. When this is a //processing instruction we will move to that int begin = content.indexOf( "<" ); - if ( begin > 0 ) + if ( begin > 0 ) { content = content.substring( begin, content.length() ); + } - //now skip to the XML processing instruction when necessary. + //now skip to the XML processing instruction when necessary. This is + //used to remove comments prior to <?xml which are not allowed. begin = content.indexOf( "<?xml" ); - if ( begin > 0 ) + if ( begin > 0 ) { content = content.substring( begin, content.length() ); + } return content; @@ -107,7 +132,7 @@ } - public static void main( String[] args ) { + public static void main( String[] args ) throws Exception { byte[] b = parse( "hello é world".getBytes() ); 1.10 +47 -23 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java Index: FeedParser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- FeedParser.java 31 Aug 2004 21:00:32 -0000 1.9 +++ FeedParser.java 3 Sep 2004 19:46:47 -0000 1.10 @@ -32,6 +32,8 @@ import org.jaxen.jdom.*; +import org.apache.log4j.Logger; + /** * This FeedParser implementation is based on JDOM and Jaxen and is based around * XPath and JDOM iteration. While the implementation is straight forward it @@ -43,6 +45,8 @@ */ public class FeedParser { + private static Logger log = Logger.getLogger( FeedParser.class ); + /** * Parse this feed. * @@ -56,6 +60,8 @@ try { + is = getCorrectInputStream( is ); + // Need to massage our XML support forfor UTF-8 to prevent the // dreaded "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in // some default feeds. This was tested a great deal under @@ -64,31 +70,12 @@ // In FeedParser 2.0 (or as soon as we use SAX) this code should be // totally removed to use the original stream. - byte[] bytes = toByteArray( is ); - String encoding = XMLEncodingParser.parse( bytes ); - - if ( encoding == null ) - encoding = "UTF-8"; - - if ( encoding.equalsIgnoreCase( "UTF-8" ) ) { - - String result = XMLCleanser.cleanse( bytes, encoding ); - bytes = result.getBytes(); - - } - - //remove prefix whitespace, intern HTML entities, etc. - bytes = FeedFilter.parse( bytes ); - - //build an input stream from the our bytes for parsing... - is = new ByteArrayInputStream( bytes ); - //OK. Now we have the right InputStream so we should build our DOM //and exec. DOMBuilder builder = new DOMBuilder(); - + org.jdom.Document doc = builder.build( is ); - + parse( listener, doc ); } catch ( FeedParserException fpe ) { @@ -99,6 +86,43 @@ } /** + * Perform the Xerces UTF8 correction and FeedFilter. + * + * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a> + */ + private static InputStream getCorrectInputStream( InputStream is ) + throws Exception { + + byte[] bytes = toByteArray( is ); + + //FIXME: if we return the WRONG content type here we will royally fuck + //up getByets... UTF-16 and UTF-32 especially + String encoding = XMLEncodingParser.parse( bytes ); + + if ( encoding == null ) + encoding = "UTF-8"; + + if ( encoding.startsWith( "UTF" ) ) { + + String result = XMLCleanser.cleanse( bytes, encoding ); + bytes = FeedFilter.parse( result, encoding ); + + } else { + + bytes = FeedFilter.parse( bytes, encoding ); + + } + + //remove prefix whitespace, intern HTML entities, etc. + + //build an input stream from the our bytes for parsing... + is = new ByteArrayInputStream( bytes ); + + return is; + + } + + /** * @deprecated Use #parse( FeedParserException, InputStream, String ) */ public static void parse( FeedParserListener listener, @@ -145,7 +169,7 @@ return; } - //fall back on RDF. + //fall back on RDF and RSS RSSFeedParser.parse( listener, doc ); 1.3 +3 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java Index: Main.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Main.java 21 Apr 2004 07:16:03 -0000 1.2 +++ Main.java 3 Sep 2004 19:46:47 -0000 1.3 @@ -51,6 +51,8 @@ if ( input.startsWith( "http://" ) ) { is = new URL( input ).openStream(); } else { + + System.out.println( "Opening from file: " + input ); is = new FileInputStream( input ); } 1.12 +3 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java Index: RSSFeedParser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- RSSFeedParser.java 2 Sep 2004 01:19:55 -0000 1.11 +++ RSSFeedParser.java 3 Sep 2004 19:46:47 -0000 1.12 @@ -245,6 +245,8 @@ public static String getChildElementTextByName( FeedParserState state, String name ) throws Exception { + //FIXME: this can be rewritten to use getChild() + XPath xpath = new XPath( "descendant::*[local-name() = '" + name + "']" ); Object resultNode = xpath.selectSingleNode( state.current ); 1.4 +4 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java Index: AnchorParser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- AnchorParser.java 31 Aug 2004 21:00:32 -0000 1.3 +++ AnchorParser.java 3 Sep 2004 19:46:47 -0000 1.4 @@ -40,8 +40,7 @@ parseAnchors( content, listener ); } - - + /** * Get links from the given html with included titles and other metainfo. * @@ -66,6 +65,9 @@ String resource = EntityDecoder.decode( m.group( 1 ) ); String title = EntityDecoder.decode( m.group( 2 ).trim() ); + if ( resource == null || resource.equals( "" ) ) + return; + if ( ! listener.onAnchor( resource, null, title ) ) return; 1.5 +4 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java Index: ResourceExpander.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- ResourceExpander.java 26 Jun 2004 22:42:45 -0000 1.4 +++ ResourceExpander.java 3 Sep 2004 19:46:47 -0000 1.5 @@ -234,6 +234,9 @@ */ public static String getBase( String resource ) { + if ( resource == null ) + return null; + int begin = "http://".length() + 1; int end = resource.lastIndexOf( "/" ); 1.3 +27 -5 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java Index: TestFeedFilter.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- TestFeedFilter.java 2 Sep 2004 00:36:25 -0000 1.2 +++ TestFeedFilter.java 3 Sep 2004 19:46:47 -0000 1.3 @@ -45,6 +45,8 @@ */ public class TestFeedFilter extends TestCase { + public static int current = 0; + public TestFeedFilter( String name ) throws Exception { super( name ); @@ -52,20 +54,31 @@ private void doTest( String resource ) throws Exception { - System.out.println( "resource: " + resource ); + System.out.println( "resource: (" + current + ") " + resource ); URL url = new URL( resource ); - PrintStream out = new PrintStream( new ByteArrayOutputStream() ); + FileOutputStream fos = new FileOutputStream( "/tmp/test-feed-filter-" + current + ".html" ); + PrintStream out = new PrintStream( fos, true, "UTF-8" ); + + out.println( "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\"> " ); + out.println( "<pre>" ); + DebugFeedParserListener listener = new DebugFeedParserListener( out ); FeedParser.parse( listener, url.openStream(), resource ); - + + out.println( "</pre>" ); + + ++current; + } public void test1() throws Exception { + doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" ); + doTest( "file:tests/filter/nbsp-1.xml" ); doTest( "file:tests/filter/entity-atom-1.xml" ); @@ -73,7 +86,16 @@ doTest( "file:tests/filter/prolog-atom-1.xml" ); doTest( "file:tests/filter/prolog-atom-2.xml" ); doTest( "file:tests/filter/prolog-opml-1.xml" ); - + + doTest( "file:tests/feeds/utf16.rss1" ); + doTest( "file:tests/feeds/utf16.rss2" ); + doTest( "file:tests/feeds/i18n.atom" ); + doTest( "file:tests/feeds/utf16.atom" ); + + doTest( "file:tests/feeds/atom-1.xml" ); + doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" ); + doTest( "file:tests/feeds/rss-1.0-international-1.rdf" ); + } public static void main( String[] args ) throws Exception { 1.3 +7 -7 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java Index: TestFeedParser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- TestFeedParser.java 28 Feb 2004 03:35:22 -0000 1.2 +++ TestFeedParser.java 3 Sep 2004 19:46:47 -0000 1.3 @@ -72,13 +72,13 @@ public void finished() {} }; - - listener.setContext( this ); + + listener.setContext( this ); + + ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource ); + + parser.parse( listener, request.getInputStream() ); - ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource ); - - parser.parse( listener, request.getInputStream() ); - } public static void main( String[] args ) { 1.2 +2 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java Index: XMLCleanser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- XMLCleanser.java 3 Aug 2004 01:24:17 -0000 1.1 +++ XMLCleanser.java 3 Sep 2004 19:46:47 -0000 1.2 @@ -54,7 +54,7 @@ */ public static String cleanse( byte[] content, String encoding ) throws Exception { - String s = new String( content, encoding); + String s = new String( content, encoding ); StringBuffer buff = new StringBuffer( content.length ); 1.2 +65 -5 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java Index: XMLEncodingParser.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- XMLEncodingParser.java 3 Aug 2004 01:24:17 -0000 1.1 +++ XMLEncodingParser.java 3 Sep 2004 19:46:47 -0000 1.2 @@ -22,7 +22,8 @@ /** * - * Given an XML document pull out the encoding or null if not specified. + * Given an XML document pull out the encoding or the default (UTF-8) if not + * specified. * * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a> */ @@ -41,17 +42,22 @@ //just use the first 100 bytes String str; - + if ( content.length > 100 ) { str = new String( content, 0, 100 ); } else { str = new String( content ); } + String result = getEncodingFromBOM( content ); + + if ( result != null ) + return result; + int end = str.indexOf( ">" ); if ( end == -1 ) - return null; + return "UTF-8"; String decl = str.substring( 0, end ); @@ -65,16 +71,70 @@ end = encoding.indexOf( "\"" ); if ( end == -1 ) - return null; + return "UTF-8"; encoding = encoding.substring( 0, end); + encoding = encoding.toUpperCase(); + if ( "UTF8".equals( encoding ) ) + encoding = "UTF-8"; + return encoding; } - return null; + return "UTF-8"; + + } + + private static String getEncodingFromBOM( byte[] content ) { + + // Technically speaking if we see a BOM is specified we're supposed to + // return UTF-16 or UTF-32 but because we only care about anything UTF + // returning UTF-8 is incorrect but acceptable. + // + // http://www.unicode.org/faq/utf_bom.html#BOM + + if ( content.length > 2 ) { + + //perform UTF-16 tests + if ( content[0] == -1 && + content[1] == -2 ) + return "UTF-16"; + + if ( content[0] == -2 && + content[1] == -1 ) + return "UTF-16"; + + } + + if ( content.length > 4 ) { + + //perform UTF-16 tests + if ( content[0] == 0 && + content[1] == 0 && + content[2] == -2 && + content[3] == -1 ) + return "UTF-32"; + + if ( content[0] == -1 && + content[1] == -2 && + content[2] == 0 && + content[3] == 0 ) + return "UTF-32"; + } + + return null; + } + public static void main( String[] args ) throws Exception { + + System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) ); + System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) ); + System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) ); + + } + }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]