tools XMLCleanser.java XMLEncodingParser.java

burton Fri, 03 Sep 2004 12:46:54 -0700

burton      2004/09/03 12:46:47

  Modified:    feedparser TODO build.xml
               feedparser/src/java/org/apache/commons/feedparser
                        FeedFilter.java FeedParser.java Main.java
                        RSSFeedParser.java
               feedparser/src/java/org/apache/commons/feedparser/locate
                        AnchorParser.java ResourceExpander.java
               feedparser/src/java/org/apache/commons/feedparser/test
                        TestFeedFilter.java TestFeedParser.java
               feedparser/src/java/org/apache/commons/feedparser/tools
                        XMLCleanser.java XMLEncodingParser.java
  Log:
  don't use links if they are null
  
  Revision  Changes    Path
  1.12      +1 -2      jakarta-commons-sandbox/feedparser/TODO
  
  Index: TODO
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/TODO,v
  retrieving revision 1.11
  retrieving revision 1.12
  diff -u -r1.11 -r1.12
  --- TODO      31 Aug 2004 21:01:37 -0000      1.11
  +++ TODO      3 Sep 2004 19:46:47 -0000       1.12
  @@ -1,4 +1,5 @@
   
  +- BUG: what happens when I put a comment after a UTF-16 BOM?!
   
   - Support Base64 Atom values and the ability to enable them.
       
  @@ -6,9 +7,7 @@
   
   - Do we support multiple content items in Atom?
   
  -
   - We do not support multipart/alternative in the feedparser.
  -
   
   - Do we support atom:summary at ALL?!  I don't think so...
       
  
  
  
  1.7       +1 -0      jakarta-commons-sandbox/feedparser/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- build.xml 2 Sep 2004 00:36:25 -0000       1.6
  +++ build.xml 3 Sep 2004 19:46:47 -0000       1.7
  @@ -121,6 +121,7 @@
                  
               <formatter type="plain" usefile="false"/>
   
  +            <test name="org.apache.commons.feedparser.test.TestFeedFilter"/>
               <test name="org.apache.commons.feedparser.test.TestProbeLocator"/>
               <test name="org.apache.commons.feedparser.test.TestAtom"/>
               <test name="org.apache.commons.feedparser.test.TestFeedParserUTF8"/>
  
  
  
  1.4       +37 -12    
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java
  
  Index: FeedFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FeedFilter.java   2 Sep 2004 01:19:55 -0000       1.3
  +++ FeedFilter.java   3 Sep 2004 19:46:47 -0000       1.4
  @@ -32,40 +32,65 @@
   
       private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
       
  -    public static byte[] parse( byte[] bytes ) {
  +    public static byte[] parse( byte[] bytes )
  +        throws Exception {
   
  -        String content = new String( bytes );
  +        return parse( bytes, "UTF-8" );
  +
  +    }
  +
  +    public static byte[] parse( byte[] bytes, String encoding )
  +        throws Exception {
  +
  +        String content = new String( bytes, encoding );
  +
  +        return parse( content, encoding );
  +
  +    }
  +
  +    public static byte[] parse( String content, String encoding )
  +        throws Exception {
   
           //remove leading prolog...
   
  -        content = doRemoveLeadingProlog( content );
  +        content = doRemoveLeadingProlog( content, encoding );
           content = doDecodeEntities( content );
           
  -        return content.getBytes();
  -        
  -    }
  +        return content.getBytes( encoding );
   
  +    }
  +        
       /**
        * Removing prolog whitespace, comments, and other garbage from the
        * beginning of a feed.
        *
        * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a>
        */
  -    private static String doRemoveLeadingProlog( String content ) {
  +    private static String doRemoveLeadingProlog( String content, String encoding ) {
  +
  +        //if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because
  +        //it triggers a UTF-16 parse.
   
  +        if ( "UTF-16".equals( encoding ) ||
  +             "UTF-32".equals( encoding ) )
  +            return content;
  +        
           //move to the beginning of the first element or comment.  When this is a
           //processing instruction we will move to that
           int begin = content.indexOf( "<" );
   
  -        if ( begin > 0 )
  +        if ( begin > 0 ) {
               content = content.substring( begin, content.length() );
  +        }
   
  -        //now skip to the XML processing instruction when necessary.
  +        //now skip to the XML processing instruction when necessary.  This is
  +        //used to remove comments prior to <?xml which are not allowed.
           
           begin = content.indexOf( "<?xml" );
   
  -        if ( begin > 0 )
  +        if ( begin > 0 ) {
               content = content.substring( begin, content.length() );
  +        }
   
           return content;
           
  @@ -107,7 +132,7 @@
           
       }
       
  -    public static void main( String[] args ) {
  +    public static void main( String[] args ) throws Exception {
   
           byte[] b = parse( "hello &eacute; world".getBytes() );
   
  
  
  
  1.10      +47 -23    
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
  
  Index: FeedParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- FeedParser.java   31 Aug 2004 21:00:32 -0000      1.9
  +++ FeedParser.java   3 Sep 2004 19:46:47 -0000       1.10
  @@ -32,6 +32,8 @@
   
   import org.jaxen.jdom.*;
   
  +import org.apache.log4j.Logger;
  +
   /**
    * This FeedParser implementation is based on JDOM and Jaxen and is based around
    * XPath and JDOM iteration.  While the implementation is straight forward it
  @@ -43,6 +45,8 @@
    */
   public class FeedParser {
   
  +    private static Logger log = Logger.getLogger( FeedParser.class );
  +    
       /**
        * Parse this feed.
        * 
  @@ -56,6 +60,8 @@
   
           try { 
   
  +            is = getCorrectInputStream( is );
  +
               // Need to massage our XML support for UTF-8 to prevent the
               // dreaded "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in
               // some default feeds.  This was tested a great deal under
  @@ -64,31 +70,12 @@
               // In FeedParser 2.0 (or as soon as we use SAX) this code should be
               // totally removed to use the original stream.
   
  -            byte[] bytes = toByteArray( is );
  -            String encoding = XMLEncodingParser.parse( bytes );
  -
  -            if ( encoding == null )
  -                encoding = "UTF-8";
  -
  -            if ( encoding.equalsIgnoreCase( "UTF-8" ) ) {
  -
  -                String result = XMLCleanser.cleanse( bytes, encoding );
  -                bytes = result.getBytes();
  -                
  -            } 
  -
  -            //remove prefix whitespace, intern HTML entities, etc.
  -            bytes = FeedFilter.parse( bytes );
  -
  -            //build an input stream from the our bytes for parsing...
  -            is = new ByteArrayInputStream( bytes );
  -            
               //OK.  Now we have the right InputStream so we should build our DOM
               //and exec.
               DOMBuilder builder = new DOMBuilder();
  -            
  +
               org.jdom.Document doc = builder.build( is );
  -            
  +
               parse( listener, doc );
   
           } catch ( FeedParserException fpe ) {
  @@ -99,6 +86,43 @@
       }
   
       /**
  +     * Perform the Xerces UTF8 correction and FeedFilter.
  +     *
  +     * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a>
  +     */
  +    private static InputStream getCorrectInputStream( InputStream is )
  +        throws Exception {
  +
  +        byte[] bytes = toByteArray( is );
  +
  +        //FIXME: if we return the WRONG content type here we will royally fuck
  +        //up getBytes... UTF-16 and UTF-32 especially
  +        String encoding = XMLEncodingParser.parse( bytes );
  +
  +        if ( encoding == null )
  +            encoding = "UTF-8";
  +
  +        if ( encoding.startsWith( "UTF" ) ) {
  +                
  +            String result = XMLCleanser.cleanse( bytes, encoding );
  +            bytes = FeedFilter.parse( result, encoding );
  +
  +        } else {
  +
  +            bytes = FeedFilter.parse( bytes, encoding );
  +             
  +        }
  +
  +        //remove prefix whitespace, intern HTML entities, etc.
  +
  +        //build an input stream from our bytes for parsing...
  +        is = new ByteArrayInputStream( bytes );
  +
  +        return is;
  +        
  +    }
  +    
  +    /**
        * @deprecated Use #parse( FeedParserException, InputStream, String )
        */
       public static void parse( FeedParserListener listener,
  @@ -145,7 +169,7 @@
                   return;
               }
   
  -            //fall back on RDF.
  +            //fall back on RDF and RSS
   
               RSSFeedParser.parse( listener, doc );
               
  
  
  
  1.3       +3 -1      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java
  
  Index: Main.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Main.java 21 Apr 2004 07:16:03 -0000      1.2
  +++ Main.java 3 Sep 2004 19:46:47 -0000       1.3
  @@ -51,6 +51,8 @@
           if ( input.startsWith( "http://" ) ) {
               is = new URL( input ).openStream();
           } else {
  +
  +            System.out.println( "Opening from file: " + input );
               is = new FileInputStream( input );
           }
   
  
  
  
  1.12      +3 -1      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java
  
  Index: RSSFeedParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java,v
  retrieving revision 1.11
  retrieving revision 1.12
  diff -u -r1.11 -r1.12
  --- RSSFeedParser.java        2 Sep 2004 01:19:55 -0000       1.11
  +++ RSSFeedParser.java        3 Sep 2004 19:46:47 -0000       1.12
  @@ -245,6 +245,8 @@
       public static String getChildElementTextByName( FeedParserState state,
                                                       String name ) throws Exception {
   
  +        //FIXME: this can be rewritten to use getChild()
  +        
           XPath xpath = new XPath( "descendant::*[local-name() = '" + name + "']" );
           Object resultNode = xpath.selectSingleNode( state.current );
   
  
  
  
  1.4       +4 -2      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
  
  Index: AnchorParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- AnchorParser.java 31 Aug 2004 21:00:32 -0000      1.3
  +++ AnchorParser.java 3 Sep 2004 19:46:47 -0000       1.4
  @@ -40,8 +40,7 @@
           parseAnchors( content, listener );
           
       }
  -        
  -    
  +
       /**
        * Get links from the given html with included titles and other metainfo.
        *
  @@ -66,6 +65,9 @@
               String resource = EntityDecoder.decode( m.group( 1 ) );
               String title = EntityDecoder.decode( m.group( 2 ).trim() );
   
  +            if ( resource == null || resource.equals( "" ) )
  +                return;
  +            
               if ( ! listener.onAnchor( resource, null, title ) )
                   return;
   
  
  
  
  1.5       +4 -1      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java
  
  Index: ResourceExpander.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- ResourceExpander.java     26 Jun 2004 22:42:45 -0000      1.4
  +++ ResourceExpander.java     3 Sep 2004 19:46:47 -0000       1.5
  @@ -234,6 +234,9 @@
        */
       public static String getBase( String resource ) {
   
  +        if ( resource == null )
  +            return null;
  +        
           int begin = "http://".length() + 1;
           
           int end = resource.lastIndexOf( "/" );
  
  
  
  1.3       +27 -5     
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java
  
  Index: TestFeedFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- TestFeedFilter.java       2 Sep 2004 00:36:25 -0000       1.2
  +++ TestFeedFilter.java       3 Sep 2004 19:46:47 -0000       1.3
  @@ -45,6 +45,8 @@
    */
   public class TestFeedFilter extends TestCase {
   
  +    public static int current = 0;
  +    
       public TestFeedFilter( String name ) throws Exception {
           super( name );
   
  @@ -52,20 +54,31 @@
   
       private void doTest( String resource ) throws Exception {
   
  -        System.out.println( "resource: " + resource );
  +        System.out.println( "resource: (" + current + ") " + resource );
   
           URL url = new URL( resource );
   
  -        PrintStream out = new PrintStream( new ByteArrayOutputStream() );
  +        FileOutputStream fos = new FileOutputStream( "/tmp/test-feed-filter-" + current + ".html" );
           
  +        PrintStream out = new PrintStream( fos, true, "UTF-8" );
  +
  +        out.println( "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\"> " );
  +        out.println( "<pre>" );
  +
           DebugFeedParserListener listener = new DebugFeedParserListener( out );
           
           FeedParser.parse( listener, url.openStream(), resource );
  -        
  +
  +        out.println( "</pre>" );
  +
  +        ++current;
  +
       }
       
       public void test1() throws Exception {
   
  +        doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" );
  +
           doTest( "file:tests/filter/nbsp-1.xml" );
   
           doTest( "file:tests/filter/entity-atom-1.xml" );
  @@ -73,7 +86,16 @@
           doTest( "file:tests/filter/prolog-atom-1.xml" );
           doTest( "file:tests/filter/prolog-atom-2.xml" );
           doTest( "file:tests/filter/prolog-opml-1.xml" );
  -        
  +
  +        doTest( "file:tests/feeds/utf16.rss1" );
  +        doTest( "file:tests/feeds/utf16.rss2" );
  +        doTest( "file:tests/feeds/i18n.atom" );
  +        doTest( "file:tests/feeds/utf16.atom" );
  +
  +        doTest( "file:tests/feeds/atom-1.xml" );
  +        doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" );
  +        doTest( "file:tests/feeds/rss-1.0-international-1.rdf" );
  +
       }
   
       public static void main( String[] args ) throws Exception {
  
  
  
  1.3       +7 -7      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java
  
  Index: TestFeedParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- TestFeedParser.java       28 Feb 2004 03:35:22 -0000      1.2
  +++ TestFeedParser.java       3 Sep 2004 19:46:47 -0000       1.3
  @@ -72,13 +72,13 @@
                   public void finished() {}
                   
               };
  -
  -            listener.setContext( this );
  +        
  +        listener.setContext( this );
  +        
  +        ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
  +        
  +        parser.parse( listener, request.getInputStream() );
               
  -            ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
  -
  -            parser.parse( listener, request.getInputStream() );
  -
       }
   
       public static void main( String[] args ) {
  
  
  
  1.2       +2 -2      
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java
  
  Index: XMLCleanser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- XMLCleanser.java  3 Aug 2004 01:24:17 -0000       1.1
  +++ XMLCleanser.java  3 Sep 2004 19:46:47 -0000       1.2
  @@ -54,7 +54,7 @@
        */
       public static String cleanse( byte[] content, String encoding ) throws Exception {
   
  -        String s = new String( content, encoding);
  +        String s = new String( content, encoding );
           
           StringBuffer buff = new StringBuffer( content.length );
   
  
  
  
  1.2       +65 -5     
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java
  
  Index: XMLEncodingParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- XMLEncodingParser.java    3 Aug 2004 01:24:17 -0000       1.1
  +++ XMLEncodingParser.java    3 Sep 2004 19:46:47 -0000       1.2
  @@ -22,7 +22,8 @@
   
   /**
    *
  - * Given an XML document pull out the encoding or null if not specified.
  + * Given an XML document pull out the encoding or the default (UTF-8) if not
  + * specified.
    *
    * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a>
    */
  @@ -41,17 +42,22 @@
           //just use the first 100 bytes
   
           String str;
  -        
  +
           if ( content.length > 100 ) {
               str = new String( content, 0, 100 );
           } else {
               str = new String( content );
           }
   
  +        String result = getEncodingFromBOM( content );
  +
  +        if ( result != null )
  +            return result;
  +        
           int end = str.indexOf( ">" );
   
           if ( end == -1 )
  -            return null;
  +            return "UTF-8";
   
           String decl = str.substring( 0, end );
   
  @@ -65,16 +71,70 @@
               end = encoding.indexOf( "\"" );
               
               if ( end == -1 )
  -                return null;
  +                return "UTF-8";
   
               encoding = encoding.substring( 0, end);
  +            encoding = encoding.toUpperCase();
   
  +            if ( "UTF8".equals( encoding ) )
  +                encoding = "UTF-8";
  +            
               return encoding;
               
           }
   
  -        return null;
  +        return "UTF-8";
  +
  +    }
  +
  +    private static String getEncodingFromBOM( byte[] content ) {
  +
  +        // Technically speaking if we see a BOM is specified we're supposed to
  +        // return UTF-16 or UTF-32 but because we only care about anything UTF
  +        // returning UTF-8 is incorrect but acceptable.
  +        //
  +        // http://www.unicode.org/faq/utf_bom.html#BOM
  +
  +        if ( content.length > 2 ) {
  +
  +            //perform UTF-16 tests
  +            if ( content[0] == -1 &&
  +                 content[1] == -2 ) 
  +                return "UTF-16";
  +
  +            if ( content[0] == -2 &&
  +                 content[1] == -1 ) 
  +                return "UTF-16";
  +
  +        }
  +
  +        if ( content.length > 4 ) {
  +
  +            //perform UTF-32 tests
  +            if ( content[0] == 0 &&
  +                 content[1] == 0 &&
  +                 content[2] == -2 &&
  +                 content[3] == -1 ) 
  +                return "UTF-32";
  +
  +            if ( content[0] == -1 &&
  +                 content[1] == -2 &&
  +                 content[2] == 0 &&
  +                 content[3] == 0 ) 
  +                return "UTF-32";
   
  +        }
  +
  +        return null;
  +        
       }
       
  +    public static void main( String[] args ) throws Exception {
  +
  +        System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) );
  +        System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) );
  +        System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) );
  +
  +    }
  +
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools XMLCleanser.java XMLEncodingParser.java

Reply via email to