Author: burton Date: Sun Feb 6 00:29:56 2005 New Revision: 151555 URL: http://svn.apache.org/viewcvs?view=rev&rev=151555 Log: Fixed potential bug (but worried about regression) with accented text in XML
Modified: jakarta/commons/sandbox/feedparser/trunk/TODO jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/network/NetworkException.java jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java jakarta/commons/sandbox/feedparser/trunk/xdocs/navigation.xml Modified: jakarta/commons/sandbox/feedparser/trunk/TODO URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/TODO?view=diff&r1=151554&r2=151555 ============================================================================== --- jakarta/commons/sandbox/feedparser/trunk/TODO (original) +++ jakarta/commons/sandbox/feedparser/trunk/TODO Sun Feb 6 00:29:56 2005 @@ -3,12 +3,15 @@ - Get viewcvs linked to the app - - Nightly builds + - Nightly builds? - 0.5 public release http://jakarta.apache.org/commons/releases/index.html + + + - maven? - (DONE) All FeedParser exceptions should include the URL of the feed if @@ -26,6 +29,16 @@ - (DONE) Rework the factory mechanism to support multiple FeedParsers... should be an interface. + +- How do I want to maintain a public changelog? + +- How do I want to maintain a public TODO? + +- Fix the feedparsing bug where we'll drop chars: + + current-broken-drop-accents.atom + + The bug is in getCorrectInputStream - Networking layer should support per-request UserAgent settings. This should just be a request header I think Modified: jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java?view=diff&r1=151554&r2=151555 ============================================================================== --- jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java (original) +++ jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/FeedParserImpl.java Sun Feb 6 00:29:56 2005 @@ -106,10 +106,10 @@ if (encoding == null) encoding = "UTF-8"; - if (encoding.startsWith("UTF")) { + if ( encoding.startsWith( "UTF" ) ) { - String result = XMLCleanser.cleanse(bytes, encoding); - bytes = FeedFilter.parse(result, encoding); + String result = XMLCleanser.cleanse( bytes, encoding ); + bytes = FeedFilter.parse( result, encoding ); } else { @@ -120,7 +120,7 @@ //remove prefix whitespace, intern HTML entities, etc. //build an input stream from the our bytes for parsing... - is = new ByteArrayInputStream(bytes); + is = new ByteArrayInputStream( bytes ); return is; Modified: jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/network/NetworkException.java URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/network/NetworkException.java?view=diff&r1=151554&r2=151555 ============================================================================== --- jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/network/NetworkException.java (original) +++ jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/network/NetworkException.java Sun Feb 6 00:29:56 2005 @@ -22,7 +22,7 @@ /** * * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a> - * @version $Id: NetworkException.java,v 1.1 2005/01/25 07:55:19 burton Exp $ + * @version $Id$ */ public class NetworkException extends IOException { @@ -109,6 +109,17 @@ public int getResponseCode() { + //FIXME: + // java.lang.NumberFormatException: For input string: "fie" + // at java.lang.NumberFormatException.forInputString(NumberFormatException.java:48) + // at java.lang.Integer.parseInt(Integer.java:468) + // at java.lang.Integer.parseInt(Integer.java:518) + // at org.peerfear.newsmonster.network.NetworkException.getResponseCode(NetworkException.java:142) + // at ksa.robot.FeedTask._doTaskLogFailure(FeedTask.java:264) + // at ksa.robot.FeedTask.run(FeedTask.java:202) + // at ksa.robot.TaskThread.doProcessTask(TaskThread.java:298) + // at ksa.robot.TaskThread.run(TaskThread.java:111) + if ( _urlConnection == null ) { return -1; } Modified: jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java?view=diff&r1=151554&r2=151555 ============================================================================== --- jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java (original) +++ jakarta/commons/sandbox/feedparser/trunk/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java Sun Feb 6 00:29:56 2005 @@ -20,10 +20,10 @@ * Class that can cleanse a string so that nothing can be present to break an * XML parser. This is a VERY non-portable class as it is meant to work just * with Xalan/Xerces and may remove more text and replace things that are - * non-XML centric. + * non-XML centric. * * @author <a href="mailto:[EMAIL PROTECTED]">Kevin A. Burton</a> - * @version $Id: XMLCleanser.java,v 1.2 2004/09/03 19:46:47 burton Exp $ + * @version $Id$ */ public class XMLCleanser { @@ -125,26 +125,43 @@ } /* - * This is a utility function for determining whether a specified - * character is a character according to production 2 of the - * XML 1.0 specification. + * This is a utility function for determining whether a specified character + * is a character according to production 2 of the XML 1.0 specification. * * @param c <code>char</code> to check for XML compliance. - * @return <code>boolean</code> - true if it's a character, - * false otherwise. + + * @return <code>boolean</code> - true if it's a character, false otherwise. */ - public static boolean isXMLCharacter(char c) { + public static boolean isXMLCharacter( char c ) { + // A parsed entity contains text, a sequence of characters, which may + // represent markup or character data. A character is an atomic unit of + // text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters + // are tab, carriage return, line feed, and the legal graphic characters + // of Unicode and ISO/IEC 10646. The use of "compatibility characters", + // as defined in section 6.8 of [Unicode], is discouraged. + + // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | + // [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate + // blocks, FFFE, and FFFF. */ + if (c == '\n') return true; if (c == '\r') return true; if (c == '\t') return true; + + //NOTE: this was BROKEN! The range between 0x80 and 0xFF is valid XML + //and would end up dropping latin characters in UTF-8. Why did I want + //to return false here again? - if (c < 0x20) return false; if (c < 0x80) return true; - if (c < 0xFF) return false; if (c <= 0xD7FF) return true; + //if (c < 0x20) return false; if (c < 0x80) return true; + //if (c < 0xFF) return false; if (c <= 0xD7FF) return true; + + if (c < 0x20) return false; if (c <= 0xD7FF) return true; if (c < 0xE000) return false; if (c <= 0xFFFD) return true; if (c < 0x10000) return false; if (c <= 0x10FFFF) return true; return false; + } } Modified: jakarta/commons/sandbox/feedparser/trunk/xdocs/navigation.xml URL: http://svn.apache.org/viewcvs/jakarta/commons/sandbox/feedparser/trunk/xdocs/navigation.xml?view=diff&r1=151554&r2=151555 ============================================================================== --- jakarta/commons/sandbox/feedparser/trunk/xdocs/navigation.xml (original) +++ jakarta/commons/sandbox/feedparser/trunk/xdocs/navigation.xml Sun Feb 6 00:29:56 2005 @@ -19,6 +19,10 @@ <item name="Wiki" href="http://wiki.apache.org/jakarta-commons/FeedParser" /> + + <item name="ViewCVS" + href="http://svn.apache.org/viewcvs.cgi/jakarta/commons/sandbox/feedparser/trunk" /> + </menu> &common-menus; </body> --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]