[feedparser] Patch to Feed Parser 'Locate' System

Brad Neuberg Thu, 21 Oct 2004 09:52:34 -0700

The following patch updates the org.apache.commons.feedparser.locate system to use the new BlogService refactoring described earlier. It mostly consists of patches to BlogServiceDiscovery and ProbeLocator.

The following patch updates the org.apache.commons.feedparser.locate package to correctly use the new 'blogservice' subpackage.

Index src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java =================================================================== RCS file /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java,v retrieving revision 1.4 diff -u -B -r1.4 BlogServiceDiscovery.java --- src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java 30 Sep 2004 192738 -0000 1.4 +++ src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java 21 Oct 2004 011457 -0000 @@ -16,310 +16,34 @@

 package org.apache.commons.feedparser.locate;

-import java.util.regex.*;
+import org.apache.commons.feedparser.*;
+import org.apache.commons.feedparser.locate.blogservice.*;

/** - * * Determines what blog provider a given URI is using, * such as whether it is hosted on Blogspot, Radio Userland, etc. * * @author <a href="[EMAIL PROTECTED]">Brad Neuberg/a> */ public class BlogServiceDiscovery { - /** Locates all the generator meta tags - * (i.e. <meta content="generator" content="someGenerator"/>) - */ - private static Pattern metaTagsPattern = - Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*", - Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); - - /** A pattern used to discover PMachine blogs. */ - private static Pattern pmachinePattern = - Pattern.compile("pmachine", Pattern.CASE_INSENSITIVE);

-    /** A pattern used to discover Blosxom blogs. */
-    private static Pattern blosxomPattern =
-                Pattern.compile("alt=[\"' ]powered by blosxom[\"' ]",
-                                Pattern.CASE_INSENSITIVE);
-
-    /** Uses the given resource and content to determine what blog provider
-     *  a given URI is using.
-     */
-    public static BlogService discover( String resource, String content ) {
-        return discoverService( resource, content );
-    }
-
-    /** Uses the given resource to determine what blog provider
-     *  a given URI is using; useful if you have no content but still want
-     *  to do a best guess of what blog service is being used.
-     */
-    public static BlogService discover( String resource ) {
-        return discoverService( resource, null );
+    public static BlogService discover( String resource )
+                                            throws FeedParserException {
+        return discover(resource, null);
     }

- // NOTE ALL of this code should change to a visitor pattern. Bad design. - - protected static BlogService discoverService( String resource, - String content ) { + public static BlogService discover( String resource, + String content ) + throws FeedParserException { resource = resource.toLowerCase(); - - // check the weblogs in order of their popularity - if (isBlogger(resource, content)) { - return BlogService.BLOGGER; - } - else if (isLiveJournal(resource, content)) { - return BlogService.LIVEJOURNAL; - } - else if (isDiaryLand(resource, content)) { - return BlogService.DIARYLAND; - } - else if (isMovableType(resource, content)) { - return BlogService.MOVABLE_TYPE; - } - else if (isXanga(resource, content)) { - return BlogService.XANGA; - } - else if (isWordPress(resource, content)) { - return BlogService.WORDPRESS; - } - else if (isAOLJournals(resource, content)) { - return BlogService.AOL_JOURNAL; - } - else if (isTypePad(resource, content)) { - return BlogService.TYPEPAD; - } - else if (isPMachine(resource, content)) { - return BlogService.PMACHINE; - } - /* FIXME No way to detect Expression Engine weblogs right now - else if (isExpressionEngine(resource, content)) { - return BlogService.EXPRESSION_ENGINE; - }*/ - else if (isGreyMatter(resource, content)) { - return BlogService.GREYMATTER; - } - /* FIXME We can't detect iBlog sites - else if (isIBlog(resource, content)) { - return BlogService.IBLOG; - }*/ - else if (isBlosxom(resource, content)) { - return BlogService.BLOSXOM; - } - /* FIXME We can't detect Manila sites. 
- else if (isManila(resource, content)) { - return BlogService.MANILA; - }*/ - else if ( isRadioUserland( resource, content ) ) { - return BlogService.RADIO_USERLAND; - } - else if ( isTextPattern( resource, content ) ) { - return BlogService.TEXTPATTERN; - } - else if ( isTextAmerica( resource, content ) ) { - return BlogService.TEXTAMERICA; - } - else if ( isYahooGroups( resource, content ) ) { - return BlogService.YAHOOGROUPS; - } - else if ( BlogService.FLICKR.accept( resource, content ) ) { - return BlogService.FLICKR; - } - else { - return BlogService.UNKNOWN; - } - } - - // **** vendor specific CMS detection code ********************************** - - protected static boolean isBlogger( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "blogspot.com"); - - if (results == false) { - results = hasGenerator(content, "blogger"); - } - - return results; - } - - protected static boolean isGreyMatter( String resource, String content ) { - boolean results = false; - - results = hasGenerator(content, "greymatter"); - - return results; - } - - /*protected static boolean isExpressionEngine( String resource, - String content ) { - boolean results = false; - - return results; - } */ - - protected static boolean isMovableType( String resource, String content ) { - boolean results = false; - - results = hasGenerator(content, "movabletype"); - - return results; - } - - protected static boolean isAOLJournals( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "journals.aol.com"); - - return results; - } - - protected static boolean isDiaryLand( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "diaryland.com"); - - return results; - } - - protected static boolean isPMachine( String resource, String content ) { - boolean results = false; - - Matcher pmachineMatcher = pmachinePattern.matcher(resource); - - results = 
pmachineMatcher.find(); - - return results; - } - - protected static boolean isTextPattern( String resource, String content ) { - boolean results = false; - - results = hasGenerator(content, "textpattern"); - - return results; - } - - /* FIXME We can't detect Manila sites. - protected static boolean isManila( String resource, String content ) { - boolean results = false; - - return results; - } - */ - - protected static boolean isTypePad( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "typepad.com"); - - if (results == false) { - results = hasGenerator(content, "typepad"); - } - - return results; - } - - protected static boolean isRadioUserland( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "radio.userland.com"); - - if (results == false) { - results = containsDomain(resource, "radio.weblogs.com"); - } - - return results; - } - - protected static boolean isLiveJournal( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "livejournal.com"); - - return results; - } - - protected static boolean isWordPress( String resource, String content ) { - boolean results = false; + BlogService[] blogServices = BlogService.getBlogServices();

- results = hasGenerator(content, "wordpress"); - - return results; - } - - /* FIXME We can't detect iBlog sites. - protected static boolean isIBlog( String resource, String content ) { - boolean results = false; - - return results; - }*/ - - protected static boolean isXanga( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "xanga.com"); - - return results; - } - - protected static boolean isBlosxom( String resource, String content ) { - boolean results = false; - - // This is the only kind of blog that we need to check for a - // 'Powered by Blosxom'. We do this with the alt= value on the - // Powered By image. - // FIXME This might be fragile, but it is used across all of the - // Blosxom blogs I have looked at so far. Brad Neuberg, [EMAIL PROTECTED] - - Matcher blosxomMatcher = blosxomPattern.matcher(content); - results = blosxomMatcher.find(); - - return results; - } - - protected static boolean isTextAmerica( String resource, String content ) { - boolean results = false; - - results = containsDomain(resource, "textamerica.com"); - - return results; - } - - protected static boolean isYahooGroups( String resource, String content ) { - boolean results = false; - - results = containsDomain( resource, "groups.yahoo.com" ); - - return results; - } - - // **** util code *********************************************************** - - /** Determines if the given resource contains the given domain name - * fragment. - */ - protected static boolean containsDomain(String resource, String domain) { - return (resource.indexOf(domain) != -1); - } - - /** Determines if the given content was generated by the given generator - * (i.e. this document contains a meta tag with name="generator" and - * content equal to the generatorType). 
- */ - protected static boolean hasGenerator(String content, String generatorType) { - if (content == null) { - return false; + for (int i = 0; i < blogServices.length; i++) { + if (blogServices[i].isThisService(resource, content)) { + return blogServices[i]; + } }

- Matcher metaTagsMatcher = metaTagsPattern.matcher(content); - if (metaTagsMatcher.find()) { - String metaTag = metaTagsMatcher.group(0).toLowerCase(); - generatorType = generatorType.toLowerCase(); - return (metaTag.indexOf(generatorType) != -1); - } - else { - return false; - } + return new Unknown(); } } Index src/java/org/apache/commons/feedparser/locate/FeedLocator.java =================================================================== RCS file /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v retrieving revision 1.23 diff -u -B -r1.23 FeedLocator.java --- src/java/org/apache/commons/feedparser/locate/FeedLocator.java 4 Oct 2004 005947 -0000 1.23 +++ src/java/org/apache/commons/feedparser/locate/FeedLocator.java 21 Oct 2004 011457 -0000 @@ -141,17 +141,17 @@ Iterator it = l.iterator();

         if ( it.hasNext() == false ) {
-            System.out.println( "NO LINKS FOUND" );
+            log.info( "NO LINKS FOUND" );
         }

-        System.out.println( " FIXME (debug) AD RSS " + l.getAdRSSFeed() );
-        System.out.println( " FIXME (debug) AD Atom " + l.getAdAtomFeed() );
+        log.info( " FIXME (debug) AD RSS " + l.getAdRSSFeed() );
+        log.info( " FIXME (debug) AD Atom " + l.getAdAtomFeed() );

         while ( it.hasNext() ) {

             FeedReference ref = (FeedReference)it.next();

-            System.out.println( ref.resource );
+            log.info( ref.resource );

Index src/java/org/apache/commons/feedparser/locate/ProbeLocator.java =================================================================== RCS file /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v retrieving revision 1.15 diff -u -B -r1.15 ProbeLocator.java --- src/java/org/apache/commons/feedparser/locate/ProbeLocator.java 30 Sep 2004 192738 -0000 1.15 +++ src/java/org/apache/commons/feedparser/locate/ProbeLocator.java 21 Oct 2004 011457 -0000 @@ -17,12 +17,11 @@ package org.apache.commons.feedparser.locate;

 import org.apache.commons.feedparser.*;
+import org.apache.commons.feedparser.locate.blogservice.*;

 import org.peerfear.newsmonster.network.*;

 import java.util.*;
-import java.util.regex.*;
-import java.net.*;

 import org.apache.log4j.Logger;

@@ -65,124 +64,7 @@ * for client-side aggregators would be too great. */ public static boolean BLOG_SERVICE_PROBING_ENABLED = false; - - /** - * A regex to find any trailing filename and strip it - */ - private static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)"); - - /** - * A regex to extract the user from a Xanga URL - */ - private static Pattern xangaURLPattern = Pattern.compile(".*user=(\\w*)"); - - /** - * Internal map to store probe URIs and their services. - */ - private static Map probeMapping = new HashMap(); - - static { - /** Associates a given BlogService with a list of usual locations to find - * their RSS file. The locations are given as an array of FeedReferences, - * with highest quality feeds put first. These blog providers don't - * provide consistent autodiscovery. - */ - FeedReference blosxomLocations[] = - // there is sometimes an index.rss20 file, but Blosxom has a bug where - // it incorrectly responds to HTTP HEAD requests for that file, - // saying that it exists when it doesn't. Most sites don't seem - // to have this file so we don't include it here. 
- // Brad Neuberg, [EMAIL PROTECTED] - { new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) }; - - // Diaryland doesn't offer feeds - //FeedReference diaryLandLocations[] = { "" }; - FeedReference bloggerLocations[] = - { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE) }; - - FeedReference aolJournalLocations[] = - { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference pmachineLocations[] = - { new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference textPatternLocations[] = - { new FeedReference("?atom=1", FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("?rss=1", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference manilaLocations[] = - { new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference typepadLocations[] = - { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference radioUserlandLocations[] = - { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference liveJournalLocations[] = - { new FeedReference("data/atom", FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("data/rss", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference wordPressLocations[] = - { new FeedReference("wp-atom.php", FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("wp-rss2.php", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("wp-rss.php", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference iBlogLocations[] = - { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) }; - - // Xanga feeds have to be handled specially since they put their - // feeds at the location http//www.xanga.com/rss.aspx?user=username - FeedReference xangaLocations[] = - { new FeedReference("rss.aspx?user=", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference 
textAmericaLocations[] = - { new FeedReference("rss.aspx", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference unknownLocations[] = - { new FeedReference("atom.xml",FeedReference.ATOM_MEDIA_TYPE), - new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE), - new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE) }; - - FeedReference yahooGroupsLocations[] = - { new FeedReference( "", FeedReference.RSS_MEDIA_TYPE) }; - - probeMapping.put( BlogService.BLOSXOM, blosxomLocations ); - - //Tue Aug 31 2004 0421 PM ([EMAIL PROTECTED]) Diaryland doesn't - //currently offer RSS or Atom feeds (shame, shame, shame). This is a - //placeholder until they see the light and provide Atom feeds. - - //probeMapping.put( BlogService.DIARYLAND, diaryLandLocations );

- probeMapping.put( BlogService.BLOGGER, bloggerLocations ); - probeMapping.put( BlogService.AOL_JOURNAL, aolJournalLocations ); - probeMapping.put( BlogService.PMACHINE, pmachineLocations ); - probeMapping.put( BlogService.TEXTPATTERN, textPatternLocations ); - probeMapping.put( BlogService.MANILA, manilaLocations ); - probeMapping.put( BlogService.TYPEPAD, typepadLocations ); - probeMapping.put( BlogService.RADIO_USERLAND, radioUserlandLocations ); - probeMapping.put( BlogService.LIVEJOURNAL, liveJournalLocations ); - probeMapping.put( BlogService.WORDPRESS, wordPressLocations ); - probeMapping.put( BlogService.IBLOG, iBlogLocations ); - probeMapping.put( BlogService.XANGA, xangaLocations); - probeMapping.put( BlogService.YAHOOGROUPS, yahooGroupsLocations); - - //probeMapping.put( BlogService.YAHOOGROUPS, yahooGroupsLocations); - - probeMapping.put( BlogService.FLICKR, BlogService.FLICKR.getLocations() ); - - probeMapping.put( BlogService.UNKNOWN, unknownLocations ); - - probeMapping.put( BlogService.TEXTAMERICA, textAmericaLocations ); - - }

     /**
      *
@@ -197,99 +79,49 @@
         // fail-fast if we already have some results and if we determine that
         // we can trust the results (TextAmerica has invalid autodiscovery,
         // for example)
-        if ( list.size() > 0 && blogService.hasValidAutodiscovery() )
+        if ( list.size() > 0 && blogService.hasValidAutoDiscovery() )
             return list;
-        else if ( blogService.hasValidAutodiscovery() == false ) {
+        else if ( blogService.hasValidAutoDiscovery() == false ) {
             // clear out the list so far since we can't trust the results
             list.clear();
         }

if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) { - - String baseFeedPath = getFeedPath( resource ); - - FeedReference mapping[] = null; - - HashSet previousAttempts = new HashSet(); - - boolean feedFound = false; - - if ( probeMapping.containsKey( blogService ) ) { - - mapping = (FeedReference[])probeMapping.get( blogService ); + List servicesToTry = new ArrayList(); + servicesToTry.add(blogService); + // only try the Unknown service if we want aggresive probing + if (AGGRESIVE_PROBING_ENABLED) + servicesToTry.add(new Unknown()); + Iterator iter = servicesToTry.iterator(); + Set previousAttempts = new HashSet(); + + while (iter.hasNext() && list.size() == 0) { + BlogService currentService = (BlogService)iter.next(); + FeedReference[] mapping = currentService.getFeedLocations(resource, content); log.info( "mapping = " + mapping ); - log.info( "baseFeedPath = " + baseFeedPath ); - + // try out each mapping for (int i = 0; i < mapping.length; i++) { + String baseFeedPath = currentService.getBaseFeedPath(resource); String pathToTest = baseFeedPath + mapping[i].resource; - - //FIXME generalize this in the future. We should NOT have - //custom tests here. - - // we have to do special probing for Xanga - if ( blogService.equals( BlogService.XANGA ) ) { - pathToTest += getXangaUser(resource); - } - - if ( blogService.equals( BlogService.YAHOOGROUPS ) ) { - - pathToTest = BlogService.YAHOOGROUPS.getFeedResource( resource ); - - } - - //right now this is ONLY for Flickr - - if ( blogService.useCustomFeedResource() ) { - pathToTest = blogService.getFeedResource( resource ); - } - log.info( "pathToTest = " + pathToTest );

- if ( feedExists( pathToTest ) ) { + if ( !previousAttempts.contains( pathToTest ) + && feedExists( pathToTest ) ) { log.info("Feed exists"); FeedReference feedReference = new FeedReference( pathToTest, mapping[i].type ); - feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY; - + feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY; + previousAttempts.add( pathToTest ); onFeedReference( feedReference, list ); - - feedFound = true; - } - + // record this attempt so we don't repeat it again if // we are doing aggresive probing previousAttempts.add( pathToTest ); } } - - // if we have nothing so far, do aggresive probing - if ( AGGRESIVE_PROBING_ENABLED && feedFound == false ) { - - mapping = (FeedReference[])probeMapping.get( BlogService.UNKNOWN ); - - // try out each mapping - for (int i = 0; i < mapping.length; i++) {

- //NOTE this shares duplicate code with the above tests. - - String pathToTest = baseFeedPath + mapping[i].resource; - if ( previousAttempts.contains( pathToTest ) == false ) { - if ( feedExists( pathToTest ) ) { - - FeedReference feedReference = new FeedReference( pathToTest, - mapping[i].type); - - feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY; - - onFeedReference( feedReference, list ); - - } - } - } - } - log.info( "Using aggresive probing, found the following" ); log.info( "Blog service " + blogService ); } @@ -323,61 +155,6 @@ list.add( ref );

} - - /** This method takes a resource, such as "http//www.codinginparadise.org/myweblog.php", - * and gets the path necessary to build up a feed, such as - * "http//www.codinginparadise.org/". Basicly it appends a slash to the end if there - * is not one, and removes any file names that might be at the end, such as - * "myweblog.php". - * - * There is a special exception for some Blosxom blogs, - * which have things inside of a cgi-script and 'hang' their RSS files - * off of this cgi-bin. For example, - * http//www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file - * at http//www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so - * we must return the blosxom.cgi at the end as well for this method. - * - * @throws MalformedURLException Thrown if the given resource's URL is incorrectly - * formatted. - * - * @author Brad Neuberg, [EMAIL PROTECTED] - */ - protected static String getFeedPath( String resource ) - throws MalformedURLException { - - // strip off any query string or anchors - int end = resource.lastIndexOf( "#" ); - - if ( end != -1 ) - resource = resource.substring( 0, end ); - - end = resource.lastIndexOf( "?" ); - - if ( end != -1 ) - resource = resource.substring( 0, end ); - - if ( ! resource.endsWith( "blosxom.cgi" ) ) { - Matcher fileMatcher = patternToStrip.matcher(resource); - if (fileMatcher.find()) { - String stringToStrip = fileMatcher.group(1); - int startStrip = resource.indexOf(stringToStrip); - resource = resource.substring(0, startStrip); - } - } - - if ( ! resource.endsWith( "/" ) ) { - resource = resource + "/"; - } - - return resource; - } - - public static void main( String[] args ) throws Exception { - - log.info( getFeedPath( "http//foo.com/bar?cat=dog" ) ); - log.info( getFeedPath( "http//foo.com/bar?cat=dog#adf" ) ); - - }

     /** Does an HTTP HEAD to see if the given resource exists.
      *
@@ -400,15 +177,6 @@
         return response == 200;
     }

-    /** Xanga's feed locations are dependent on the 'user' attribute in a
-     *  Xanga URI.  This method helps extract the user element from an
-     *  existing URI, such as http//www.xanga.com/home.aspx?user=wdfphillz.
-     */
-    protected static String getXangaUser(String resource) {
-        Matcher xangaMatcher = xangaURLPattern.matcher(resource);
-        xangaMatcher.matches();
-
-        return xangaMatcher.group(1);
-    }
+

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

[feedparser] Patch to Feed Parser 'Locate' System

Reply via email to