The following patch updates the org.apache.commons.feedparser.locate package to correctly use the new 'blogservice' subpackage.
Index: src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java
===================================================================
RCS file: /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java,v
retrieving revision 1.4
diff -u -B -r1.4 BlogServiceDiscovery.java
--- src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java 30 Sep 2004 19:27:38 -0000 1.4
+++ src/java/org/apache/commons/feedparser/locate/BlogServiceDiscovery.java 21 Oct 2004 01:14:57 -0000
@@ -16,310 +16,34 @@
package org.apache.commons.feedparser.locate;
-import java.util.regex.*; +import org.apache.commons.feedparser.*; +import org.apache.commons.feedparser.locate.blogservice.*;
/**
- *
* Determines what blog provider a given URI is using,
* such as whether it is hosted on Blogspot, Radio Userland, etc.
*
* @author <a href="[EMAIL PROTECTED]">Brad Neuberg</a>
*/
public class BlogServiceDiscovery {
- /** Locates all the generator meta tags
- * (i.e. <meta content="generator" content="someGenerator"/>)
- */
- private static Pattern metaTagsPattern =
- Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*",
- Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-
- /** A pattern used to discover PMachine blogs. */
- private static Pattern pmachinePattern =
- Pattern.compile("pmachine", Pattern.CASE_INSENSITIVE);
- /** A pattern used to discover Blosxom blogs. */ - private static Pattern blosxomPattern = - Pattern.compile("alt=[\"' ]powered by blosxom[\"' ]", - Pattern.CASE_INSENSITIVE); - - /** Uses the given resource and content to determine what blog provider - * a given URI is using. - */ - public static BlogService discover( String resource, String content ) { - return discoverService( resource, content ); - } - - /** Uses the given resource to determine what blog provider - * a given URI is using; useful if you have no content but still want - * to do a best guess of what blog service is being used. - */ - public static BlogService discover( String resource ) { - return discoverService( resource, null ); + public static BlogService discover( String resource ) + throws FeedParserException { + return discover(resource, null); }
- // NOTE ALL of this code should change to a visitor pattern. Bad design.
-
- protected static BlogService discoverService( String resource,
- String content ) {
+ public static BlogService discover( String resource,
+ String content )
+ throws FeedParserException {
resource = resource.toLowerCase();
-
- // check the weblogs in order of their popularity
- if (isBlogger(resource, content)) {
- return BlogService.BLOGGER;
- }
- else if (isLiveJournal(resource, content)) {
- return BlogService.LIVEJOURNAL;
- }
- else if (isDiaryLand(resource, content)) {
- return BlogService.DIARYLAND;
- }
- else if (isMovableType(resource, content)) {
- return BlogService.MOVABLE_TYPE;
- }
- else if (isXanga(resource, content)) {
- return BlogService.XANGA;
- }
- else if (isWordPress(resource, content)) {
- return BlogService.WORDPRESS;
- }
- else if (isAOLJournals(resource, content)) {
- return BlogService.AOL_JOURNAL;
- }
- else if (isTypePad(resource, content)) {
- return BlogService.TYPEPAD;
- }
- else if (isPMachine(resource, content)) {
- return BlogService.PMACHINE;
- }
- /* FIXME No way to detect Expression Engine weblogs right now
- else if (isExpressionEngine(resource, content)) {
- return BlogService.EXPRESSION_ENGINE;
- }*/
- else if (isGreyMatter(resource, content)) {
- return BlogService.GREYMATTER;
- }
- /* FIXME We can't detect iBlog sites
- else if (isIBlog(resource, content)) {
- return BlogService.IBLOG;
- }*/
- else if (isBlosxom(resource, content)) {
- return BlogService.BLOSXOM;
- }
- /* FIXME We can't detect Manila sites.
- else if (isManila(resource, content)) {
- return BlogService.MANILA;
- }*/
- else if ( isRadioUserland( resource, content ) ) {
- return BlogService.RADIO_USERLAND;
- }
- else if ( isTextPattern( resource, content ) ) {
- return BlogService.TEXTPATTERN;
- }
- else if ( isTextAmerica( resource, content ) ) {
- return BlogService.TEXTAMERICA;
- }
- else if ( isYahooGroups( resource, content ) ) {
- return BlogService.YAHOOGROUPS;
- }
- else if ( BlogService.FLICKR.accept( resource, content ) ) {
- return BlogService.FLICKR;
- }
- else {
- return BlogService.UNKNOWN;
- }
- }
-
- // **** vendor specific CMS detection code **********************************
-
- protected static boolean isBlogger( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "blogspot.com");
-
- if (results == false) {
- results = hasGenerator(content, "blogger");
- }
-
- return results;
- }
-
- protected static boolean isGreyMatter( String resource, String content ) {
- boolean results = false;
-
- results = hasGenerator(content, "greymatter");
-
- return results;
- }
-
- /*protected static boolean isExpressionEngine( String resource,
- String content ) {
- boolean results = false;
-
- return results;
- } */
-
- protected static boolean isMovableType( String resource, String content ) {
- boolean results = false;
-
- results = hasGenerator(content, "movabletype");
-
- return results;
- }
-
- protected static boolean isAOLJournals( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "journals.aol.com");
-
- return results;
- }
-
- protected static boolean isDiaryLand( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "diaryland.com");
-
- return results;
- }
-
- protected static boolean isPMachine( String resource, String content ) {
- boolean results = false;
-
- Matcher pmachineMatcher = pmachinePattern.matcher(resource);
-
- results = pmachineMatcher.find();
-
- return results;
- }
-
- protected static boolean isTextPattern( String resource, String content ) {
- boolean results = false;
-
- results = hasGenerator(content, "textpattern");
-
- return results;
- }
-
- /* FIXME We can't detect Manila sites.
- protected static boolean isManila( String resource, String content ) {
- boolean results = false;
-
- return results;
- }
- */
-
- protected static boolean isTypePad( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "typepad.com");
-
- if (results == false) {
- results = hasGenerator(content, "typepad");
- }
-
- return results;
- }
-
- protected static boolean isRadioUserland( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "radio.userland.com");
-
- if (results == false) {
- results = containsDomain(resource, "radio.weblogs.com");
- }
-
- return results;
- }
-
- protected static boolean isLiveJournal( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "livejournal.com");
-
- return results;
- }
-
- protected static boolean isWordPress( String resource, String content ) {
- boolean results = false;
+ BlogService[] blogServices = BlogService.getBlogServices();
- results = hasGenerator(content, "wordpress");
-
- return results;
- }
-
- /* FIXME We can't detect iBlog sites.
- protected static boolean isIBlog( String resource, String content ) {
- boolean results = false;
-
- return results;
- }*/
-
- protected static boolean isXanga( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "xanga.com");
-
- return results;
- }
-
- protected static boolean isBlosxom( String resource, String content ) {
- boolean results = false;
-
- // This is the only kind of blog that we need to check for a
- // 'Powered by Blosxom'. We do this with the alt= value on the
- // Powered By image.
- // FIXME This might be fragile, but it is used across all of the
- // Blosxom blogs I have looked at so far. Brad Neuberg, [EMAIL PROTECTED]
-
- Matcher blosxomMatcher = blosxomPattern.matcher(content);
- results = blosxomMatcher.find();
-
- return results;
- }
-
- protected static boolean isTextAmerica( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain(resource, "textamerica.com");
-
- return results;
- }
-
- protected static boolean isYahooGroups( String resource, String content ) {
- boolean results = false;
-
- results = containsDomain( resource, "groups.yahoo.com" );
-
- return results;
- }
-
- // **** util code ***********************************************************
-
- /** Determines if the given resource contains the given domain name
- * fragment.
- */
- protected static boolean containsDomain(String resource, String domain) {
- return (resource.indexOf(domain) != -1);
- }
-
- /** Determines if the given content was generated by the given generator
- * (i.e. this document contains a meta tag with name="generator" and
- * content equal to the generatorType).
- */
- protected static boolean hasGenerator(String content, String generatorType) {
- if (content == null) {
- return false;
+ for (int i = 0; i < blogServices.length; i++) {
+ if (blogServices[i].isThisService(resource, content)) {
+ return blogServices[i];
+ }
}
- Matcher metaTagsMatcher = metaTagsPattern.matcher(content);
- if (metaTagsMatcher.find()) {
- String metaTag = metaTagsMatcher.group(0).toLowerCase();
- generatorType = generatorType.toLowerCase();
- return (metaTag.indexOf(generatorType) != -1);
- }
- else {
- return false;
- }
+ return new Unknown();
}
}
Index: src/java/org/apache/commons/feedparser/locate/FeedLocator.java
===================================================================
RCS file: /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
retrieving revision 1.23
diff -u -B -r1.23 FeedLocator.java
--- src/java/org/apache/commons/feedparser/locate/FeedLocator.java 4 Oct 2004 00:59:47 -0000 1.23
+++ src/java/org/apache/commons/feedparser/locate/FeedLocator.java 21 Oct 2004 01:14:57 -0000
@@ -141,17 +141,17 @@
Iterator it = l.iterator();
if ( it.hasNext() == false ) { - System.out.println( "NO LINKS FOUND" ); + log.info( "NO LINKS FOUND" ); }
- System.out.println( " FIXME (debug) AD RSS " + l.getAdRSSFeed() ); - System.out.println( " FIXME (debug) AD Atom " + l.getAdAtomFeed() ); + log.info( " FIXME (debug) AD RSS " + l.getAdRSSFeed() ); + log.info( " FIXME (debug) AD Atom " + l.getAdAtomFeed() );
while ( it.hasNext() ) {
FeedReference ref = (FeedReference)it.next();
- System.out.println( ref.resource ); + log.info( ref.resource );
}
Index: src/java/org/apache/commons/feedparser/locate/ProbeLocator.java
===================================================================
RCS file: /home/cvspublic/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v
retrieving revision 1.15
diff -u -B -r1.15 ProbeLocator.java
--- src/java/org/apache/commons/feedparser/locate/ProbeLocator.java 30 Sep 2004 19:27:38 -0000 1.15
+++ src/java/org/apache/commons/feedparser/locate/ProbeLocator.java 21 Oct 2004 01:14:57 -0000
@@ -17,12 +17,11 @@
package org.apache.commons.feedparser.locate;
import org.apache.commons.feedparser.*; +import org.apache.commons.feedparser.locate.blogservice.*;
import org.peerfear.newsmonster.network.*;
import java.util.*; -import java.util.regex.*; -import java.net.*;
import org.apache.log4j.Logger;
@@ -65,124 +64,7 @@
* for client-side aggregators would be too great.
*/
public static boolean BLOG_SERVICE_PROBING_ENABLED = false;
-
- /**
- * A regex to find any trailing filename and strip it
- */
- private static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)");
-
- /**
- * A regex to extract the user from a Xanga URL
- */
- private static Pattern xangaURLPattern = Pattern.compile(".*user=(\\w*)");
-
- /**
- * Internal map to store probe URIs and their services.
- */
- private static Map probeMapping = new HashMap();
-
- static {
- /** Associates a given BlogService with a list of usual locations to find
- * their RSS file. The locations are given as an array of FeedReferences,
- * with highest quality feeds put first. These blog providers don't
- * provide consistent autodiscovery.
- */
- FeedReference blosxomLocations[] =
- // there is sometimes an index.rss20 file, but Blosxom has a bug where
- // it incorrectly responds to HTTP HEAD requests for that file,
- // saying that it exists when it doesn't. Most sites don't seem
- // to have this file so we don't include it here.
- // Brad Neuberg, [EMAIL PROTECTED]
- { new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) };
-
- // Diaryland doesn't offer feeds
- //FeedReference diaryLandLocations[] = { "" };
- FeedReference bloggerLocations[] =
- { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE) };
-
- FeedReference aolJournalLocations[] =
- { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference pmachineLocations[] =
- { new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference textPatternLocations[] =
- { new FeedReference("?atom=1", FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("?rss=1", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference manilaLocations[] =
- { new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference typepadLocations[] =
- { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference radioUserlandLocations[] =
- { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference liveJournalLocations[] =
- { new FeedReference("data/atom", FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("data/rss", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference wordPressLocations[] =
- { new FeedReference("wp-atom.php", FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("wp-rss2.php", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("wp-rss.php", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference iBlogLocations[] =
- { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- // Xanga feeds have to be handled specially since they put their
- // feeds at the location http://www.xanga.com/rss.aspx?user=username
- FeedReference xangaLocations[] =
- { new FeedReference("rss.aspx?user=", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference textAmericaLocations[] =
- { new FeedReference("rss.aspx", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference unknownLocations[] =
- { new FeedReference("atom.xml",FeedReference.ATOM_MEDIA_TYPE),
- new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE),
- new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE) };
-
- FeedReference yahooGroupsLocations[] =
- { new FeedReference( "", FeedReference.RSS_MEDIA_TYPE) };
-
- probeMapping.put( BlogService.BLOSXOM, blosxomLocations );
-
- //Tue Aug 31 2004 0421 PM ([EMAIL PROTECTED]) Diaryland doesn't
- //currently offer RSS or Atom feeds (shame, shame, shame). This is a
- //placeholder until they see the light and provide Atom feeds.
-
- //probeMapping.put( BlogService.DIARYLAND, diaryLandLocations );
- probeMapping.put( BlogService.BLOGGER, bloggerLocations );
- probeMapping.put( BlogService.AOL_JOURNAL, aolJournalLocations );
- probeMapping.put( BlogService.PMACHINE, pmachineLocations );
- probeMapping.put( BlogService.TEXTPATTERN, textPatternLocations );
- probeMapping.put( BlogService.MANILA, manilaLocations );
- probeMapping.put( BlogService.TYPEPAD, typepadLocations );
- probeMapping.put( BlogService.RADIO_USERLAND, radioUserlandLocations );
- probeMapping.put( BlogService.LIVEJOURNAL, liveJournalLocations );
- probeMapping.put( BlogService.WORDPRESS, wordPressLocations );
- probeMapping.put( BlogService.IBLOG, iBlogLocations );
- probeMapping.put( BlogService.XANGA, xangaLocations);
- probeMapping.put( BlogService.YAHOOGROUPS, yahooGroupsLocations);
-
- //probeMapping.put( BlogService.YAHOOGROUPS, yahooGroupsLocations);
-
- probeMapping.put( BlogService.FLICKR, BlogService.FLICKR.getLocations() );
-
- probeMapping.put( BlogService.UNKNOWN, unknownLocations );
-
- probeMapping.put( BlogService.TEXTAMERICA, textAmericaLocations );
-
- }
/** * @@ -197,99 +79,49 @@ // fail-fast if we already have some results and if we determine that // we can trust the results (TextAmerica has invalid autodiscovery, // for example) - if ( list.size() > 0 && blogService.hasValidAutodiscovery() ) + if ( list.size() > 0 && blogService.hasValidAutoDiscovery() ) return list; - else if ( blogService.hasValidAutodiscovery() == false ) { + else if ( blogService.hasValidAutoDiscovery() == false ) { // clear out the list so far since we can't trust the results list.clear(); }
if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) {
-
- String baseFeedPath = getFeedPath( resource );
-
- FeedReference mapping[] = null;
-
- HashSet previousAttempts = new HashSet();
-
- boolean feedFound = false;
-
- if ( probeMapping.containsKey( blogService ) ) {
-
- mapping = (FeedReference[])probeMapping.get( blogService );
+ List servicesToTry = new ArrayList();
+ servicesToTry.add(blogService);
+ // only try the Unknown service if we want aggressive probing
+ if (AGGRESIVE_PROBING_ENABLED)
+ servicesToTry.add(new Unknown());
+ Iterator iter = servicesToTry.iterator();
+ Set previousAttempts = new HashSet();
+
+ while (iter.hasNext() && list.size() == 0) {
+ BlogService currentService = (BlogService)iter.next();
+ FeedReference[] mapping = currentService.getFeedLocations(resource, content);
log.info( "mapping = " + mapping );
- log.info( "baseFeedPath = " + baseFeedPath );
-
+
// try out each mapping
for (int i = 0; i < mapping.length; i++) {
+ String baseFeedPath = currentService.getBaseFeedPath(resource);
String pathToTest = baseFeedPath + mapping[i].resource;
-
- //FIXME generalize this in the future. We should NOT have
- //custom tests here.
-
- // we have to do special probing for Xanga
- if ( blogService.equals( BlogService.XANGA ) ) {
- pathToTest += getXangaUser(resource);
- }
-
- if ( blogService.equals( BlogService.YAHOOGROUPS ) ) {
-
- pathToTest = BlogService.YAHOOGROUPS.getFeedResource( resource );
-
- }
-
- //right now this is ONLY for Flickr
-
- if ( blogService.useCustomFeedResource() ) {
- pathToTest = blogService.getFeedResource( resource );
- }
-
log.info( "pathToTest = " + pathToTest );
- if ( feedExists( pathToTest ) ) {
+ if ( !previousAttempts.contains( pathToTest )
+ && feedExists( pathToTest ) ) {
log.info("Feed exists");
FeedReference feedReference = new FeedReference( pathToTest,
mapping[i].type );
- feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
-
+ feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
+ previousAttempts.add( pathToTest );
onFeedReference( feedReference, list );
-
- feedFound = true;
-
}
-
+
// record this attempt so we don't repeat it again if
// we are doing aggresive probing
previousAttempts.add( pathToTest );
}
}
-
- // if we have nothing so far, do aggresive probing
- if ( AGGRESIVE_PROBING_ENABLED && feedFound == false ) {
-
- mapping = (FeedReference[])probeMapping.get( BlogService.UNKNOWN );
-
- // try out each mapping
- for (int i = 0; i < mapping.length; i++) {
- //NOTE this shares duplicate code with the above tests.
-
- String pathToTest = baseFeedPath + mapping[i].resource;
- if ( previousAttempts.contains( pathToTest ) == false ) {
- if ( feedExists( pathToTest ) ) {
-
- FeedReference feedReference = new FeedReference( pathToTest,
- mapping[i].type);
-
- feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
-
- onFeedReference( feedReference, list );
-
- }
- }
- }
- }
-
log.info( "Using aggresive probing, found the following" );
log.info( "Blog service " + blogService );
}
@@ -323,61 +155,6 @@
list.add( ref );
}
-
- /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
- * and gets the path necessary to build up a feed, such as
- * "http://www.codinginparadise.org/". Basicly it appends a slash to the end if there
- * is not one, and removes any file names that might be at the end, such as
- * "myweblog.php".
- *
- * There is a special exception for some Blosxom blogs,
- * which have things inside of a cgi-script and 'hang' their RSS files
- * off of this cgi-bin. For example,
- * http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file
- * at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so
- * we must return the blosxom.cgi at the end as well for this method.
- *
- * @throws MalformedURLException Thrown if the given resource's URL is incorrectly
- * formatted.
- *
- * @author Brad Neuberg, [EMAIL PROTECTED]
- */
- protected static String getFeedPath( String resource )
- throws MalformedURLException {
-
- // strip off any query string or anchors
- int end = resource.lastIndexOf( "#" );
-
- if ( end != -1 )
- resource = resource.substring( 0, end );
-
- end = resource.lastIndexOf( "?" );
-
- if ( end != -1 )
- resource = resource.substring( 0, end );
-
- if ( ! resource.endsWith( "blosxom.cgi" ) ) {
- Matcher fileMatcher = patternToStrip.matcher(resource);
- if (fileMatcher.find()) {
- String stringToStrip = fileMatcher.group(1);
- int startStrip = resource.indexOf(stringToStrip);
- resource = resource.substring(0, startStrip);
- }
- }
-
- if ( ! resource.endsWith( "/" ) ) {
- resource = resource + "/";
- }
-
- return resource;
- }
-
- public static void main( String[] args ) throws Exception {
-
- log.info( getFeedPath( "http//foo.com/bar?cat=dog" ) );
- log.info( getFeedPath( "http//foo.com/bar?cat=dog#adf" ) );
-
- }
/** Does an HTTP HEAD to see if the given resource exists. * @@ -400,15 +177,6 @@ return response == 200; }
- /** Xanga's feed locations are dependent on the 'user' attribute in a - * Xanga URI. This method helps extract the user element from an - * existing URI, such as http//www.xanga.com/home.aspx?user=wdfphillz. - */ - protected static String getXangaUser(String resource) { - Matcher xangaMatcher = xangaURLPattern.matcher(resource); - xangaMatcher.matches(); - - return xangaMatcher.group(1); - } +
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]