[1/4] git commit: [WAGON-388] use HtmlFileListParser based on jsoup

olamy Wed, 27 Mar 2013 04:50:51 -0700

Updated Branches:
  refs/heads/master 98a21cb3f -> 05a76e1ec


[WAGON-388] use HtmlFileListParser based on jsoup


Project: http://git-wip-us.apache.org/repos/asf/maven-wagon/repo
Commit: http://git-wip-us.apache.org/repos/asf/maven-wagon/commit/ec65719a
Tree: http://git-wip-us.apache.org/repos/asf/maven-wagon/tree/ec65719a
Diff: http://git-wip-us.apache.org/repos/asf/maven-wagon/diff/ec65719a

Branch: refs/heads/master
Commit: ec65719a32d3662ee3fa57f81d778e677d2aa082
Parents: 98a21cb
Author: olivier lamy <[email protected]>
Authored: Wed Mar 27 22:43:37 2013 +1100
Committer: olivier lamy <[email protected]>
Committed: Wed Mar 27 22:43:37 2013 +1100

----------------------------------------------------------------------
 wagon-providers/wagon-http-shared/pom.xml          |   17 +--
 .../wagon/shared/http/HtmlFileListParser.java      |  176 +++++++--------
 2 files changed, 82 insertions(+), 111 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/maven-wagon/blob/ec65719a/wagon-providers/wagon-http-shared/pom.xml
----------------------------------------------------------------------
diff --git a/wagon-providers/wagon-http-shared/pom.xml 
b/wagon-providers/wagon-http-shared/pom.xml
index 4a4b4ed..556a3bc 100644
--- a/wagon-providers/wagon-http-shared/pom.xml
+++ b/wagon-providers/wagon-http-shared/pom.xml
@@ -35,20 +35,9 @@ under the License.
 
   <dependencies>
     <dependency>
-      <groupId>nekohtml</groupId>
-      <artifactId>xercesMinimal</artifactId>
-      <version>1.9.6.2</version>
-    </dependency>
-    <dependency>
-      <groupId>nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-      <version>1.9.6.2</version>
-      <exclusions>
-        <exclusion>
-          <groupId>xerces</groupId>
-          <artifactId>xercesImpl</artifactId>
-        </exclusion>
-      </exclusions>
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId>
+      <version>1.7.1</version>
     </dependency>
     <dependency>
       <groupId>commons-io</groupId>

http://git-wip-us.apache.org/repos/asf/maven-wagon/blob/ec65719a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
----------------------------------------------------------------------
diff --git 
a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
 
b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
index 749bd5d..7448a6d 100644
--- 
a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
+++ 
b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java
@@ -19,15 +19,13 @@ package org.apache.maven.wagon.shared.http;
  * under the License.
  */
 
+import org.apache.commons.io.IOUtils;
 import org.apache.maven.wagon.TransferFailedException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.parser.XMLInputSource;
-import org.apache.xerces.xni.parser.XMLParserConfiguration;
 import org.codehaus.plexus.util.StringUtils;
-import org.cyberneko.html.HTMLConfiguration;
-import org.cyberneko.html.filters.DefaultFilter;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -46,9 +44,25 @@ import java.util.regex.Pattern;
  */
 public class HtmlFileListParser
 {
+    // Apache Fancy Index Sort Headers
+    private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( 
"\\?[CDMNS]=.*" );
+
+    // URLs with excessive paths.
+    private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" 
);
+
+    // URLs that to a parent directory.
+    private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
+
+    // mailto urls
+    private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
+
+    private static final Pattern[] SKIPS =
+        new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, 
MAILTO_URLS };
+
     /**
      * Fetches a raw HTML from a provided InputStream, parses it, and returns 
the file list.
      *
+     * @param stream the input stream.
      * @return the file list.
      * @throws TransferFailedException if there was a problem fetching the raw 
html.
      */
@@ -57,126 +71,94 @@ public class HtmlFileListParser
     {
         try
         {
-            // Use URI object to get benefits of proper absolute and relative 
path resolution for free
             URI baseURI = new URI( baseurl );
+            // to make debugging easier, start with a string. This is assuming 
UTF-8, which might not be a safe
+            // assumption.
+            String content = IOUtils.toString( stream, "utf-8" );
+            Document doc = Jsoup.parse( content, baseurl );
+            Elements links = doc.select("a[href]");
+            Set<String> results = new HashSet<String>();
+            for ( int lx = 0; lx < links.size(); lx++ )
+            {
+                Element link = links.get( lx );
+                /*
+                 * The abs:href loses directories, so we deal with absolute 
paths ourselves below in cleanLink
+                 */
+                String target = link.attr( "href" );
+                if ( target != null )
+                {
+                    String clean = cleanLink( baseURI, target );
+                    if ( isAcceptableLink( clean ) )
+                    {
+                        results.add( clean );
+                    }
+                }
 
-            Parser handler = new Parser( baseURI );
-
-            XMLParserConfiguration parser = new HTMLConfiguration();
-            parser.setDocumentHandler( handler );
-            parser.setFeature( 
"http://cyberneko.org/html/features/augmentations", true );
-            parser.setProperty( 
"http://cyberneko.org/html/properties/names/elems", "upper" );
-            parser.setProperty( 
"http://cyberneko.org/html/properties/names/attrs", "upper" );
-            parser.parse( new XMLInputSource( null, baseurl, 
baseURI.toString(), stream, "UTF-8" ) );
-
-            return new ArrayList<String>( handler.getLinks() );
+            }
 
+            return new ArrayList<String>( results );
         }
         catch ( URISyntaxException e )
         {
-            throw new TransferFailedException( "Unable to parse as URI: " + 
baseurl, e );
+            throw new TransferFailedException( "Unable to parse as base URI: " 
+ baseurl, e );
         }
         catch ( IOException e )
         {
-            throw new TransferFailedException( "I/O error: " + e.getMessage(), 
e );
+            throw new TransferFailedException( "I/O error reading HTML listing 
of artifacts: " + e.getMessage(), e );
         }
     }
 
-    private static class Parser
-        extends DefaultFilter
+    private static String cleanLink( URI baseURI, String link )
     {
-        // Apache Fancy Index Sort Headers
-        private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( 
"\\?[CDMNS]=.*" );
-
-        // URLs with excessive paths.
-        private static final Pattern URLS_WITH_PATHS = Pattern.compile( 
"/[^/]*/" );
-
-        // URLs that to a parent directory.
-        private static final Pattern URLS_TO_PARENT = Pattern.compile( 
"\\.\\./" );
-
-        // mailto urls
-        private static final Pattern MAILTO_URLS = Pattern.compile( 
"mailto:.*" );
-
-        private static final Pattern[] SKIPS =
-            new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, 
MAILTO_URLS };
-
-        private Set<String> links = new HashSet<String>();
-
-        private URI baseURI;
-
-        public Parser( URI baseURI )
+        if ( StringUtils.isEmpty( link ) )
         {
-            this.baseURI = baseURI.normalize();
+            return "";
         }
 
-        public Set<String> getLinks()
-        {
-            return links;
-        }
+        String ret = link;
 
-        public void startElement( QName element, XMLAttributes attrs, 
Augmentations augs )
+        try
         {
-            if ( "A".equals( element.rawname ) )
+            URI linkuri = new URI( ret );
+            if ( link.startsWith( "/" ) )
             {
-                String href = attrs.getValue( "HREF" );
-                if ( href != null )
-                {
-                    String link = cleanLink( baseURI, href );
-                    if ( isAcceptableLink( link ) )
-                    {
-                        links.add( link );
-                    }
-                }
+                linkuri = baseURI.resolve( linkuri );
             }
-        }
-
-        private static String cleanLink( URI baseURI, String link )
-        {
-            if ( StringUtils.isEmpty( link ) )
+            URI relativeURI = baseURI.relativize( linkuri ).normalize();
+            ret = relativeURI.toASCIIString();
+            if ( ret.startsWith( baseURI.getPath() ) )
             {
-                return "";
+                ret = ret.substring( baseURI.getPath().length() );
             }
 
-            String ret = link;
-
-            try
-            {
-                URI linkuri = new URI( ret );
-                URI relativeURI = baseURI.relativize( linkuri ).normalize();
-                ret = relativeURI.toASCIIString();
-                if ( ret.startsWith( baseURI.getPath() ) )
-                {
-                    ret = ret.substring( baseURI.getPath().length() );
-                }
+            ret = URLDecoder.decode( ret, "UTF-8" );
+        }
+        catch ( URISyntaxException e )
+        {
+        }
+        catch ( UnsupportedEncodingException e )
+        {
+        }
 
-                ret = URLDecoder.decode( ret, "UTF-8" );
-            }
-            catch ( URISyntaxException e )
-            {
-            }
-            catch ( UnsupportedEncodingException e )
-            {
-            }
+        return ret;
+    }
 
-            return ret;
+    private static boolean isAcceptableLink( String link )
+    {
+        if ( StringUtils.isEmpty( link ) )
+        {
+            return false;
         }
 
-        private static boolean isAcceptableLink( String link )
+        for ( int i = 0; i < SKIPS.length; i++ )
         {
-            if ( StringUtils.isEmpty( link ) )
+            if ( SKIPS[i].matcher( link ).find() )
             {
                 return false;
             }
-
-            for ( int i = 0; i < SKIPS.length; i++ )
-            {
-                if ( SKIPS[i].matcher( link ).find() )
-                {
-                    return false;
-                }
-            }
-
-            return true;
         }
+
+        return true;
     }
-}
+
+}
\ No newline at end of file

[1/4] git commit: [WAGON-388] use HtmlFileListParser based on jsoup

Reply via email to