Updated Branches: refs/heads/master 98a21cb3f -> 05a76e1ec
[WAGON-388] use HtmlFileListParser based on jsoup Project: http://git-wip-us.apache.org/repos/asf/maven-wagon/repo Commit: http://git-wip-us.apache.org/repos/asf/maven-wagon/commit/ec65719a Tree: http://git-wip-us.apache.org/repos/asf/maven-wagon/tree/ec65719a Diff: http://git-wip-us.apache.org/repos/asf/maven-wagon/diff/ec65719a Branch: refs/heads/master Commit: ec65719a32d3662ee3fa57f81d778e677d2aa082 Parents: 98a21cb Author: olivier lamy <[email protected]> Authored: Wed Mar 27 22:43:37 2013 +1100 Committer: olivier lamy <[email protected]> Committed: Wed Mar 27 22:43:37 2013 +1100 ---------------------------------------------------------------------- wagon-providers/wagon-http-shared/pom.xml | 17 +-- .../wagon/shared/http/HtmlFileListParser.java | 176 +++++++-------- 2 files changed, 82 insertions(+), 111 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/maven-wagon/blob/ec65719a/wagon-providers/wagon-http-shared/pom.xml ---------------------------------------------------------------------- diff --git a/wagon-providers/wagon-http-shared/pom.xml b/wagon-providers/wagon-http-shared/pom.xml index 4a4b4ed..556a3bc 100644 --- a/wagon-providers/wagon-http-shared/pom.xml +++ b/wagon-providers/wagon-http-shared/pom.xml @@ -35,20 +35,9 @@ under the License. <dependencies> <dependency> - <groupId>nekohtml</groupId> - <artifactId>xercesMinimal</artifactId> - <version>1.9.6.2</version> - </dependency> - <dependency> - <groupId>nekohtml</groupId> - <artifactId>nekohtml</artifactId> - <version>1.9.6.2</version> - <exclusions> - <exclusion> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - </exclusion> - </exclusions> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.7.1</version> </dependency> <dependency> <groupId>commons-io</groupId> http://git-wip-us.apache.org/repos/asf/maven-wagon/blob/ec65719a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java ---------------------------------------------------------------------- diff --git a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java index 749bd5d..7448a6d 100644 --- a/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java +++ b/wagon-providers/wagon-http-shared/src/main/java/org/apache/maven/wagon/shared/http/HtmlFileListParser.java @@ -19,15 +19,13 @@ package org.apache.maven.wagon.shared.http; * under the License. */ +import org.apache.commons.io.IOUtils; import org.apache.maven.wagon.TransferFailedException; -import org.apache.xerces.xni.Augmentations; -import org.apache.xerces.xni.QName; -import org.apache.xerces.xni.XMLAttributes; -import org.apache.xerces.xni.parser.XMLInputSource; -import org.apache.xerces.xni.parser.XMLParserConfiguration; import org.codehaus.plexus.util.StringUtils; -import org.cyberneko.html.HTMLConfiguration; -import org.cyberneko.html.filters.DefaultFilter; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; import java.io.IOException; import java.io.InputStream; @@ -46,9 +44,25 @@ import java.util.regex.Pattern; */ public class HtmlFileListParser { + // Apache Fancy Index Sort Headers + private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); + + // URLs with excessive paths. + private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); + + // URLs that to a parent directory. + private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); + + // mailto urls + private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); + + private static final Pattern[] SKIPS = + new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; + /** * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list. * + * @param stream the input stream. * @return the file list. * @throws TransferFailedException if there was a problem fetching the raw html. */ @@ -57,126 +71,94 @@ public class HtmlFileListParser { try { - // Use URI object to get benefits of proper absolute and relative path resolution for free URI baseURI = new URI( baseurl ); + // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe + // assumption. + String content = IOUtils.toString( stream, "utf-8" ); + Document doc = Jsoup.parse( content, baseurl ); + Elements links = doc.select("a[href]"); + Set<String> results = new HashSet<String>(); + for ( int lx = 0; lx < links.size(); lx++ ) + { + Element link = links.get( lx ); + /* + * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink + */ + String target = link.attr( "href" ); + if ( target != null ) + { + String clean = cleanLink( baseURI, target ); + if ( isAcceptableLink( clean ) ) + { + results.add( clean ); + } + } - Parser handler = new Parser( baseURI ); - - XMLParserConfiguration parser = new HTMLConfiguration(); - parser.setDocumentHandler( handler ); - parser.setFeature( "http://cyberneko.org/html/features/augmentations", true ); - parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" ); - parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" ); - parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) ); - - return new ArrayList<String>( handler.getLinks() ); + } + return new ArrayList<String>( results ); } catch ( URISyntaxException e ) { - throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e ); + throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e ); } catch ( IOException e ) { - throw new TransferFailedException( "I/O error: " + e.getMessage(), e ); + throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e ); } } - private static class Parser - extends DefaultFilter + private static String cleanLink( URI baseURI, String link ) { - // Apache Fancy Index Sort Headers - private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); - - // URLs with excessive paths. - private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); - - // URLs that to a parent directory. - private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); - - // mailto urls - private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); - - private static final Pattern[] SKIPS = - new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; - - private Set<String> links = new HashSet<String>(); - - private URI baseURI; - - public Parser( URI baseURI ) + if ( StringUtils.isEmpty( link ) ) { - this.baseURI = baseURI.normalize(); + return ""; } - public Set<String> getLinks() - { - return links; - } + String ret = link; - public void startElement( QName element, XMLAttributes attrs, Augmentations augs ) + try { - if ( "A".equals( element.rawname ) ) + URI linkuri = new URI( ret ); + if ( link.startsWith( "/" ) ) { - String href = attrs.getValue( "HREF" ); - if ( href != null ) - { - String link = cleanLink( baseURI, href ); - if ( isAcceptableLink( link ) ) - { - links.add( link ); - } - } + linkuri = baseURI.resolve( linkuri ); } - } - - private static String cleanLink( URI baseURI, String link ) - { - if ( StringUtils.isEmpty( link ) ) + URI relativeURI = baseURI.relativize( linkuri ).normalize(); + ret = relativeURI.toASCIIString(); + if ( ret.startsWith( baseURI.getPath() ) ) { - return ""; + ret = ret.substring( baseURI.getPath().length() ); } - String ret = link; - - try - { - URI linkuri = new URI( ret ); - URI relativeURI = baseURI.relativize( linkuri ).normalize(); - ret = relativeURI.toASCIIString(); - if ( ret.startsWith( baseURI.getPath() ) ) - { - ret = ret.substring( baseURI.getPath().length() ); - } + ret = URLDecoder.decode( ret, "UTF-8" ); + } + catch ( URISyntaxException e ) + { + } + catch ( UnsupportedEncodingException e ) + { + } - ret = URLDecoder.decode( ret, "UTF-8" ); - } - catch ( URISyntaxException e ) - { - } - catch ( UnsupportedEncodingException e ) - { - } + return ret; + } - return ret; + private static boolean isAcceptableLink( String link ) + { + if ( StringUtils.isEmpty( link ) ) + { + return false; } - private static boolean isAcceptableLink( String link ) + for ( int i = 0; i < SKIPS.length; i++ ) { - if ( StringUtils.isEmpty( link ) ) + if ( SKIPS[i].matcher( link ).find() ) { return false; } - - for ( int i = 0; i < SKIPS.length; i++ ) - { - if ( SKIPS[i].matcher( link ).find() ) - { - return false; - } - } - - return true; } + + return true; } -} + +} \ No newline at end of file
