Revision: 16972 http://sourceforge.net/p/gate/code/16972 Author: markagreenwood Date: 2013-10-03 12:48:15 +0000 (Thu, 03 Oct 2013) Log Message: ----------- Added a check to see whether we are trying to load a gzip-encoded document; if so, we wrap the input stream in a GZIPInputStream. This fixes loading HTML documents from Wikipedia, but it possibly needs slightly more thought for a long-term solution.
Modified Paths: -------------- gate/trunk/src/gate/corpora/DocumentContentImpl.java gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java Modified: gate/trunk/src/gate/corpora/DocumentContentImpl.java =================================================================== --- gate/trunk/src/gate/corpora/DocumentContentImpl.java 2013-10-03 11:24:25 UTC (rev 16971) +++ gate/trunk/src/gate/corpora/DocumentContentImpl.java 2013-10-03 12:48:15 UTC (rev 16972) @@ -24,6 +24,8 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.net.URLConnection; +import java.util.zip.GZIPInputStream; import org.apache.commons.io.IOUtils; @@ -63,8 +65,15 @@ } try { - uStream = u.openStream(); + URLConnection conn = u.openConnection(); + uStream = conn.getInputStream(); + System.out.println(conn.getContentEncoding()); + + if ("gzip".equals(conn.getContentEncoding())) { + uStream = new GZIPInputStream(uStream); + } + if(encoding != null && !encoding.equalsIgnoreCase("")) { uReader = new BomStrippingInputStreamReader(uStream, encoding, INTERNAL_BUFFER_SIZE); } else { Modified: gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java =================================================================== --- gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java 2013-10-03 11:24:25 UTC (rev 16971) +++ gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java 2013-10-03 12:48:15 UTC (rev 16972) @@ -29,12 +29,15 @@ import gate.util.Out; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; +import java.net.URLConnection; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; import org.apache.xerces.xni.parser.XMLInputSource; import org.cyberneko.html.HTMLConfiguration; @@ -166,8 +169,16 @@ // textual document - load with user specified encoding String docEncoding = ((TextualDocument)doc).getEncoding(); // 
XML, so no BOM stripping. + + URLConnection conn = doc.getSourceUrl().openConnection(); + InputStream uStream = conn.getInputStream(); + + if ("gzip".equals(conn.getContentEncoding())) { + uStream = new GZIPInputStream(uStream); + } + Reader docReader = - new InputStreamReader(doc.getSourceUrl().openStream(), + new InputStreamReader(uStream, docEncoding); is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ October Webinars: Code for Performance Free Intel webinars can help you accelerate application performance. Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from the latest Intel processors and coprocessors. See abstracts and register > http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs