corpora

markagreenwood Thu, 03 Oct 2013 05:49:13 -0700

Revision: 16972
          http://sourceforge.net/p/gate/code/16972
Author:   markagreenwood
Date:     2013-10-03 12:48:15 +0000 (Thu, 03 Oct 2013)
Log Message:
-----------
Added a check to see if we are trying to load a gzip-encoded document and, if 
so, we wrap the input stream in a gzip input stream -- this fixes loading HTML 
documents from Wikipedia, but it possibly needs slightly more thought for a 
long-term solution.


Modified Paths:
--------------
    gate/trunk/src/gate/corpora/DocumentContentImpl.java
    gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java

Modified: gate/trunk/src/gate/corpora/DocumentContentImpl.java
===================================================================
--- gate/trunk/src/gate/corpora/DocumentContentImpl.java        2013-10-03 
11:24:25 UTC (rev 16971)
+++ gate/trunk/src/gate/corpora/DocumentContentImpl.java        2013-10-03 
12:48:15 UTC (rev 16972)
@@ -24,6 +24,8 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.net.URLConnection;
+import java.util.zip.GZIPInputStream;
 
 import org.apache.commons.io.IOUtils;
 
@@ -63,8 +65,15 @@
     }
 
     try {
-      uStream = u.openStream();
+      URLConnection conn = u.openConnection();
+      uStream = conn.getInputStream();
       
+      System.out.println(conn.getContentEncoding());
+      
+      if ("gzip".equals(conn.getContentEncoding())) {
+        uStream = new GZIPInputStream(uStream);
+      }
+      
       if(encoding != null && !encoding.equalsIgnoreCase("")) {
         uReader = new BomStrippingInputStreamReader(uStream, encoding, 
INTERNAL_BUFFER_SIZE);
       } else {

Modified: gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java
===================================================================
--- gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java     2013-10-03 
11:24:25 UTC (rev 16971)
+++ gate/trunk/src/gate/corpora/NekoHtmlDocumentFormat.java     2013-10-03 
12:48:15 UTC (rev 16972)
@@ -29,12 +29,15 @@
 import gate.util.Out;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.net.URLConnection;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
 
 import org.apache.xerces.xni.parser.XMLInputSource;
 import org.cyberneko.html.HTMLConfiguration;
@@ -166,8 +169,16 @@
         // textual document - load with user specified encoding
         String docEncoding = ((TextualDocument)doc).getEncoding();
         // XML, so no BOM stripping.
+        
+        URLConnection conn = doc.getSourceUrl().openConnection();
+        InputStream uStream = conn.getInputStream();
+                
+        if ("gzip".equals(conn.getContentEncoding())) {
+          uStream = new GZIPInputStream(uStream);
+        }
+        
         Reader docReader =
-                new InputStreamReader(doc.getSourceUrl().openStream(),
+                new InputStreamReader(uStream,
                         docEncoding);
         is =
                 new XMLInputSource(null, doc.getSourceUrl().toString(), doc

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[16972] gate/trunk/src/gate/corpora

Reply via email to