Author: kwright
Date: Fri Feb 12 19:13:38 2021
New Revision: 1886472

URL: http://svn.apache.org/viewvc?rev=1886472&view=rev
Log:
Fix for CONNECTORS-1656.  Thanks Julien for the patch.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1886472&r1=1886471&r2=1886472&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Feb 12 19:13:38 2021
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.19-dev =====================
 
+CONNECTORS-1656: Ensure legit XML is produced for Tika by the html extractor.
+(Julien Massiera)
+
 CONNECTORS-1661: Encoding for multipart requests is sometimes not set by the new
 UI, so assume UTF-8 if that happens.
 (Julien Massiera)

Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1886472&r1=1886471&r2=1886472&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Feb 12 19:13:38 2021
@@ -33,6 +33,7 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.nodes.Entities.EscapeMode;
 import org.jsoup.safety.Whitelist;
 
 public class JsoupProcessing {
@@ -42,6 +43,7 @@ public class JsoupProcessing {
 
   public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
     Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+    doc.outputSettings().escapeMode(EscapeMode.xhtml);
     Hashtable<String,String> metadata = new Hashtable<String,String>();
     for(Element meta : doc.select("meta")) {
       Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));


Reply via email to