Author: kwright
Date: Fri Feb 12 19:13:38 2021
New Revision: 1886472
URL: http://svn.apache.org/viewvc?rev=1886472&view=rev
Log:
Fix for CONNECTORS-1656. Thanks Julien for the patch.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1886472&r1=1886471&r2=1886472&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Feb 12 19:13:38 2021
@@ -3,6 +3,9 @@ $Id$
======================= 2.19-dev =====================
+CONNECTORS-1656: Ensure legit XML is produced for Tika by the html extractor.
+(Julien Massiera)
+
CONNECTORS-1661: Encoding for multipart requests is sometimes not set by the new
UI, so assume UTF-8 if that happens.
(Julien Massiera)
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1886472&r1=1886471&r2=1886472&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Fri Feb 12 19:13:38 2021
@@ -33,6 +33,7 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Whitelist;
public class JsoupProcessing {
@@ -42,6 +43,7 @@ public class JsoupProcessing {
public static Hashtable<String,String>
extractTextAndMetadataHtmlDocument(InputStream streamDoc,String
whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+ doc.outputSettings().escapeMode(EscapeMode.xhtml);
Hashtable<String,String> metadata = new Hashtable<String,String>();
for(Element meta : doc.select("meta")) {
Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " +
meta.attr("content"));