This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 0f5c16e  Fix for TIKA-2955 filter out invalid HTML characters 0x7F to 0x9F (#285)
0f5c16e is described below

commit 0f5c16e57f85cc98ced6b52e52b1e84d53b66ae6
Author: Luke Butters <[email protected]>
AuthorDate: Thu Oct 10 13:56:26 2019 +1100

    Fix for TIKA-2955 filter out invalid HTML characters 0x7F to 0x9F (#285)
---
 .../org/apache/tika/sax/XHTMLContentHandler.java   | 10 +++++
 .../apache/tika/sax/XHTMLContentHandlerTest.java   | 43 ++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index a200820..c568240 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -331,5 +331,15 @@ public class XHTMLContentHandler extends SafeContentHandler {
             endElement(name);
         }
     }
+    
+    /** Also rejects code points 0x7F through 0x9F, on top of whatever the superclass rejects. */
+    @Override
+    protected boolean isInvalid(int ch) {
+        if (super.isInvalid(ch)) {
+            return true;
+        }
+        // These control characters are invalid in XHTML.
+        return ch >= 0x7F && ch <= 0x9F;
+    }
 
 }
diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
index 6492b7c..e2ae019 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
@@ -157,6 +157,49 @@ public class XHTMLContentHandlerTest {
 
         assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
     }
+    
+    
+    @Test
+    public void testInvalidControlCharacter0x7F() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u007Fz");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+
+        String[] words = getRealWords(output.toString());
+        // Expect 0x7F to come out as U+FFFD; check the count before indexing words[0].
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
+    
+    @Test
+    public void testInvalidControlCharacter0x9F() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u009Fz");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+
+        String[] words = getRealWords(output.toString());
+        // Expect 0x9F to come out as U+FFFD; check the count before indexing words[0].
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
+    
+    @Test
+    public void testInvalidControlCharacter0x93() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u0093z");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+
+        String[] words = getRealWords(output.toString());
+        // Expect 0x93 to come out as U+FFFD; check the count before indexing words[0].
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
 
     /**
      * Return array of non-zerolength words. Splitting on whitespace will get us

Reply via email to