[ https://issues.apache.org/jira/browse/TIKA-2347?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16264860#comment-16264860 ]
ASF GitHub Bot commented on TIKA-2347: -------------------------------------- dameikle closed pull request #173: Fix for TIKA-2347 Adds underline extraction from word documents URL: https://github.com/apache/tika/pull/173 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java index 31809250d..90fbd6c37 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java @@ -84,6 +84,7 @@ private boolean curStrikeThrough; private boolean curBold; private boolean curItalic; + private boolean curUnderline; private final Metadata metadata; @@ -372,20 +373,8 @@ private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocu } } - // Close any still open style tags - if (curStrikeThrough) { - xhtml.endElement("s"); - curStrikeThrough = false; - } - if (curItalic) { - xhtml.endElement("i"); - curItalic = false; - } - if (curBold) { - xhtml.endElement("b"); - curBold = false; - } - + closeStyleElements(false, xhtml); + xhtml.endElement(tas.getTag()); return 0; @@ -399,7 +388,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte if (!skipStyling) { if (cr.isBold() != curBold) { - // Enforce nesting -- must close s and i tags + // Enforce nesting -- must close u, s and i tags + if (curUnderline) { + xhtml.endElement("u"); + curUnderline = false; + } if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; @@ -417,7 +410,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte } if (cr.isItalic() != 
curItalic) { - // Enforce nesting -- must close s tag + // Enforce nesting -- must close u and s tag + if (curUnderline) { + xhtml.endElement("u"); + curUnderline = false; + } if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; @@ -431,6 +428,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte } if (cr.isStrikeThrough() != curStrikeThrough) { + // Enforce nesting -- must close u tag + if (curUnderline) { + xhtml.endElement("u"); + curUnderline = false; + } if (cr.isStrikeThrough()) { xhtml.startElement("s"); } else { @@ -438,6 +440,16 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte } curStrikeThrough = cr.isStrikeThrough(); } + + boolean isUnderline = cr.getUnderlineCode() != 0; + if (isUnderline != curUnderline) { + if (isUnderline) { + xhtml.startElement("u"); + } else { + xhtml.endElement("u"); + } + curUnderline = isUnderline; + } } // Clean up the text @@ -546,6 +558,10 @@ private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml) if (skipStyling) { return; } + if (curUnderline) { + xhtml.endElement("u"); + curUnderline = false; + } if (curStrikeThrough) { xhtml.endElement("s"); curStrikeThrough = false; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java index 39a72c6a2..23a1aedac 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java @@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.ICell; import org.apache.poi.xwpf.usermodel.IRunElement; import org.apache.poi.xwpf.usermodel.ISDTContent; +import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.apache.poi.xwpf.usermodel.XWPFDocument; import 
org.apache.poi.xwpf.usermodel.XWPFHeaderFooter; import org.apache.poi.xwpf.usermodel.XWPFHyperlink; @@ -224,7 +225,7 @@ private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManag xhtml.endElement("a"); } - TmpFormatting fmtg = new TmpFormatting(false, false); + TmpFormatting fmtg = new TmpFormatting(false, false, false); //hyperlinks may or may not have hyperlink ids String lastHyperlinkId = null; @@ -328,6 +329,10 @@ private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml, xhtml.endElement("b"); fmtg.setBold(false); } + if (fmtg.isUnderline()) { + xhtml.endElement("u"); + fmtg.setUnderline(false); + } return fmtg; } @@ -336,6 +341,10 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, throws SAXException, XmlException, IOException { // True if we are currently in the named style tag: if (run.isBold() != tfmtg.isBold()) { + if (tfmtg.isUnderline()) { + xhtml.endElement("u"); + tfmtg.setUnderline(false); + } if (tfmtg.isItalic()) { xhtml.endElement("i"); tfmtg.setItalic(false); @@ -349,6 +358,10 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, } if (run.isItalic() != tfmtg.isItalic()) { + if (tfmtg.isUnderline()) { + xhtml.endElement("u"); + tfmtg.setUnderline(false); + } if (run.isItalic()) { xhtml.startElement("i"); } else { @@ -356,6 +369,16 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, } tfmtg.setItalic(run.isItalic()); } + + boolean isUnderline = run.getUnderline() != UnderlinePatterns.NONE; + if (isUnderline != tfmtg.isUnderline()) { + if (isUnderline) { + xhtml.startElement("u"); + } else { + xhtml.endElement("u"); + } + tfmtg.setUnderline(isUnderline); + } xhtml.characters(run.toString()); @@ -484,10 +507,12 @@ private void addRelatedParts(PackagePart documentPart, List<PackagePart> related private class TmpFormatting { private boolean bold = false; private boolean italic = false; + private boolean underline = false; - private TmpFormatting(boolean bold, 
boolean italic) { + private TmpFormatting(boolean bold, boolean italic, boolean underline) { this.bold = bold; this.italic = italic; + this.underline = underline; } public boolean isBold() { @@ -505,6 +530,15 @@ public boolean isItalic() { public void setItalic(boolean italic) { this.italic = italic; } + + + public boolean isUnderline() { + return underline; + } + + public void setUnderline(boolean underline) { + this.underline = underline; + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java index df6d807fc..7938c3b1c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java @@ -29,6 +29,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.tika.TikaTest; + import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -109,7 +110,7 @@ public void testWordHTML() throws Exception { assertTrue(xml.contains("<td>")); // TODO - Check for the nested table // Links - assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); + assertTrue(xml.contains("<a href=\"http://tika.apache.org/\"><u>Tika</u></a>")); // Paragraphs with other styles assertTrue(xml.contains("<p class=\"signature\">This one")); @@ -195,6 +196,17 @@ public void testWord6Parser() throws Exception { assertContains("The quick brown fox jumps over the lazy dog", handler.toString()); } } + + @Test + public void testTextDecoration() throws Exception { + XMLResult result = getXML("testWORD_various.doc"); + String xml = result.xml; + + assertTrue(xml.contains("<b>Bold</b>")); + assertTrue(xml.contains("<i>italic</i>")); + assertTrue(xml.contains("<u>underline</u>")); + + } @Test public void testVarious() throws Exception { @@ -361,15 +373,15 @@ public void 
testHeaderHyperlinks() throws Exception { assertFalse(xml.contains("HYPERLINK")); // Check we do have the link - assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml); + assertContains("<a href=\"http://tw-systemhaus.de\"><u>http:", xml); // Check we do have the email - assertContains("<a href=\"mailto:a...@example.com\">ab@", xml); + assertContains("<a href=\"mailto:a...@example.com\"><u>ab@", xml); } @Test public void testControlCharacter() throws Exception { - assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " ")); + assertContains("<u>1.</u> <u>Introduzione</u><b> </b></a><u> </u></p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " ")); } @Test @@ -383,7 +395,7 @@ public void testParagraphsAfterTables() throws Exception { "application/msword", metadata.get(Metadata.CONTENT_TYPE)); - assertContains("<p>1. Organisering av vakten:</p>", xml); + assertContains("<p><u>1. Organisering av vakten:</u></p>", xml); } @@ -521,8 +533,8 @@ public void testBoldHyperlink() throws Exception { //TIKA-1255 String xml = getXML("testWORD_boldHyperlink.doc").xml; xml = xml.replaceAll("\\s+", " "); - assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); - assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml); + assertContains("<a href=\"http://tika.apache.org/\"><u>hyper </u><b><u>link</u></b></a>", xml); + assertContains("<a href=\"http://tika.apache.org/\"><b><u>hyper</u></b><u> link</u></a>; bold" , xml); } @Test diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 642054536..06c5a1e0b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -579,6 +579,17 
@@ public void testNullHeaders() throws Exception { assertEquals("Should have found some text", false, handler.toString().isEmpty()); } } + + @Test + public void testTextDecoration() throws Exception { + XMLResult result = getXML("testWORD_various.docx"); + String xml = result.xml; + + assertTrue(xml.contains("<b>Bold</b>")); + assertTrue(xml.contains("<i>italic</i>")); + assertTrue(xml.contains("<u>underline</u>")); + + } @Test public void testVarious() throws Exception { ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Underlined text is not decorated as such when extracting from word documents > ---------------------------------------------------------------------------- > > Key: TIKA-2347 > URL: https://issues.apache.org/jira/browse/TIKA-2347 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 2.0, 1.14 > Reporter: Stuart Hendren > Assignee: Dave Meikle > Fix For: 1.17 > > > When extracting from doc and docx, bold and italic text decoration is > extracted; however, underlining is not. This can be demonstrated in WordParserTest > or OOXMLParserTest (change to docx) with the following test case. > {code:title=WordParserTest.java|borderStyle=solid} > @Test > public void testTextDecoration() throws Exception { > XMLResult result = getXML("testWORD_various.doc"); > String xml = result.xml; > assertTrue(xml.contains("<b>Bold</b>")); > assertTrue(xml.contains("<i>italic</i>")); > assertTrue(xml.contains("<u>underline</u>")); > } > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029)