Author: tallison
Date: Thu Sep 26 14:01:17 2013
New Revision: 1526498

URL: http://svn.apache.org/r1526498
Log:
tika-1100 textboxes in xlsx; modified XSSFExcelExtractorDecorator and added test in OOXMLParserTest

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx (with props) Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1526498&r1=1526497&r2=1526498&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Thu Sep 26 14:01:17 2013 @@ -46,6 +46,8 @@ import org.apache.poi.xssf.model.Comment import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFComment; import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xssf.usermodel.XSSFShape; +import org.apache.poi.xssf.usermodel.XSSFSimpleShape; import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -118,6 +120,7 @@ public class XSSFExcelExtractorDecorator while (iter.hasNext()) { InputStream stream = iter.next(); sheetParts.add(iter.getSheetPart()); + SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml, iter.getSheetComments()); // Start, and output the sheet name @@ -142,7 +145,7 @@ public class XSSFExcelExtractorDecorator for(String footer : sheetExtractor.footers) { extractHeaderFooter(footer, xhtml); } - + processShapes(iter.getShapes(), xhtml); // All done with this sheet xhtml.endElement("div"); } @@ -157,6 +160,20 @@ public class 
XSSFExcelExtractorDecorator } } + private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException { + if (shapes == null){ + return; + } + for (XSSFShape shape : shapes){ + if (shape instanceof XSSFSimpleShape){ + String sText = ((XSSFSimpleShape)shape).getText(); + if (sText != null && sText.length() > 0){ + xhtml.element("p", sText); + } + } + } + } + public void processSheet( SheetContentsHandler sheetContentsExtractor, StylesTable styles, Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1526498&r1=1526497&r2=1526498&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Sep 26 14:01:17 2013 @@ -988,4 +988,16 @@ public class OOXMLParserTest extends Tik input.close(); } } + + //TIKA-1100: + public void testExcelTextBox() throws Exception { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + ParseContext context = new ParseContext(); + InputStream input = getTestDocument("testEXCEL_textbox.xlsx"); + parser.parse(input, handler, metadata, context); + String content = handler.toString(); + assertContains("some autoshape", content); + } + } Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx?rev=1526498&view=auto ============================================================================== Binary file - no diff available. 
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_textbox.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream