Author: jukka
Date: Wed Mar 26 11:23:53 2008
New Revision: 641457
URL: http://svn.apache.org/viewvc?rev=641457&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
- Replace TikaExcelCell with a modular/extensible set of classes that encapsulate the functionality of rendering the cell content to XHTML
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=641457&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java Wed Mar 26 11:23:53 2008 @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Cell of content. 
Classes that implement this interface are used by + * Tika parsers (currently just the MS Excel parser) to keep track of + * individual pieces of content before they are rendered to the XHTML + * SAX event stream. + */ +public interface Cell { + + /** + * Renders the content to the given XHTML SAX event stream. + * + * @param handler + * @throws SAXException + */ + void render(XHTMLContentHandler handler) throws SAXException; + +} Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=641457&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Wed Mar 26 11:23:53 2008 @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Cell decorator. 
+ */ +public class CellDecorator implements Cell { + + private final Cell cell; + + public CellDecorator(Cell cell) { + this.cell = cell; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + cell.render(handler); + } + +} Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641457&r1=641456&r2=641457&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 11:23:53 2008 @@ -165,8 +165,8 @@ private boolean insideWorksheet = false; - private SortedMap<Point, TikaExcelCell> currentSheet = - new TreeMap<Point, TikaExcelCell>(new Comparator<Point> () { + private SortedMap<Point, Cell> currentSheet = + new TreeMap<Point, Cell>(new Comparator<Point> () { public int compare(Point a, Point b) { int diff = a.y - b.y; if (diff == 0) { @@ -256,9 +256,11 @@ // if (insideWorksheet) { // int row = hyperlinkRecord.getFirstRow(); // short column = hyperlinkRecord.getFirstColumn(); - // TikaExcelCell cell = currentSheet.findCell(row, column); + // Point point = new Point(column, row); + // Cell cell = currentSheet.get(point); // if (cell != null) { - // cell.setHyperlink(hyperlinkRecord.getAddress()); + // cell = new LinkedCell(cell, hyperlinkRecord.getAddress()); + // currentSheet.put(point, cell); // } // } // break; @@ -323,7 +325,7 @@ if (text != null && text.length() > 0) { currentSheet.put( new Point(record.getColumn(), record.getRow()), - new TikaExcelCell(text)); + new TextCell(text)); } } @@ -347,7 +349,7 @@ int currentColumn = 1; handler.startElement("tr"); handler.startElement("td"); - for (Map.Entry<Point, TikaExcelCell> entry : 
currentSheet.entrySet()) { + for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { while (currentRow < entry.getKey().y) { handler.endElement("td"); handler.endElement("tr"); @@ -365,14 +367,7 @@ currentColumn++; } - TikaExcelCell cell = entry.getValue(); - if (cell.getHyperlink() != null) { - handler.startElement("a", "href", cell.getHyperlink()); - handler.characters(cell.getText()); - handler.endElement("a"); - } else { - handler.characters(cell.getText()); - } + entry.getValue().render(handler); } handler.endElement("td"); handler.endElement("tr"); @@ -383,54 +378,6 @@ handler.endElement("div"); handler.characters("\n"); } - } - - // ====================================================================== - - /** - * Tika's excel cell representation. - */ - private static class TikaExcelCell { - private String text; - private String hyperlink; - - /** - * Construct a new cell. - * - * @param column The cell's column number - * @param text The cell's text - */ - TikaExcelCell(String text) { - this.text = text; - } - - /** - * Return the cell's text. 
- * - * @return the cell's text - */ - String getText() { - return text; - } - - /** - * Return hyperlink address, if any - * - * @return the hyperlink address - */ - String getHyperlink() { - return hyperlink; - } - - /** - * Set the hyperlink address - * - * @param hyperlink the hyperlink address to set - */ - void setHyperlink(String hyperlink) { - this.hyperlink = hyperlink; - } - } } Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=641457&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Wed Mar 26 11:23:53 2008 @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Linked cell. This class decorates another content cell with a hyperlink. 
+ */ +public class LinkedCell extends CellDecorator { + + private final String link; + + public LinkedCell(Cell cell, String link) { + super(cell); + this.link = link; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.startElement("a", "href", link); + super.render(handler); + handler.endElement("a"); + } + +} Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=641457&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java Wed Mar 26 11:23:53 2008 @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.SAXException; + +/** + * Text cell. 
+ */ +public class TextCell implements Cell { + + private final String text; + + public TextCell(String text) { + this.text = text; + } + + public void render(XHTMLContentHandler handler) throws SAXException { + handler.characters(text); + } + +}