Author: jukka
Date: Wed Mar 26 11:23:53 2008
New Revision: 641457

URL: http://svn.apache.org/viewvc?rev=641457&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
    - Replace TikaExcelCell with a modular/extensible set of classes that
      encapsulate the functionality of rendering the cell content to XHTML

Added:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
Modified:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=641457&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java 
(added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Cell.java 
Wed Mar 26 11:23:53 2008
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+    /**
+     * Renders the content to the given XHTML SAX event stream.
+     *
+     * @param handler XHTML content handler that receives the rendered content
+     * @throws SAXException if the content can not be written to the handler
+     */
+    void render(XHTMLContentHandler handler) throws SAXException;
+
+}

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=641457&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
 Wed Mar 26 11:23:53 2008
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell decorator. Wraps another {@link Cell} and forwards rendering to it.
+ */
+public class CellDecorator implements Cell {
+
+    private final Cell cell; // the wrapped cell that render() delegates to
+
+    public CellDecorator(Cell cell) {
+        this.cell = cell;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        cell.render(handler); // plain pass-through to the wrapped cell
+    }
+
+}

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641457&r1=641456&r2=641457&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 Wed Mar 26 11:23:53 2008
@@ -165,8 +165,8 @@
 
         private boolean insideWorksheet = false;
 
-        private SortedMap<Point, TikaExcelCell> currentSheet =
-            new TreeMap<Point, TikaExcelCell>(new Comparator<Point> () {
+        private SortedMap<Point, Cell> currentSheet =
+            new TreeMap<Point, Cell>(new Comparator<Point> () {
                 public int compare(Point a, Point b) {
                     int diff = a.y - b.y;
                     if (diff == 0) {
@@ -256,9 +256,11 @@
                 //    if (insideWorksheet) {
                 //        int row = hyperlinkRecord.getFirstRow();
                 //        short column =  hyperlinkRecord.getFirstColumn();
-                //        TikaExcelCell cell = currentSheet.findCell(row, 
column);
+                //        Point point = new Point(column, row);
+                //        Cell cell = currentSheet.get(point);
                 //        if (cell != null) {
-                //            cell.setHyperlink(hyperlinkRecord.getAddress());
+                //            cell = new LinkedCell(cell, 
hyperlinkRecord.getAddress());
+                //            currentSheet.put(point, cell);
                 //        }
                 //    }
                 //    break;
@@ -323,7 +325,7 @@
             if (text != null && text.length() > 0) {
                 currentSheet.put(
                         new Point(record.getColumn(), record.getRow()),
-                        new TikaExcelCell(text));
+                        new TextCell(text));
             }
         }
 
@@ -347,7 +349,7 @@
             int currentColumn = 1;
             handler.startElement("tr");
             handler.startElement("td");
-            for (Map.Entry<Point, TikaExcelCell> entry : 
currentSheet.entrySet()) {
+            for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
                 while (currentRow < entry.getKey().y) {
                     handler.endElement("td");
                     handler.endElement("tr");
@@ -365,14 +367,7 @@
                     currentColumn++;
                 }
 
-                TikaExcelCell cell = entry.getValue();
-                if (cell.getHyperlink() != null) {
-                    handler.startElement("a", "href", cell.getHyperlink());
-                    handler.characters(cell.getText());
-                    handler.endElement("a");
-                } else {
-                    handler.characters(cell.getText());
-                }
+                entry.getValue().render(handler);
             }
             handler.endElement("td");
             handler.endElement("tr");
@@ -383,54 +378,6 @@
             handler.endElement("div");
             handler.characters("\n");
         }
-    }
-
-    // ======================================================================
-
-    /**
-     * Tika's excel cell representation. 
-     */
-    private static class TikaExcelCell {
-        private String text;
-        private String hyperlink;
-
-        /**
-         * Construct a new cell.
-         *
-         * @param column The cell's column number
-         * @param text The cell's text
-         */
-        TikaExcelCell(String text) {
-            this.text = text;
-        }
-
-        /**
-         * Return the cell's text.
-         *
-         * @return the cell's text
-         */
-        String getText() {
-            return text;
-        }
-
-        /**
-         * Return hyperlink address, if any
-         *
-         * @return the hyperlink address
-         */
-        String getHyperlink() {
-            return hyperlink;
-        }
-
-        /**
-         * Set the hyperlink address
-         *
-         * @param hyperlink the hyperlink address to set
-         */
-        void setHyperlink(String hyperlink) {
-            this.hyperlink = hyperlink;
-        }
-
     }
 
 }

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=641457&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
 Wed Mar 26 11:23:53 2008
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. This class decorates another content cell with a hyperlink.
+ */
+public class LinkedCell extends CellDecorator {
+
+    private final String link; // value emitted as the "href" attribute
+
+    public LinkedCell(Cell cell, String link) {
+        super(cell);
+        this.link = link;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.startElement("a", "href", link); // open the anchor element
+        super.render(handler); // decorated cell renders inside the anchor
+        handler.endElement("a");
+    }
+
+}

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=641457&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
 Wed Mar 26 11:23:53 2008
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Text cell. Holds a string and renders it as character content.
+ */
+public class TextCell implements Cell {
+
+    private final String text; // the text handed to handler.characters()
+
+    public TextCell(String text) {
+        this.text = text;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        handler.characters(text); // emit the stored text to the handler
+    }
+
+}


Reply via email to