Patches for parser.microsoft.WordExtractor

kildishev Mon, 01 Jul 2013 05:11:39 -0700

Dear Tika developers,

My name is Denis Kildishev and I am working for Institute for System
Programming of the Russian Academy of Sciences (ISPRAS). We use Apache
Tika in our open source project Requality
(https://forge.ispras.ru/projects/reqdb) for doc->xhtml conversion. One
of our requirements is to get an XHTML visual representation that is close
to that of the original doc file.

Working with the current version of Tika, we found that some improvements
can be made to it. I'd like to introduce some modifications that were made
to the Word Extractor from the parsers package. They include support for lists,
table borders (according to the 2007 specification) and some additional
changes to styling and indents. Also, in our version of this parser we
have XHTML commands buffer that helps to deal with a problem of nested
tables. If it is possible, I'd like to contribute those changes back to
the Tika project. As a first of possible patches I'd like to present
changes over table representation.

This patch includes changes to the table representation. The information
about border color follows the specification of the 2007 format. The spanning
of cells is taken from the POI HTML parser.

Some of the patches, including this one, alter the structure of the generated
XHTML file. Various changes are made to the existing unit tests to deal with
this fact. All those changes preserve the original test purposes, but in a
different way. One example is the check that a table is present in the output
file. In the current trunk version, this is checked by looking for the exact
"<table>" construction. When we introduce styling to the table, this check
becomes wrong, so we can look for "<table" instead.

I will create a corresponding ticket and I will attach my patch there.
It is my first contribution to an Apache project, so I would appreciate
it if you could guide me on how to proceed with it.

Yours sincerely,
Denis Kildishev
Software Engineering Department, ISPRAS

Index: tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java	(revision 1497320)
+++ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java	(working copy)
@@ -96,8 +96,8 @@ public class WordParserTest extends TikaTest {
         assertTrue(xml.contains("<b>BOLD</b>"));
         assertTrue(xml.contains("<i>ITALIC</i>"));
         // Table
-        assertTrue(xml.contains("<table>"));
-        assertTrue(xml.contains("<td>"));
+        assertTrue(xml.contains("<table"));
+        assertTrue(xml.contains("<td"));
         // TODO - Check for the nested table
         // Links
         assertTrue(xml.contains("<a href=\"http://tika.apache.org/\";>Tika</a>"));
Index: tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java	(revision 1498255)
+++ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java	(working copy)
@@ -393,8 +393,8 @@ public class OOXMLParserTest extends TikaTest {
       assertTrue(xml.contains("<b>BOLD</b>"));
       assertTrue(xml.contains("<i>ITALIC</i>"));
       // Table
-      assertTrue(xml.contains("<table>"));
-      assertTrue(xml.contains("<td>"));
+      assertTrue(xml.contains("<table"));
+      assertTrue(xml.contains("<td"));
       // Links
       assertTrue(xml.contains("<a href=\"http://tika.apache.org/\";>Tika</a>"));
       // Anchor links
Index: tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java	(revision 1497320)
+++ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java	(working copy)
@@ -24,6 +24,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFOldDocument;
@@ -32,6 +33,7 @@ import org.apache.poi.hwpf.extractor.Word6Extracto
 import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.hwpf.model.StyleDescription;
+import org.apache.poi.hwpf.usermodel.BorderCode;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Field;
 import org.apache.poi.hwpf.usermodel.Paragraph;
@@ -55,6 +57,7 @@ public class WordExtractor extends AbstractPOIFSEx
 
     private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
     private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
+    private static final int TWIPS_PER_INCH = 1440;
 
     public WordExtractor(ParseContext context) {
         super(context);
@@ -104,7 +107,7 @@ public class WordExtractor extends AbstractPOIFSEx
             xhtml.element("p", paragraph);
         }
 
-	for (String paragraph : wordExtractor.getFootnoteText()) {
+    for (String paragraph : wordExtractor.getFootnoteText()) {
             xhtml.element("p", paragraph);
         }
 
@@ -138,35 +141,252 @@ public class WordExtractor extends AbstractPOIFSEx
         } catch(FileNotFoundException e) {
         }
     }
-    
+
+    /*
+     * Method that is used to get an CSS representation of vertical aligmnent.
+     * This is used in table handling
+     */
+    private String getVerticalAligmentCss(int al) {
+        String s = "";
+        switch (al) {
+        case 0:
+            s = "top";
+            break;
+        case 1:
+            s = "center";
+            break;
+        case 2:
+            s = "bottom";
+            break;
+        default:
+            break;
+        }
+        return s.equals("") ? "" : "vertical-align: " + s + ";";
+    }
+
+    // a set of string constants to support different types
+    // of cell border. It tends to be wrong for old
+    // word formats.
+    private static String[] bordertype = { "hidden", "solid", "solid",
+            "double", "solid", "solid", "dashed", "solid", "solid", "dotted",
+            "dotted", "solid", "dashed", "dashed", "dashed", "dashed",
+            "dashed", "dashed", "dashed", "dashed", "dashed", "dashed",
+            "dashed", "solid", "solid", "dashed", "dashed", "groove", "groove",
+            "groove" };
+    // a set of string constants to support different colors
+    // of cell border. It tends to be wrong for old
+    // word formats.
+    private static String[] bordercolor = { "white", "black", "blue", "cyan",
+            "green", "magenta", "red", "yellow", "white", "DarkBlue",
+            "DarkCyan", "DarkMagenta", "DarkRed", "DarkYellow", "DarkGray",
+            "LightGray", "LightGray" };
+
+    /*
+     * Method for getting a css representation of word-styled border. Includes
+     * border type, width and color.
+     */
+    private String getBorderInfo(String s, BorderCode bc) {
+        int tp = bc.getBorderType();
+        if (bc.isEmpty() || tp == 0 || bc.getLineWidth() == 0)
+            return "";
+        Double d = bc.getLineWidth() / 8.0;
+        return s + Math.round(d + 0.5) + "px "
+                + ((tp > bordertype.length) ? "solid" : bordertype[tp]) + " "
+                + bordercolor[bc.getColor()] + ";";
+    }
+
+    /*
+     * Method for getting a css representation of table cell border styling. It
+     * includes type of borders, they width and color.
+     */
+
+    private String getCellStyle(TableCell cell, TableRow row, boolean toppest,
+            boolean bottomest, boolean leftest, boolean rightest) {
+        BorderCode top = cell.getBrcTop() != null
+                && cell.getBrcTop().getBorderType() != 0 ? cell.getBrcTop()
+                : toppest ? row.getTopBorder() : row.getHorizontalBorder();
+        BorderCode bottom = cell.getBrcBottom() != null
+                && cell.getBrcBottom().getBorderType() != 0 ? cell
+                .getBrcBottom() : bottomest ? row.getBottomBorder() : row
+                .getHorizontalBorder();
+
+        BorderCode left = cell.getBrcLeft() != null
+                && cell.getBrcLeft().getBorderType() != 0 ? cell.getBrcLeft()
+                : leftest ? row.getLeftBorder() : row.getVerticalBorder();
+        BorderCode right = cell.getBrcRight() != null
+                && cell.getBrcRight().getBorderType() != 0 ? cell.getBrcRight()
+                : rightest ? row.getRightBorder() : row.getVerticalBorder();
+        return getBorderInfo("border-left: ", left)
+                + getBorderInfo("border-right: ", right)
+                + getBorderInfo("border-top:", top)
+                + getBorderInfo("border-bottom:", bottom);
+    }
+
+    // Below there is a set of methods used to support
+    // cell merging from poi.
+    static int[] buildTableCellEdgesArray(Table table) {
+        Set<Integer> edges = new TreeSet<Integer>();
+        for (int r = 0; r < table.numRows(); r++) {
+            TableRow tableRow = table.getRow(r);
+            for (int c = 0; c < tableRow.numCells(); c++) {
+                TableCell tableCell = tableRow.getCell(c);
+                edges.add(Integer.valueOf(tableCell.getLeftEdge()));
+                edges.add(Integer.valueOf(tableCell.getLeftEdge()
+                        + tableCell.getWidth()));
+            }
+        }
+        Integer[] sorted = edges.toArray(new Integer[edges.size()]);
+        int[] result = new int[sorted.length];
+        for (int i = 0; i < sorted.length; i++) {
+            result[i] = sorted[i].intValue();
+        }
+        return result;
+    }
+
+    protected int getNumberColumnsSpanned(int[] tableCellEdges,
+            int currentEdgeIndex, TableCell tableCell) {
+        int nextEdgeIndex = currentEdgeIndex;
+        int colSpan = 0;
+        int cellRightEdge = tableCell.getLeftEdge() + tableCell.getWidth();
+        while (tableCellEdges[nextEdgeIndex] < cellRightEdge) {
+            colSpan++;
+            nextEdgeIndex++;
+        }
+        return colSpan;
+    }
+
+    protected int getNumberRowsSpanned(Table table, int currentRowIndex,
+            int currentColumnIndex, TableCell tableCell) {
+        if (!tableCell.isFirstVerticallyMerged())
+            return 1;
+        final int numRows = table.numRows();
+        int count = 1;
+        for (int r1 = currentRowIndex + 1; r1 < numRows; r1++) {
+            TableRow nextRow = table.getRow(r1);
+            if (currentColumnIndex >= nextRow.numCells())
+                break;
+            TableCell nextCell = nextRow.getCell(currentColumnIndex);
+            if (!nextCell.isVerticallyMerged()
+                    || nextCell.isFirstVerticallyMerged())
+                break;
+            count++;
+        }
+        return count;
+    }
+
+    protected int getTableCellEdgesIndexSkipCount(Table table, int r,
+            int[] tableCellEdges, int currentEdgeIndex, int c,
+            TableCell tableCell) {
+        TableCell upperCell = null;
+        for (int r1 = r - 1; r1 >= 0; r1--) {
+            final TableRow row = table.getRow(r1);
+            if (row == null || c >= row.numCells())
+                continue;
+
+            final TableCell prevCell = row.getCell(c);
+            if (prevCell != null && prevCell.isFirstVerticallyMerged()) {
+                upperCell = prevCell;
+                break;
+            }
+        }
+        if (upperCell == null) {
+            System.err.println("First vertically merged cell for " + tableCell
+                    + " not found");
+            return 0;
+        }
+
+        return getNumberColumnsSpanned(tableCellEdges, currentEdgeIndex,
+                tableCell);
+    }
+
     private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, 
           PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
           throws SAXException, IOException, TikaException {
        // Note - a poi bug means we can't currently properly recurse
        //  into nested tables, so currently we don't
-       if(p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel==0) {
-          Table t = r.getTable(p);
-          xhtml.startElement("table");
-          xhtml.startElement("tbody");
-          for(int rn=0; rn<t.numRows(); rn++) {
-             TableRow row = t.getRow(rn);
-             xhtml.startElement("tr");
-             for(int cn=0; cn<row.numCells(); cn++) {
-                TableCell cell = row.getCell(cn);
-                xhtml.startElement("td");
-
-                for(int pn=0; pn<cell.numParagraphs(); pn++) {
-                   Paragraph cellP = cell.getParagraph(pn);
-                   handleParagraph(cellP, p.getTableLevel(), cell, document, pictures, pictureTable, xhtml);
+        if (p.isInTable() && p.getTableLevel() > parentTableLevel
+                && parentTableLevel == 0) {
+            Table t = r.getTable(p);
+            xhtml.startElement("table", "style",
+                    "border-collapse: collapse; border-spacing: 0;");
+            xhtml.startElement("tbody");
+            // below it is a text for
+            // cell joins
+            final int[] tableCellEdges = buildTableCellEdgesArray(t);
+            int nrows = t.numRows();
+            int maxColumns = Integer.MIN_VALUE;
+            for (int row = 0; row < nrows; row++)
+                maxColumns = Math.max(maxColumns, t.getRow(row).numCells());
+            for (int rn = 0; rn < t.numRows(); rn++) {
+                TableRow row = t.getRow(rn);
+                xhtml.startElement("tr", "style",
+                        "height:"
+                                + row.getRowHeight()
+                                / TWIPS_PER_INCH
+                                + "in;"
+                                + (!row.cantSplit() ? "keep-together:always;"
+                                        : ""));
+                int rnc = row.numCells();
+                int currentEdgeIndex = 0;
+                for (int cn = 0; cn < rnc; cn++) {
+                    TableCell cell = row.getCell(cn);
+                    if (cell.isVerticallyMerged()
+                            && !cell.isFirstVerticallyMerged()) {
+                        currentEdgeIndex += getTableCellEdgesIndexSkipCount(t,
+                                rn, tableCellEdges, currentEdgeIndex, cn, cell);
+                        continue;
+                    }
+                    int colSpan = getNumberColumnsSpanned(tableCellEdges,
+                            currentEdgeIndex, cell);
+                    currentEdgeIndex += colSpan;
+                    if (colSpan == 0)
+                        continue;
+                    final int rowSpan = getNumberRowsSpanned(t, rn, cn, cell);
+                    AttributesImpl aim = new AttributesImpl();
+                    if (colSpan != 1)
+                        aim.addAttribute("", "colspan", "colspan", "",
+                                String.valueOf(colSpan));
+                    if (rowSpan > 1)
+                        aim.addAttribute("", "rowspan", "rowspan", "",
+                                String.valueOf(rowSpan));
+                    aim.addAttribute(
+                            "",
+                            "style",
+                            "style",
+                            "",
+                            getCellStyle(cell, row, rn == 0,
+                                    rn == t.numRows() - 1, cn == 0,
+                                    cn == rnc - 1)
+                                    + "width:"
+                                    + cell.getWidth()
+                                    / TWIPS_PER_INCH
+                                    + "in;"
+                                    + "padding-start:"
+                                    + (row.getGapHalf() / TWIPS_PER_INCH)
+                                    + "in;"
+                                    + "padding-end:"
+                                    + (row.getGapHalf() / TWIPS_PER_INCH)
+                                    + "in;"
+                                    + getVerticalAligmentCss(cell
+                                            .getVertAlign()));
+                    xhtml.startElement("td", aim);
+                    for (int pn = 0; pn < cell.numParagraphs(); pn++) {
+                        Paragraph cellP = cell.getParagraph(pn);
+                        pn += handleParagraph(cellP, t.getTableLevel(), cell,
+                                document, pictures, pictureTable, xhtml);
+                    }
+                    // for empty cell we tries to write nobr
+                    // character
+                    if (cell.text().trim().isEmpty())
+                        xhtml.characters((char) (0xA0) + "");
+                    xhtml.endElement("td");
                 }
-                xhtml.endElement("td");
-             }
-             xhtml.endElement("tr");
-          }
-          xhtml.endElement("tbody");
-          xhtml.endElement("table");
-          return (t.numParagraphs()-1);
-       }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+            return (t.numParagraphs()-1);
+        }
 
        TagAndStyle tas;
 
@@ -540,7 +760,7 @@ public class WordExtractor extends AbstractPOIFSEx
      * @return true if character run should be included in extraction.
      */
     private boolean isRendered(final CharacterRun cr) {
- 	   return cr == null || !cr.isMarkedDeleted();
+       return cr == null || !cr.isMarkedDeleted();
     }

Patches for parser.microsoft.WordExtractor

Reply via email to