Author: tallison
Date: Thu Jun  4 01:52:23 2015
New Revision: 1683450

URL: http://svn.apache.org/r1683450
Log:
TIKA-1315 cleanup after run against govdocs1

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Thu Jun  4 01:52:23 2015
@@ -136,7 +136,7 @@ public abstract class AbstractListManage
             return sb.toString();
         }
 
-        //actual level number
+        //actual level number; can return empty string if numberformatter fails
         private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
 
             int numFmtStyle = 0;
@@ -166,7 +166,11 @@ public abstract class AbstractListManage
             } else if ("none".equals(numFmt)) {
                 return "";
             }
-            return NumberFormatter.getNumber(count, numFmtStyle);
+            try {
+                return NumberFormatter.getNumber(count, numFmtStyle);
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
         }
 
         private String ordinalize(int count) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Thu Jun  4 01:52:23 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.util.NoSuchElementException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -53,16 +54,27 @@ public class ListManager extends Abstrac
      * <p><em>Note:</em> This only works correctly if called subsequently for <em>all</em> paragraphs in a valid selection (main document, text field, ...) which are part of a list.</p>
      *
      * @param paragraph list paragraph to process
-     * @return String which represents the numbering of this list paragraph; never {@code null}
+     * @return String which represents the numbering of this list paragraph; never {@code null}, can be empty string, though, 
+     *        if something goes wrong in getList()
      * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list
-     * @throws IllegalStateException    If problems with the document are encountered
      */
     public String getFormattedNumber(final Paragraph paragraph) {
         if (paragraph == null) throw new IllegalArgumentException("Given paragraph cannot be null.");
         if (!paragraph.isInList()) throw new IllegalArgumentException("Can only process list paragraphs.");
         //lsid is equivalent to docx's abnum
         //ilfo is equivalent to docx's num
-        int currAbNumId = paragraph.getList().getLsid();
+        int currAbNumId = -1;
+        try{
+            currAbNumId = paragraph.getList().getLsid();
+        } catch (NoSuchElementException e) {
+            //somewhat frequent exception when initializing HWPFList
+            return "";
+        } catch (IllegalArgumentException e) {
+            return "";
+        } catch (NullPointerException e) {
+            return "";
+        }
+
         int currNumId = paragraph.getIlfo();
         ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
         LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
@@ -171,7 +183,8 @@ public class ListManager extends Abstrac
                 return "none";
             default:
                 //do we really want to silently swallow these uncovered cases?
-                throw new RuntimeException("NOT COVERED: " + numberFormat);
+                //throw new RuntimeException("NOT COVERED: " + numberFormat);
+                return "decimal";
         }
     }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 Thu Jun  4 01:52:23 2015
@@ -16,9 +16,11 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.math.BigInteger;
 
 import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNum;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.tika.parser.microsoft.AbstractListManager;
@@ -50,9 +52,18 @@ public class XWPFListManager extends Abs
         numbering = document.getNumbering();
     }
 
+    /**
+     *
+     * @param paragraph paragraph
+     * @return the formatted number or an empty string if something went wrong
+     */
     public String getFormattedNumber(final XWPFParagraph paragraph) {
         int currNumId = paragraph.getNumID().intValue();
-        CTNum ctNum = numbering.getNum(paragraph.getNumID()).getCTNum();
+        XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+        if (xwpfNum == null) {
+            return "";
+        }
+        CTNum ctNum = xwpfNum.getCTNum();
         CTDecimalNumber abNum = ctNum.getAbstractNumId();
         int currAbNumId = abNum.getVal().intValue();
 


Reply via email to