Author: tallison
Date: Thu Jun 4 01:52:23 2015
New Revision: 1683450
URL: http://svn.apache.org/r1683450
Log:
TIKA-1315 cleanup after run against govdocs1
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
Thu Jun 4 01:52:23 2015
@@ -136,7 +136,7 @@ public abstract class AbstractListManage
return sb.toString();
}
- //actual level number
+ //actual level number; can return an empty string if NumberFormatter fails
private String formatNum(int lvlNum, boolean isLegal, LevelTuple[]
overrideLevelTuples) {
int numFmtStyle = 0;
@@ -166,7 +166,11 @@ public abstract class AbstractListManage
} else if ("none".equals(numFmt)) {
return "";
}
- return NumberFormatter.getNumber(count, numFmtStyle);
+ try {
+ return NumberFormatter.getNumber(count, numFmtStyle);
+ } catch (IllegalArgumentException e) {
+ return "";
+ }
}
private String ordinalize(int count) {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
Thu Jun 4 01:52:23 2015
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import java.util.NoSuchElementException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -53,16 +54,27 @@ public class ListManager extends Abstrac
* <p><em>Note:</em> This only works correctly if called subsequently for
<em>all</em> paragraphs in a valid selection (main document, text field, ...)
which are part of a list.</p>
*
* @param paragraph list paragraph to process
- * @return String which represents the numbering of this list paragraph;
never {@code null}
+ * @return String which represents the numbering of this list paragraph;
never {@code null}, but may be an empty string
+ * if something goes wrong in getList()
* @throws IllegalArgumentException If the given paragraph is {@code null}
or is not part of a list
- * @throws IllegalStateException If problems with the document are
encountered
*/
public String getFormattedNumber(final Paragraph paragraph) {
if (paragraph == null) throw new IllegalArgumentException("Given
paragraph cannot be null.");
if (!paragraph.isInList()) throw new IllegalArgumentException("Can
only process list paragraphs.");
//lsid is equivalent to docx's abnum
//ilfo is equivalent to docx's num
- int currAbNumId = paragraph.getList().getLsid();
+ int currAbNumId = -1;
+ try{
+ currAbNumId = paragraph.getList().getLsid();
+ } catch (NoSuchElementException e) {
+ //somewhat frequent exception when initializing HWPFList
+ return "";
+ } catch (IllegalArgumentException e) {
+ return "";
+ } catch (NullPointerException e) {
+ return "";
+ }
+
int currNumId = paragraph.getIlfo();
ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
@@ -171,7 +183,8 @@ public class ListManager extends Abstrac
return "none";
default:
//do we really want to silently swallow these uncovered cases?
- throw new RuntimeException("NOT COVERED: " + numberFormat);
+ //throw new RuntimeException("NOT COVERED: " + numberFormat);
+ return "decimal";
}
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1683450&r1=1683449&r2=1683450&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
Thu Jun 4 01:52:23 2015
@@ -16,9 +16,11 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.math.BigInteger;
import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNum;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.tika.parser.microsoft.AbstractListManager;
@@ -50,9 +52,18 @@ public class XWPFListManager extends Abs
numbering = document.getNumbering();
}
+ /**
+ * Returns the formatted list number for the given paragraph.
+ * @param paragraph paragraph whose list number is requested
+ * @return the formatted number or an empty string if something went wrong
+ */
public String getFormattedNumber(final XWPFParagraph paragraph) {
int currNumId = paragraph.getNumID().intValue();
- CTNum ctNum = numbering.getNum(paragraph.getNumID()).getCTNum();
+ XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+ if (xwpfNum == null) {
+ return "";
+ }
+ CTNum ctNum = xwpfNum.getCTNum();
CTDecimalNumber abNum = ctNum.getAbstractNumId();
int currAbNumId = abNum.getVal().intValue();