This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 82509f32c30a3b7d82169d4757ca2735b656b511
Author: tballison <[email protected]>
AuthorDate: Mon Feb 27 21:40:02 2017 -0500

    TIKA-1857 xfa fix
---
 .../org/apache/tika/parser/pdf/XFAExtractor.java   | 28 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
index d3c34dd..d136295 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -218,16 +218,34 @@ class XFAExtractor {
     private void loadData(XMLStreamReader reader, Map<String, String> 
pdfObjRToValues)
             throws XMLStreamException {
         //reader is at the "xfa:data" element
+        //scrape the contents from the text containing nodes
+        StringBuilder buffer = new StringBuilder();
         while (reader.hasNext()) {
             switch (reader.next()) {
                 case (XMLStreamConstants.START_ELEMENT) :
-                    if ("topmostSubform".equals(reader.getLocalName())) {
-                        continue;
-                    }
-                    String value = scrapeTextUntil(reader, reader.getName());
-                    pdfObjRToValues.put(reader.getLocalName(), value);
                     break;
+                case XMLStreamConstants.CHARACTERS:
+                    int start = reader.getTextStart();
+                    int length = reader.getTextLength();
+                    buffer.append(reader.getTextCharacters(),
+                            start,
+                            length);
+                    break;
+
+                case XMLStreamConstants.CDATA:
+                    start = reader.getTextStart();
+                    length = reader.getTextLength();
+                    buffer.append(reader.getTextCharacters(),
+                            start,
+                            length);
+                    break;
+
                 case (XMLStreamConstants.END_ELEMENT) :
+                    if (buffer.length() > 0) {
+                        String localName = reader.getLocalName();
+                        pdfObjRToValues.put(localName, buffer.toString());
+                        buffer.setLength(0);
+                    }
                     if (XFA_DATA.equals(reader.getName())) {
                         return;
                     }

-- 
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.

Reply via email to