This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 1760249 TIKA-1804 -- remove dependency on org.json
1760249 is described below
commit 1760249f26536047e54e3932f47910217a6c81f5
Author: tballison <[email protected]>
AuthorDate: Wed Jun 21 17:12:00 2017 -0400
TIKA-1804 -- remove dependency on org.json
---
CHANGES.txt | 2 +
tika-dl/pom.xml | 9 +
tika-parsers/pom.xml | 4 +-
.../tika/parser/journal/GrobidRESTParser.java | 8 +-
.../journal/{TEIParser.java => TEIDOMParser.java} | 415 ++++++++++-----------
.../parser/ner/corenlp/CoreNLPNERecogniser.java | 15 +-
.../org/apache/tika/parser/journal/TEITest.java | 24 +-
7 files changed, 236 insertions(+), 241 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 38acad7..e8a4a74 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.15.1 - ??/??/????
+ * Remove non-ASL 2.0 compatible org.json (TIKA-1804).
+
* Allow extraction of scripts as embedded "MACRO". Users
must turn this on via TikaConfig (TIKA-2391).
diff --git a/tika-dl/pom.xml b/tika-dl/pom.xml
index 933e004..83e2d09 100644
--- a/tika-dl/pom.xml
+++ b/tika-dl/pom.xml
@@ -56,6 +56,15 @@
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-keras</artifactId>
<version>${dl4j.model.version}</version>
+ <!-- exclude this because of non-ASF friendly "do no evil" license.
+ Because this relies on tika-parsers, that should have ted-dunning's drop-in
+ -->
+ <exclusions>
+ <exclusion>
+ <groupId>org.json</groupId>
+ <artifactId>json</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index d61b0e8..218f03f 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -327,9 +327,9 @@
</dependency>
<dependency>
- <groupId>org.json</groupId>
+ <groupId>com.tdunning</groupId>
<artifactId>json</artifactId>
- <version>20140107</version>
+ <version>1.8</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
index f1d6924..110c504 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/journal/GrobidRESTParser.java
@@ -17,22 +17,20 @@
package org.apache.tika.parser.journal;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Properties;
-import javax.ws.rs.core.MediaType;
-import javax.ws.rs.core.Response;
-
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -84,7 +82,7 @@ public class GrobidRESTParser {
try {
String resp = response.readEntity(String.class);
- Metadata teiMet = new TEIParser().parse(resp);
+ Metadata teiMet = new TEIDOMParser().parse(resp, context);
for (String key : teiMet.names()) {
metadata.add("grobid:header_" + key, teiMet.get(key));
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
similarity index 63%
rename from tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
rename to tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
index 9806162..6438bdd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/journal/TEIDOMParser.java
@@ -17,116 +17,122 @@
package org.apache.tika.parser.journal;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-import org.json.XML;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
-public class TEIParser {
+public class TEIDOMParser {
- public TEIParser() {
+ public TEIDOMParser() {
}
- public Metadata parse(String source) {
- JSONObject obj = XML.toJSONObject(source);
+ public Metadata parse(String source, ParseContext parseContext) throws TikaException, SAXException, IOException {
+
+ Document root = parseContext.getDocumentBuilder().parse(
+ new ByteArrayInputStream(source.getBytes(StandardCharsets.UTF_8))
+ );
Metadata metadata = new Metadata();
- createGrobidMetadata(source, obj, metadata);
+ createGrobidMetadata(source, root.getDocumentElement(), metadata);
return metadata;
}
- private void createGrobidMetadata(String source, JSONObject obj,
+ private void createGrobidMetadata(String source, Element root,
Metadata metadata) {
- if (obj != null) {
- try {
- JSONObject teiHeader = obj.getJSONObject("TEI")
- .getJSONObject("teiHeader");
- if (teiHeader.has("text")) {
- parseText(teiHeader.getJSONObject("text"), metadata);
- }
+ if (root != null) {
- if (teiHeader.has("fileDesc")) {
- parseFileDesc(teiHeader.getJSONObject("fileDesc"),
metadata);
+ Node text = getFirstChild(root.getChildNodes(), "text");
+ if (text != null) {
+ parseText(text, metadata);
+ }
+ Node teiHeader = getFirstChild(root.getChildNodes(), "teiHeader");
+ Node fileDesc = getFirstChild(teiHeader.getChildNodes(),
"fileDesc");
+ if (fileDesc != null) {
+ parseFileDesc(fileDesc, metadata);
- }
- if (teiHeader.has("profileDesc")) {
- parseProfileDesc(teiHeader.getJSONObject("profileDesc"),
metadata);
- }
- } catch (JSONException e) {
- System.out.println("No TEI Object found.");
}
+ Node profileDesc = getFirstChild(teiHeader.getChildNodes(),
"profileDesc");
+ if (profileDesc != null) {
+ parseProfileDesc(profileDesc, metadata);
+ }
+
}
- addStaticMet(source, obj, metadata);
+ addStaticMet(source, root, metadata);
}
- private void addStaticMet(String source, JSONObject obj, Metadata
metadata) {
+ private void addStaticMet(String source, Element obj, Metadata metadata) {
metadata.add("Class", Metadata.class.getName());
- metadata.add("TEIJSONSource", obj.toString());
+ //no longer available after we got rid of json.org's and its .toJSONObject()
+// metadata.add("TEIJSONSource", obj.toString());
metadata.add("TEIXMLSource", source);
}
- private void parseText(JSONObject text, Metadata metadata) {
- if (text.has("xml:lang")) {
- metadata.add("Language", text.getString("xml:lang"));
+ private void parseText(Node text, Metadata metadata) {
+ String lang = getFirstAttribute(text, "xml", "lang");
+ if (lang != null) {
+ metadata.add("Language", lang);
}
}
- private void parseFileDesc(JSONObject fileDesc, Metadata metadata) {
- if (fileDesc.has("titleStmt")) {
- parseTitleStmt(fileDesc.getJSONObject("titleStmt"), metadata);
+ private void parseFileDesc(Node fileDesc, Metadata metadata) {
+ Node titleStmt = getFirstChild(fileDesc.getChildNodes(), "titleStmt");
+
+ if (titleStmt != null) {
+ parseTitleStmt(titleStmt, metadata);
}
- if (fileDesc.has("sourceDesc")) {
- parseSourceDesc(fileDesc.getJSONObject("sourceDesc"), metadata);
+ Node sourceDesc = getFirstChild(fileDesc.getChildNodes(),
"sourceDesc");
+ if (sourceDesc != null) {
+ parseSourceDesc(sourceDesc, metadata);
}
}
- private void parseTitleStmt(JSONObject titleStmt, Metadata metadata) {
- if (titleStmt.has("title")) {
- JSONObject title = titleStmt.getJSONObject("title");
- if (title.has("content")) {
- metadata.add("Title", title.getString("content"));
+ private void parseTitleStmt(Node titleStmt, Metadata metadata) {
+ Node title = getFirstChild(titleStmt.getChildNodes(), "title");
+ if (title != null) {
+ String titleText = title.getTextContent();
+ if (titleText != null) {
+ metadata.add("Title", titleText);
}
}
}
- private void parseSourceDesc(JSONObject sourceDesc, Metadata metadata) {
- if (sourceDesc.has("biblStruct")) {
- parseBiblStruct(sourceDesc.getJSONObject("biblStruct"), metadata);
+ private void parseSourceDesc(Node sourceDesc, Metadata metadata) {
+ Node biblStruct = getFirstChild(sourceDesc.getChildNodes(),
"biblStruct");
+ if (biblStruct != null) {
+ parseBiblStruct(biblStruct, metadata);
}
}
- private void parseBiblStruct(JSONObject biblStruct, Metadata metadata) {
- if (biblStruct.has("analytic")
- && biblStruct.get("analytic") instanceof JSONObject) {
- JSONObject analytic = biblStruct.getJSONObject("analytic");
- if (analytic.has("author")) {
- Object authorObj = analytic.get("author");
-
- List<Author> authorList = new ArrayList<Author>();
- if (authorObj instanceof JSONObject) {
- parseAuthor((JSONObject) authorObj, authorList);
- } else if (authorObj instanceof JSONArray) {
- JSONArray authors = (JSONArray) authorObj;
- if (authors.length() > 0) {
- for (int i = 0; i < authors.length(); i++) {
- JSONObject author = authors.getJSONObject(i);
- parseAuthor(author, authorList);
- }
- }
-
- metadata.add("Address", getMetadataAddresses(authorList));
- metadata.add("Affiliation",
getMetadataAffiliations(authorList));
- metadata.add("Authors", getMetadataAuthors(authorList));
- metadata.add("FullAffiliations",
- getMetadataFullAffiliations(authorList));
- }
+ private void parseBiblStruct(Node biblStruct, Metadata metadata) {
+ Node analytic = getFirstChild(biblStruct.getChildNodes(), "analytic");
+ if (analytic != null) {
+ List<Node> authorNodes = getChildNodes(analytic.getChildNodes(),
"author");
+ List<Author> authorList = new ArrayList<>();
+ for (Node authorNode : authorNodes) {
+ parseAuthor(authorNode, authorList);
}
+
+ metadata.add("Address", getMetadataAddresses(authorList));
+ metadata.add("Affiliation", getMetadataAffiliations(authorList));
+ metadata.add("Authors", getMetadataAuthors(authorList));
+ metadata.add("FullAffiliations",
+ getMetadataFullAffiliations(authorList));
+
+
} else {
metadata.add("Error", "Unable to parse: no analytic section in JSON");
}
@@ -245,47 +251,35 @@ public class TEIParser {
return metAddress.toString();
}
- private void parseAuthor(JSONObject authorObj, List<Author> authorList) {
+ private void parseAuthor(Node authorNode, List<Author> authorList) {
Author author = new Author();
-
- if (authorObj.has("persName")) {
- JSONObject persName = authorObj.getJSONObject("persName");
-
- if (persName.has("forename")) {
-
- Object foreNameObj = persName.get("forename");
-
- if (foreNameObj instanceof JSONObject) {
- parseNamePart((JSONObject) foreNameObj, author);
- } else if (foreNameObj instanceof JSONArray) {
- JSONArray foreName = persName.getJSONArray("forename");
-
- if (foreName.length() > 0) {
- for (int i = 0; i < foreName.length(); i++) {
- JSONObject namePart = foreName.getJSONObject(i);
- parseNamePart(namePart, author);
- }
- }
- }
- }
-
- if (persName.has("surname")) {
- author.setSurName(persName.getString("surname"));
+ Node persName = getFirstChild(authorNode.getChildNodes(), "persName");
+ if (persName != null) {
+ List<Node> forenames = getChildNodes(persName.getChildNodes(),
"forename");
+ for (Node forenameNode : forenames) {
+ parseNamePart(forenameNode, author);
}
-
- if (authorObj.has("affiliation")) {
- parseAffiliation(authorObj.get("affiliation"), author);
+ Node surnameNode = getFirstChild(persName.getChildNodes(),
"surname");
+ if (surnameNode != null) {
+ String surnameContent = surnameNode.getTextContent();
+ if (surnameContent != null) {
+ author.setSurName(surnameContent);
+ }
}
-
+ }
+ List<Node> affiliationNodes =
getChildNodes(authorNode.getChildNodes(), "affiliation");
+ for (Node affiliationNode : affiliationNodes) {
+ parseOneAffiliation(affiliationNode, author);
}
+
authorList.add(author);
}
- private void parseNamePart(JSONObject namePart, Author author) {
- if (namePart.has("type") && namePart.has("content")) {
- String type = namePart.getString("type");
- String content = namePart.getString("content");
+ private void parseNamePart(Node namePart, Author author) {
+ String type = getFirstAttribute(namePart, null, "type");
+ String content = namePart.getTextContent();
+ if (type != null && content != null) {
if (type.equals("first")) {
author.setFirstName(content);
@@ -297,79 +291,49 @@ public class TEIParser {
}
}
- private void parseAffiliation(Object affiliationJSON, Author author) {
- if (affiliationJSON instanceof JSONObject) {
- parseOneAffiliation((JSONObject) affiliationJSON, author);
- } else if (affiliationJSON instanceof JSONArray) {
- JSONArray affiliationArray = (JSONArray) affiliationJSON;
- if (affiliationArray != null && affiliationArray.length() > 0) {
- for (int i = 0; i < affiliationArray.length(); i++) {
- JSONObject affiliationObj =
affiliationArray.getJSONObject(i);
- parseOneAffiliation(affiliationObj, author);
- }
- }
- }
- }
-
- private void parseOneAffiliation(JSONObject affiliationObj, Author author)
{
+ private void parseOneAffiliation(Node affiliationNode, Author author) {
Affiliation affiliation = new Affiliation();
- if (affiliationObj.has("address")) {
- parseAddress(affiliationObj.getJSONObject("address"), affiliation);
- }
-
- if (affiliationObj.has("orgName")) {
- OrgName orgName = new OrgName();
- Object orgObject = affiliationObj.get("orgName");
- if (orgObject instanceof JSONObject) {
- parseOrgName((JSONObject) orgObject, orgName);
- } else if (orgObject instanceof JSONArray) {
- JSONArray orgNames = (JSONArray) orgObject;
- if (orgNames != null && orgNames.length() > 0) {
- for (int i = 0; i < orgNames.length(); i++) {
- parseOrgName(orgNames.getJSONObject(i), orgName);
- }
- }
-
- affiliation.setOrgName(orgName);
- }
+ Node address = getFirstChild(affiliationNode.getChildNodes(),
"address");
+ if (address != null) {
+ parseAddress(address, affiliation);
+ }
+ List<Node> orgNameNodes =
getChildNodes(affiliationNode.getChildNodes(), "orgName");
+ OrgName orgName = new OrgName();
+ for (Node orgNameNode : orgNameNodes) {
+ parseOrgName(orgNameNode, orgName);
}
+ affiliation.setOrgName(orgName);
author.getAffiliations().add(affiliation);
}
- private void parseAddress(JSONObject addressObj, Affiliation affiliation) {
+ private void parseAddress(Node addressNode, Affiliation affiliation) {
Address address = new Address();
-
- if (addressObj.has("region")) {
- address.setRegion(addressObj.getString("region"));
+ Node region = getFirstChild(addressNode.getChildNodes(), "region");
+ if (region != null && region.getTextContent() != null) {
+ address.setRegion(region.getTextContent());
}
-
- if (addressObj.has("postCode")) {
-
address.setPostCode(JSONObject.valueToString(addressObj.get("postCode")));
+ Node postCode = getFirstChild(addressNode.getChildNodes(), "postCode");
+ if (postCode != null && postCode.getTextContent() != null) {
+ address.setPostCode(postCode.getTextContent());
}
-
- if (addressObj.has("settlement")) {
- address.setSettlment(addressObj.getString("settlement"));
+ Node settlementNode = getFirstChild(addressNode.getChildNodes(),
"settlement");
+ if (settlementNode != null && settlementNode.getTextContent() != null)
{
+ address.setSettlment(settlementNode.getTextContent());
}
- if (addressObj.has("country")) {
+ Node countryNode = getFirstChild(addressNode.getChildNodes(),
"country");
+ if (countryNode != null) {
Country country = new Country();
- Object countryObj = addressObj.get("country");
-
- if (countryObj instanceof JSONObject) {
- JSONObject countryJson = addressObj.getJSONObject("country");
-
- if (countryJson.has("content")) {
- country.setContent(countryJson.getString("content"));
- }
-
- if (countryJson.has("key")) {
- country.setKey(countryJson.getString("key"));
- }
- } else if (countryObj instanceof String) {
- country.setContent((String) countryObj);
+ String key = getFirstAttribute(countryNode, null, "key");
+ if (key != null) {
+ country.setKey(key);
+ }
+ String content = countryNode.getTextContent();
+ if (content != null) {
+ country.setContent(content);
}
address.setCountry(country);
}
@@ -377,41 +341,40 @@ public class TEIParser {
affiliation.setAddress(address);
}
- private void parseOrgName(JSONObject orgObj, OrgName orgName) {
+ private void parseOrgName(Node orgNode, OrgName orgName) {
OrgTypeName typeName = new OrgTypeName();
- if (orgObj.has("content")) {
- typeName.setName(orgObj.getString("content"));
+ String orgContent = orgNode.getTextContent();
+ if (orgContent != null) {
+ typeName.setName(orgContent);
}
-
- if (orgObj.has("type")) {
- typeName.setType(orgObj.getString("type"));
+ String orgType = getFirstAttribute(orgNode, null, "type");
+ if (orgType != null) {
+ typeName.setType(orgType);
}
orgName.getTypeNames().add(typeName);
}
- private void parseProfileDesc(JSONObject profileDesc, Metadata metadata) {
- if (profileDesc.has("abstract")) {
- if (profileDesc.has("p")) {
- metadata.add("Abstract", profileDesc.getString("p"));
+ private void parseProfileDesc(Node profileDesc, Metadata metadata) {
+ Node abstractNode = getFirstChild(profileDesc.getChildNodes(),
"abstract");
+ if (abstractNode != null) {
+ Node pNode = getFirstChild(abstractNode.getChildNodes(), "p");
+ if (pNode != null) {
+ metadata.add("Abstract", pNode.getTextContent());
}
}
- if (profileDesc.has("textClass")) {
- JSONObject textClass = profileDesc.getJSONObject("textClass");
-
- if (textClass.has("keywords")) {
- Object keywordsObj = textClass.get("keywords");
- // test AJ15.pdf
- if (keywordsObj instanceof String) {
- metadata.add("Keyword", (String) keywordsObj);
- } else if (keywordsObj instanceof JSONObject) {
- JSONObject keywords = textClass.getJSONObject("keywords");
- if (keywords.has("term")) {
- JSONArray termArr = keywords.getJSONArray("term");
- for (int i = 0; i < termArr.length(); i++) {
- metadata.add("Keyword",
JSONObject.valueToString(termArr.get(i)));
- }
+ Node textClassNode = getFirstChild(profileDesc.getChildNodes(),
"textClass");
+ if (textClassNode != null) {
+ Node keywordsNode = getFirstChild(textClassNode.getChildNodes(),
"keywords");
+ if (keywordsNode != null) {
+ List<Node> terms = getChildNodes(keywordsNode.getChildNodes(),
"term");
+ if (terms.size() == 0) {
+ // test AJ15.pdf
+ metadata.add("Keyword", keywordsNode.getTextContent());
+ } else {
+ for (Node term : terms) {
+ metadata.add("Keyword", term.getTextContent());
}
}
@@ -452,8 +415,7 @@ public class TEIParser {
}
/**
- * @param surName
- * the surName to set
+ * @param surName the surName to set
*/
public void setSurName(String surName) {
this.surName = surName;
@@ -467,8 +429,7 @@ public class TEIParser {
}
/**
- * @param middleName
- * the middleName to set
+ * @param middleName the middleName to set
*/
public void setMiddleName(String middleName) {
this.middleName = middleName;
@@ -482,8 +443,7 @@ public class TEIParser {
}
/**
- * @param firstName
- * the firstName to set
+ * @param firstName the firstName to set
*/
public void setFirstName(String firstName) {
this.firstName = firstName;
@@ -497,8 +457,7 @@ public class TEIParser {
}
/**
- * @param affiliations
- * the affiliations to set
+ * @param affiliations the affiliations to set
*/
public void setAffiliations(List<Affiliation> affiliations) {
this.affiliations = affiliations;
@@ -537,8 +496,7 @@ public class TEIParser {
}
/**
- * @param orgName
- * the orgName to set
+ * @param orgName the orgName to set
*/
public void setOrgName(OrgName orgName) {
this.orgName = orgName;
@@ -552,8 +510,7 @@ public class TEIParser {
}
/**
- * @param address
- * the address to set
+ * @param address the address to set
*/
public void setAddress(Address address) {
this.address = address;
@@ -599,8 +556,7 @@ public class TEIParser {
}
/**
- * @param typeNames
- * the typeNames to set
+ * @param typeNames the typeNames to set
*/
public void setTypeNames(List<OrgTypeName> typeNames) {
this.typeNames = typeNames;
@@ -665,8 +621,7 @@ public class TEIParser {
}
/**
- * @param name
- * the name to set
+ * @param name the name to set
*/
public void setName(String name) {
this.name = name;
@@ -680,8 +635,7 @@ public class TEIParser {
}
/**
- * @param type
- * the type to set
+ * @param type the type to set
*/
public void setType(String type) {
this.type = type;
@@ -723,8 +677,7 @@ public class TEIParser {
}
/**
- * @param region
- * the region to set
+ * @param region the region to set
*/
public void setRegion(String region) {
this.region = region;
@@ -738,8 +691,7 @@ public class TEIParser {
}
/**
- * @param postCode
- * the postCode to set
+ * @param postCode the postCode to set
*/
public void setPostCode(String postCode) {
this.postCode = postCode;
@@ -753,8 +705,7 @@ public class TEIParser {
}
/**
- * @param settlment
- * the settlment to set
+ * @param settlment the settlment to set
*/
public void setSettlment(String settlment) {
this.settlment = settlment;
@@ -768,8 +719,7 @@ public class TEIParser {
}
/**
- * @param country
- * the country to set
+ * @param country the country to set
*/
public void setCountry(Country country) {
this.country = country;
@@ -835,8 +785,7 @@ public class TEIParser {
}
/**
- * @param key
- * the key to set
+ * @param key the key to set
*/
public void setKey(String key) {
this.key = key;
@@ -850,8 +799,7 @@ public class TEIParser {
}
/**
- * @param content
- * the content to set
+ * @param content the content to set
*/
public void setContent(String content) {
this.content = content;
@@ -893,6 +841,41 @@ public class TEIParser {
}
}
}
+ }
+
+ //returns first child with this name, null otherwise
+ private static Node getFirstChild(NodeList childNodes, String name) {
+ for (int i = 0; i < childNodes.getLength(); i++) {
+ Node n = childNodes.item(i);
+ if (n.getNodeName().equals(name)) {
+ return n;
+ }
+ }
+ return null;
+ }
+
+ private static String getFirstAttribute(Node node, String ns, String name)
{
+ if (node.hasAttributes()) {
+ NamedNodeMap attrs = node.getAttributes();
+ for (int i = 0; i < attrs.getLength(); i++) {
+ Node attr = attrs.item(i);
+ if (attr.getLocalName().equals(name)) {
+ return attr.getNodeValue();
+ }
+ }
+ }
+ return null;
+ }
+ private static List<Node> getChildNodes(NodeList childNodes, String
localName) {
+ List<Node> ret = new ArrayList<>();
+ for (int i = 0; i < childNodes.getLength(); i++) {
+ Node child = childNodes.item(i);
+ if (child.getLocalName() != null &&
child.getLocalName().equals(localName)) {
+ ret.add(child);
+ }
+ }
+ return ret;
}
+
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
index d42be94..6e9e854 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
@@ -16,12 +16,6 @@
*/
package org.apache.tika.parser.ner.corenlp;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.parser.ner.NERecogniser;
-import org.json.JSONObject;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
@@ -33,6 +27,13 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* This class offers an implementation of {@link NERecogniser} based on
* CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
@@ -150,7 +151,7 @@ public class CoreNLPNERecogniser implements NERecogniser {
return names;
}
- public static void main(String[] args) throws IOException {
+ public static void main(String[] args) throws IOException, JSONException {
if (args.length != 1) {
System.err.println("Error: Invalid Args");
System.err.println("This tool finds names inside text");
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java b/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java
index 0c456de..2b82af9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/journal/TEITest.java
@@ -27,28 +27,30 @@ import java.nio.charset.StandardCharsets;
import org.apache.tika.TikaTest;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.junit.Test;
public class TEITest extends TikaTest {
+
@Test
- public void testCurrent() throws Exception {
- TEIParser teiParser = new TEIParser();
+ public void testBasic() throws Exception {
+ TEIDOMParser teiParser = new TEIDOMParser();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (InputStream is =
getResourceAsStream("/test-documents/testTEI.xml")) {
IOUtils.copy(is, bos);
}
String xml = new String (bos.toByteArray(), StandardCharsets.UTF_8);
- Metadata metadata = teiParser.parse(xml);
+ Metadata metadata = teiParser.parse(xml, new ParseContext());
assertEquals("Montbonnot Saint-Martin, Montbonnot Saint-Martin,
Montbonnot Saint-Martin, " +
- "Montbonnot Saint-Martin, null \"38330, 38330, 38330, 38330\"
" +
+ "Montbonnot Saint-Martin, null 38330, 38330, 38330, 38330 " +
"France, France, France, France ", metadata.get("Address"));
String[] keywords = new String[]{
- "\"F22 [Analysis of Algorithms and Problem Complexity]:
Nonnumerical Algorithms and Problems\\u2014Sequencing\"",
- "\"and scheduling; D41 [Operating Systems]: Process
management\\u2014Scheduling, Concurrency\"",
- "\"Keywords\"",
- "\"Parallel Computing, Algorithms, Scheduling, Parallel
Tasks,\"",
- "\"Moldable Tasks, Bi-criteria\""
+ "F22 [Analysis of Algorithms and Problem Complexity]:
Nonnumerical Algorithms and Problems\u2014Sequencing",
+ "and scheduling; D41 [Operating Systems]: Process
management\u2014Scheduling, Concurrency",
+ "Keywords",
+ "Parallel Computing, Algorithms, Scheduling, Parallel Tasks,",
+ "Moldable Tasks, Bi-criteria"
};
assertArrayEquals(keywords, metadata.getValues("Keyword"));
assertEquals("Pierre-François Dutot 1 Lionel Eyraud 1 Grégory Gr´ 1
Grégory Mouní 1 Denis Trystram 1 ",
@@ -58,10 +60,10 @@ public class TEITest extends TikaTest {
assertEquals("1 ID-IMAG ID-IMAG ID-IMAG ID-IMAG",
metadata.get("Affiliation"));
assertEquals("[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG ,
" +
"address=Montbonnot Saint-Martin, Montbonnot
Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
- "null \"38330, 38330, 38330, 38330\" France, France,
France, France}" +
+ "null 38330, 38330, 38330, 38330 France, France,
France, France}" +
"[Affiliation {orgName=ID-IMAG ID-IMAG ID-IMAG ID-IMAG
, " +
"address=Montbonnot Saint-Martin, Montbonnot
Saint-Martin, Montbonnot Saint-Martin, Montbonnot Saint-Martin, " +
- "null \"38330, 38330, 38330, 38330\" France, France,
France, France}]",
+ "null 38330, 38330, 38330, 38330 France, France,
France, France}]",
metadata.get("FullAffiliations"));
}
}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].