Repository: tika Updated Branches: refs/heads/master e0ca3b5df -> a35320069
Grobid NER Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7e2c089c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7e2c089c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7e2c089c Branch: refs/heads/master Commit: 7e2c089c8e3f01fd60473517f64ed20887584527 Parents: d184e9b Author: AravindRam <[email protected]> Authored: Mon Apr 11 00:48:22 2016 -0700 Committer: AravindRam <[email protected]> Committed: Mon Apr 11 00:48:22 2016 -0700 ---------------------------------------------------------------------- .../parser/ner/grobid/GrobidNERecogniser.java | 223 +++++++++++++++++++ .../parser/ner/grobid/GrobidServer.properties | 17 ++ .../ner/grobid/GrobidNERecogniserTest.java | 66 ++++++ 3 files changed, 306 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java new file mode 100644 index 0000000..e4d7152 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java @@ -0,0 +1,223 @@ +package org.apache.tika.parser.ner.grobid; + +import java.util.Map; +import java.util.Set; + +import org.apache.tika.parser.ner.NERecogniser; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +import java.io.IOException; +import java.net.URLEncoder; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Properties; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import 
org.apache.cxf.jaxrs.client.WebClient;
+
+/**
+ * Named-entity recogniser that sends text to a GROBID REST server and turns the
+ * {@code measurements} array of its JSON reply into sets of entity names.
+ * <p>
+ * The server URL and the text endpoint are read once, at construction time, from
+ * {@code GrobidServer.properties} (keys {@code grobid.server.url} and
+ * {@code grobid.endpoint.text}). Built-in defaults are used when the file or a
+ * key is missing.
+ */
+public class GrobidNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(GrobidNERecogniser.class);
+    private static final String GROBID_PROPERTIES = "GrobidServer.properties";
+    private static final String GROBID_REST_HOST = "http://localhost:8080";
+    private static final String GROBID_REST_ENDPOINT = "/processQuantityText";
+
+    /*
+     * Useful Entities from Grobid NER.
+     * Treat this set as read-only: it is what getEntityTypes() hands out.
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>();
+    static {
+        ENTITY_TYPES.add("MEASUREMENT_NUMBERS");
+        ENTITY_TYPES.add("MEASUREMENT_UNITS");
+        ENTITY_TYPES.add("MEASUREMENTS");
+        ENTITY_TYPES.add("NORMALIZED_MEASUREMENTS");
+    }
+
+    private final String restHostUrlStr;
+    private final String restEndpoint;
+    // Per instance: reflects the probe made by this instance's constructor only.
+    private final boolean available;
+
+    /**
+     * Resolves the server URL and endpoint, then probes the server once with a GET
+     * request. The recogniser is marked available only when the probe answers 200.
+     */
+    public GrobidNERecogniser() {
+        Properties grobidProperties = loadGrobidProperties();
+        this.restHostUrlStr =
+                valueOrDefault(grobidProperties.getProperty("grobid.server.url"), GROBID_REST_HOST);
+        this.restEndpoint =
+                valueOrDefault(grobidProperties.getProperty("grobid.endpoint.text"), GROBID_REST_ENDPOINT);
+        this.available = probeServer();
+    }
+
+    /**
+     * Loads {@code GrobidServer.properties} from the classpath, next to this class.
+     *
+     * @return the loaded properties; empty when the resource is missing or unreadable
+     */
+    private static Properties loadGrobidProperties() {
+        Properties grobidProperties = new Properties();
+        // A null resource is legal in try-with-resources; it is simply not closed.
+        try (java.io.InputStream in = GrobidNERecogniser.class.getResourceAsStream(GROBID_PROPERTIES)) {
+            if (in == null) {
+                LOG.warn("{} not found on the classpath, using default Grobid settings", GROBID_PROPERTIES);
+            } else {
+                grobidProperties.load(in);
+            }
+        } catch (IOException e) {
+            LOG.warn("Could not read {}, using default Grobid settings", GROBID_PROPERTIES, e);
+        }
+        return grobidProperties;
+    }
+
+    /** Returns {@code value} unless it is null or empty, in which case {@code fallback}. */
+    private static String valueOrDefault(String value, String fallback) {
+        return (value == null || value.equals("")) ? fallback : value;
+    }
+
+    /** Issues a GET against the resolved server URL and reports whether it answered 200. */
+    private boolean probeServer() {
+        Response response = null;
+        try {
+            // Probe the resolved URL (property value or default), never the raw property.
+            response = WebClient.create(restHostUrlStr).accept(MediaType.APPLICATION_JSON).get();
+            if (response.getStatus() == 200) {
+                return true;
+            }
+            LOG.info("Grobid REST Server is not running");
+        } catch (Exception e) {
+            // Boundary catch: an unreachable server must not break construction.
+            LOG.warn(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+        return false;
+    }
+
+    /** Closes the response if there is one; failures while closing are ignored. */
+    private static void closeQuietly(Response response) {
+        if (response != null) {
+            try {
+                response.close();
+            } catch (Exception ignored) {
+                // Best effort: nothing useful can be done if releasing the connection fails.
+            }
+        }
+    }
+
+    /**
+     * @return {@code true} if server endpoint is available.
+     * returns {@code false} if server endpoint is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * Looks up {@code key} in {@code obj} and returns its value as a JSON array.
+     *
+     * @param obj JSON object to read from, may be {@code null}
+     * @param key name of the member expected to hold an array
+     * @return the array stored under {@code key}; an empty array when {@code obj} is
+     *         {@code null}, the key is absent, or the value is not an array
+     */
+    public JSONArray convertToJSONArray(JSONObject obj, String key) {
+        if (obj != null) {
+            Object value = obj.get(key);
+            if (value instanceof JSONArray) {
+                return (JSONArray) value;
+            }
+        }
+        return new JSONArray();
+    }
+
+    /**
+     * Parses a JSON String and converts it to a JSON Object
+     *
+     * @param jsonString text to parse, may be {@code null}
+     * @return the parsed object; an empty object when the text is {@code null}, cannot
+     *         be parsed, or its top-level value is not a JSON object
+     */
+    public JSONObject convertToJSONObject(String jsonString) {
+        if (jsonString != null) {
+            try {
+                Object parsed = new JSONParser().parse(jsonString);
+                if (parsed instanceof JSONObject) {
+                    return (JSONObject) parsed;
+                }
+            } catch (org.json.simple.parser.ParseException e) {
+                LOG.warn(e.getMessage(), e);
+            }
+        }
+        return new JSONObject();
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type to set of names; empty when {@code text} is
+     *         {@code null}, the request fails, or the server does not answer 200
+     */
+    public Map<String, Set<String>> recognise(String text) {
+
+        Map<String, Set<String>> entities = new HashMap<String, Set<String>>();
+        if (text == null) {
+            return entities;
+        }
+
+        Set<String> measurementNumberSet = new HashSet<String>();
+        Set<String> unitSet = new HashSet<String>();
+        Set<String> measurementSet = new HashSet<String>();
+        Set<String> normalizedMeasurementSet = new HashSet<String>();
+
+        Response response = null;
+        try {
+            String url = restHostUrlStr + restEndpoint + "?text=" + URLEncoder.encode(text, "UTF-8");
+            response = WebClient.create(url).accept(MediaType.APPLICATION_JSON).get();
+
+            if (response.getStatus() == 200) {
+                String result = response.readEntity(String.class);
+                JSONArray measurements = convertToJSONArray(convertToJSONObject(result), "measurements");
+                for (Object measurement : measurements) {
+                    if (!(measurement instanceof JSONObject)) {
+                        continue;
+                    }
+                    // Entries without a "quantity" object are skipped instead of aborting the whole call.
+                    Object quantity = ((JSONObject) measurement).get("quantity");
+                    if (quantity instanceof JSONObject) {
+                        collectQuantity((JSONObject) quantity, measurementNumberSet, unitSet,
+                                measurementSet, normalizedMeasurementSet);
+                    }
+                }
+                entities.put("MEASUREMENT_NUMBERS", measurementNumberSet);
+                entities.put("MEASUREMENT_UNITS", unitSet);
+                entities.put("MEASUREMENTS", measurementSet);
+                entities.put("NORMALIZED_MEASUREMENTS", normalizedMeasurementSet);
+            }
+        } catch (Exception e) {
+            // Boundary catch: recognition is best effort, a failed request yields no entities.
+            LOG.warn(e.getMessage(), e);
+        } finally {
+            closeQuietly(response);
+        }
+        // ENTITY_TYPES is deliberately left untouched: it is shared by every instance and
+        // must not shrink just because one request failed.
+        return entities;
+    }
+
+    /** Adds the raw and normalised value/unit of one "quantity" object to the result sets. */
+    private static void collectQuantity(JSONObject quantity, Set<String> numbers, Set<String> units,
+                                        Set<String> measurements, Set<String> normalizedMeasurements) {
+        String rawValue = asText(quantity.get("rawValue"));
+        String rawUnit = unitName(quantity.get("rawUnit"));
+        // The parser yields Long for integers and Double for decimals, so never cast to Long.
+        String normalizedValue = asText(quantity.get("normalizedQuantity"));
+        String normalizedUnit = unitName(quantity.get("normalizedUnit"));
+
+        if (rawValue != null) {
+            numbers.add(rawValue);
+        }
+        if (rawUnit != null) {
+            units.add(rawUnit);
+        }
+
+        String measurement = joinParts(rawValue, rawUnit);
+        if (!measurement.equals("")) {
+            measurements.add(measurement);
+        }
+        String normalizedMeasurement = joinParts(normalizedValue, normalizedUnit);
+        if (!normalizedMeasurement.equals("")) {
+            normalizedMeasurements.add(normalizedMeasurement);
+        }
+    }
+
+    /** Returns the "name" member of a unit object, or {@code null} when there is none. */
+    private static String unitName(Object unit) {
+        return (unit instanceof JSONObject) ? asText(((JSONObject) unit).get("name")) : null;
+    }
+
+    /** Returns the string form of a JSON value, or {@code null} for a missing value. */
+    private static String asText(Object value) {
+        return (value == null) ? null : value.toString();
+    }
+
+    /** Joins value and unit with one space; a missing part leaves no stray space behind. */
+    private static String joinParts(String value, String unit) {
+        if (value == null) {
+            return (unit == null) ? "" : unit;
+        }
+        return (unit == null) ? value : value + " " + unit;
+    }
+}
+
+
+
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
new file mode 100644
index 0000000..a7718ab
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -0,0 +1,17 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +grobid.server.url=http://localhost:8080 +grobid.endpoint.text=/processQuantityText \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java new file mode 100644 index 0000000..60279e9 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.tika.parser.ner.grobid;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+/**
+ * Test case for {@link GrobidNERecogniser}.
+ * <p>
+ * The assertions need a reachable GROBID REST server; the test is skipped when
+ * the recogniser reports that none is available.
+ */
+public class GrobidNERecogniserTest {
+
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        // Without a server nothing is extracted, so skip instead of reporting a false failure.
+        org.junit.Assume.assumeTrue("Grobid REST server is not available",
+                new GrobidNERecogniser().isAvailable());
+
+        String text = "I've lost one minute.";
+        String nerImplKey = NamedEntityParser.SYS_PROP_NER_IMPL;
+        // Remember the current value so the override does not leak into other tests.
+        String previousNerImpl = System.getProperty(nerImplKey);
+        System.setProperty(nerImplKey, GrobidNERecogniser.class.getName());
+        try (java.io.InputStream config = NamedEntityParser.class.getResourceAsStream("tika-config.xml")) {
+            Tika tika = new Tika(new TikaConfig(config));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            assertTrue(valuesOf(md, "NER_MEASUREMENT_NUMBERS").contains("one"));
+            assertTrue(valuesOf(md, "NER_MEASUREMENT_UNITS").contains("minute"));
+            assertTrue(valuesOf(md, "NER_MEASUREMENTS").contains("one minute"));
+            assertTrue(valuesOf(md, "NER_NORMALIZED_MEASUREMENTS").contains("60 s"));
+        } finally {
+            if (previousNerImpl == null) {
+                System.clearProperty(nerImplKey);
+            } else {
+                System.setProperty(nerImplKey, previousNerImpl);
+            }
+        }
+    }
+
+    /** Collects every value stored under {@code key} in the metadata into a set. */
+    private static HashSet<String> valuesOf(Metadata md, String key) {
+        return new HashSet<String>(Arrays.asList(md.getValues(key)));
+    }
+}
+
+
