Revision: 18422
http://sourceforge.net/p/gate/code/18422
Author: ian_roberts
Date: 2014-10-31 17:22:29 +0000 (Fri, 31 Oct 2014)
Log Message:
-----------
Added entities support to the corpus populator.
Also enhanced entity handling to allow annotation set names to be specified
in entity types. An entity type that does not contain a colon goes into the
"Original markups" annotation set as before, but a type containing a colon is
treated as "asName:annType" (with an empty asName denoting the default set):
{
  "text":"This is a test",
  "entities":{
    "Mention":[{"indices":[10,14],"inst":"urn:test:Orig-markups"}],
    ":Mention":[{"indices":[10,14],"inst":"urn:test:default-set"}],
    "Key:Mention":[{"indices":[10,14],"inst":"urn:test:Key-set"}]
  }
}
Modified Paths:
--------------
gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -110,11 +110,10 @@
DocumentContent newContent = new
DocumentContentImpl(concatenation.toString());
doc.edit(0L, doc.getContent().size(), newContent);
- AnnotationSet originalMarkups =
doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create Original markups annotations for each tweet
for (Tweet tweet : tweetStarts.keySet()) {
for (PreAnnotation preAnn : tweet.getAnnotations()) {
- preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet));
+ preAnn.toAnnotation(doc, tweetStarts.get(tweet));
}
}
}
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -11,12 +11,10 @@
*/
package gate.corpora.twitter;
-import gate.AnnotationSet;
import gate.Corpus;
import gate.Document;
import gate.DocumentContent;
import gate.Factory;
-import gate.Gate;
import gate.corpora.DocumentContentImpl;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
@@ -54,7 +52,7 @@
public static void populateCorpus(final Corpus corpus, URL inputUrl,
PopulationConfig config)
throws ResourceInstantiationException {
populateCorpus(corpus, inputUrl, config.getEncoding(),
config.getContentKeys(),
- config.getFeatureKeys(), config.getTweetsPerDoc());
+ config.getFeatureKeys(), config.getTweetsPerDoc(),
config.isProcessEntities());
}
/**
@@ -69,14 +67,19 @@
*/
public static void populateCorpus(final Corpus corpus, URL inputUrl, String
encoding, List<String> contentKeys,
List<String> featureKeys, int tweetsPerDoc) throws
ResourceInstantiationException {
-
+ populateCorpus(corpus, inputUrl, encoding, contentKeys, featureKeys,
tweetsPerDoc, true);
+ }
+
+ public static void populateCorpus(final Corpus corpus, URL inputUrl, String
encoding, List<String> contentKeys,
+ List<String> featureKeys, int tweetsPerDoc, boolean processEntities)
throws ResourceInstantiationException {
+
InputStream input = null;
try {
input = inputUrl.openStream();
// TODO Detect & handle gzipped input.
// TODO handling of entities, once there's GUI to control it
- TweetStreamIterator tweetSource = new TweetStreamIterator(input,
contentKeys, featureKeys, false, false);
+ TweetStreamIterator tweetSource = new TweetStreamIterator(input,
contentKeys, featureKeys, false, processEntities);
int tweetCounter = 0;
int tweetDocCounter = 0;
@@ -159,9 +162,8 @@
else {
DocumentContent contentImpl = new
DocumentContentImpl(content.toString());
document.setContent(contentImpl);
- AnnotationSet originalMarkups =
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
- preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
+ preAnn.toAnnotation(document, annotandaOffsets.get(preAnn));
}
corpus.add(document);
@@ -200,8 +202,7 @@
public void run() {
try {
for (URL fileUrl : fileUrls) {
- populateCorpus((Corpus) handle.getTarget(), fileUrl,
dialog.getEncoding(),
- dialog.getContentKeys(), dialog.getFeatureKeys(),
dialog.getTweetsPerDoc());
+ populateCorpus((Corpus) handle.getTarget(), fileUrl,
dialog.getConfig());
}
}
catch(ResourceInstantiationException e) {
Modified:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -31,9 +31,12 @@
public class PopulationConfig {
+ public static final String RESOURCE_CODE = "twitter.population.config";
+
private String encoding;
private List<String> featureKeys, contentKeys;
private int tweetsPerDoc;
+ private boolean processEntities = true;
public boolean getOneDocCheckbox() {
@@ -48,6 +51,14 @@
this.tweetsPerDoc = tpd;
}
+ public boolean isProcessEntities() {
+ return processEntities;
+ }
+
+ public void setProcessEntities(boolean entities) {
+ this.processEntities = entities;
+ }
+
public String getEncoding() {
return this.encoding;
}
@@ -91,8 +102,9 @@
* @param cks
* @param fks
*/
- public PopulationConfig(int tpd, String encoding, List<String> cks,
List<String> fks) {
+ public PopulationConfig(int tpd, boolean entities, String encoding,
List<String> cks, List<String> fks) {
this.tweetsPerDoc = tpd;
+ this.processEntities = entities;
this.encoding = encoding;
this.contentKeys = cks;
this.featureKeys = fks;
@@ -102,6 +114,7 @@
public void reload(File file) {
PopulationConfig source = PopulationConfig.load(file);
this.tweetsPerDoc = source.tweetsPerDoc;
+ this.processEntities = source.processEntities;
this.encoding = source.encoding;
this.contentKeys = source.contentKeys;
this.featureKeys = source.featureKeys;
@@ -110,6 +123,7 @@
public void reload(URL url) {
PopulationConfig source = PopulationConfig.load(url);
this.tweetsPerDoc = source.tweetsPerDoc;
+ this.processEntities = source.processEntities;
this.encoding = source.encoding;
this.contentKeys = source.contentKeys;
this.featureKeys = source.featureKeys;
@@ -143,8 +157,6 @@
class LoadConfigListener implements ActionListener {
- public static final String RESOURCE_CODE = "twitter.population.config";
-
PopulationDialogWrapper wrapper;
public LoadConfigListener(PopulationDialogWrapper wrapper) {
@@ -153,8 +165,8 @@
@Override
public void actionPerformed(ActionEvent arg0) {
- XJFileChooser chooser = MainFrame.getFileChooser();
- chooser.setResource(RESOURCE_CODE);
+ XJFileChooser chooser = new XJFileChooser();
+ chooser.setResource(PopulationConfig.RESOURCE_CODE);
chooser.setDialogTitle("Load XML configuration");
chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY);
int chosen = chooser.showOpenDialog(this.wrapper.dialog);
@@ -177,6 +189,7 @@
@Override
public void actionPerformed(ActionEvent event) {
XJFileChooser chooser = new XJFileChooser();
+ chooser.setResource(PopulationConfig.RESOURCE_CODE);
chooser.setDialogTitle("Save configuration as XML");
chooser.setFileSelectionMode(XJFileChooser.FILES_ONLY);
int chosen = chooser.showSaveDialog(this.wrapper.dialog);
Modified:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
===================================================================
---
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
2014-10-31 02:20:23 UTC (rev 18421)
+++
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationDialogWrapper.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -16,6 +16,10 @@
import gate.swing.XJFileChooser;
import gate.util.ExtensionFileFilter;
import gate.util.Strings;
+
+import java.awt.GridBagConstraints;
+import java.awt.GridBagLayout;
+import java.awt.Insets;
import java.awt.Window;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
@@ -42,7 +46,8 @@
protected JDialog dialog;
protected PopulationConfig config;
private JTextField encodingField;
- private JCheckBox checkbox;
+ private JCheckBox oneDocPerTweetCheckbox;
+ private JCheckBox entitiesCheckbox;
private XJFileChooser chooser;
private List<URL> fileUrls;
private ListEditor featureKeysEditor, contentKeysEditor;
@@ -57,39 +62,80 @@
dialog = new JDialog(MainFrame.getInstance(), "Populate from Twitter
JSON", true);
MainFrame.getGuiRoots().add(dialog);
dialog.getContentPane().setLayout(new BoxLayout(dialog.getContentPane(),
BoxLayout.Y_AXIS));
- dialog.add(Box.createVerticalStrut(3));
- Box encodingBox = Box.createHorizontalBox();
- JLabel encodingLabel = new JLabel("Encoding:");
+ GridBagLayout formLayout = new GridBagLayout();
+ JPanel formPanel = new JPanel(formLayout);
+ GridBagConstraints labelConstraints = new GridBagConstraints();
+ labelConstraints.gridx = 0;
+ labelConstraints.insets = new Insets(3, 3, 0, 3);
+ labelConstraints.anchor = GridBagConstraints.LINE_END;
+
+ GridBagConstraints componentConstraints = new GridBagConstraints();
+ componentConstraints.gridx = 1;
+ componentConstraints.gridwidth = GridBagConstraints.REMAINDER;
+ componentConstraints.insets = new Insets(3, 3, 0, 3);
+ componentConstraints.anchor = GridBagConstraints.LINE_START;
+ componentConstraints.weightx = 1.0;
+ componentConstraints.fill = GridBagConstraints.HORIZONTAL;
+
+
+ JLabel encodingLabel = new JLabel("Encoding");
encodingField = new JTextField(config.getEncoding());
- encodingBox.add(encodingLabel);
- encodingBox.add(encodingField);
- dialog.add(encodingBox);
- dialog.add(Box.createVerticalStrut(4));
+ formLayout.setConstraints(encodingLabel, labelConstraints);
+ formPanel.add(encodingLabel);
+ formLayout.setConstraints(encodingField, componentConstraints);
+ formPanel.add(encodingField);
- // Default is now 1 tweet per document; changed in PopulationConfig's
- // default constructor.
- Box checkboxBox = Box.createHorizontalBox();
- checkboxBox.setToolTipText("If unchecked, one document per file");
- JLabel checkboxLabel = new JLabel("One document per tweet");
- checkbox = new JCheckBox();
- checkbox.setSelected(config.getOneDocCheckbox());
- checkboxBox.add(checkboxLabel);
- checkboxBox.add(Box.createHorizontalGlue());
- checkboxBox.add(checkbox);
- dialog.add(checkboxBox);
- dialog.add(Box.createVerticalStrut(4));
+ // don't need horizontal fill for checkboxes
+ componentConstraints.fill = GridBagConstraints.NONE;
- contentKeysEditor = new ListEditor("Content keys: ",
config.getContentKeys());
+ JLabel odptCheckboxLabel = new JLabel("One document per tweet");
+ odptCheckboxLabel.setToolTipText("If unchecked, one document per file");
+ oneDocPerTweetCheckbox = new JCheckBox();
+ oneDocPerTweetCheckbox.setToolTipText("If unchecked, one document per
file");
+ oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox());
+ formLayout.setConstraints(odptCheckboxLabel, labelConstraints);
+ formPanel.add(odptCheckboxLabel);
+
+ formLayout.setConstraints(oneDocPerTweetCheckbox, componentConstraints);
+ formPanel.add(oneDocPerTweetCheckbox);
+
+ JLabel entitiesCheckboxLabel = new JLabel("Annotations for \"entities\"");
+ entitiesCheckboxLabel.setToolTipText("Create annotations based on the
\"entities\" property of the JSON");
+ entitiesCheckbox = new JCheckBox();
+ entitiesCheckbox.setToolTipText("Create annotations based on the
\"entities\" property of the JSON");
+ entitiesCheckbox.setSelected(config.isProcessEntities());
+ formLayout.setConstraints(entitiesCheckboxLabel, labelConstraints);
+ formPanel.add(entitiesCheckboxLabel);
+
+ formLayout.setConstraints(entitiesCheckbox, componentConstraints);
+ formPanel.add(entitiesCheckbox);
+
+ // restore horizontal fill
+ componentConstraints.fill = GridBagConstraints.HORIZONTAL;
+
+ JLabel contentKeysLabel = new JLabel("Content keys");
+ contentKeysLabel.setToolTipText("JSON key paths to be turned into
DocumentContent");
+ contentKeysEditor = new ListEditor(config.getContentKeys());
contentKeysEditor.setToolTipText("JSON key paths to be turned into
DocumentContent");
- dialog.add(contentKeysEditor);
- dialog.add(Box.createVerticalStrut(4));
+ formLayout.setConstraints(contentKeysLabel, labelConstraints);
+ formPanel.add(contentKeysLabel);
+ formLayout.setConstraints(contentKeysEditor, componentConstraints);
+ formPanel.add(contentKeysEditor);
- featureKeysEditor = new ListEditor("Feature keys: ",
config.getFeatureKeys());
+
+ JLabel featureKeysLabel = new JLabel("Feature keys");
+ featureKeysLabel.setToolTipText("JSON key paths to be turned into Tweet
annotation features");
+ featureKeysEditor = new ListEditor(config.getFeatureKeys());
featureKeysEditor.setToolTipText("JSON key paths to be turned into Tweet
annotation features");
- dialog.add(featureKeysEditor);
- dialog.add(Box.createVerticalStrut(6));
+ formLayout.setConstraints(featureKeysLabel, labelConstraints);
+ formPanel.add(featureKeysLabel);
+ formLayout.setConstraints(featureKeysEditor, componentConstraints);
+ formPanel.add(featureKeysEditor);
+ dialog.add(formPanel);
+ dialog.add(Box.createVerticalStrut(4));
+
Box configPersistenceBox = Box.createHorizontalBox();
configPersistenceBox.add(Box.createHorizontalGlue());
JButton loadConfigButton = new JButton("Load configuration");
@@ -129,34 +175,22 @@
}
- public String getEncoding() {
- return this.config.getEncoding();
- }
-
public List<URL> getFileUrls() throws MalformedURLException {
return this.fileUrls;
}
- public int getTweetsPerDoc() {
- return this.config.getTweetsPerDoc();
+ public PopulationConfig getConfig() {
+ return this.config;
}
- public List<String> getContentKeys() {
- return this.config.getContentKeys();
- }
-
- public List<String> getFeatureKeys() {
- return this.config.getFeatureKeys();
- }
-
-
protected void setNewConfig(PopulationConfig newConfig) {
this.config = newConfig;
this.updateGui();
}
protected void updateConfig() {
- this.config.setTweetsPerDoc(this.checkbox.isSelected() ? 1 : 0);
+ this.config.setTweetsPerDoc(this.oneDocPerTweetCheckbox.isSelected() ? 1 :
0);
+ this.config.setProcessEntities(this.entitiesCheckbox.isSelected());
this.config.setContentKeys(this.contentKeysEditor.getValues());
this.config.setFeatureKeys(this.featureKeysEditor.getValues());
this.config.setEncoding(this.encodingField.getText());
@@ -167,7 +201,8 @@
this.encodingField.setText(config.getEncoding());
this.contentKeysEditor.setValues(config.getContentKeys());
this.featureKeysEditor.setValues(config.getFeatureKeys());
- this.checkbox.setSelected(config.getOneDocCheckbox());
+ this.oneDocPerTweetCheckbox.setSelected(config.getOneDocCheckbox());
+ this.entitiesCheckbox.setSelected(config.isProcessEntities());
}
@@ -224,19 +259,16 @@
private JButton listButton;
private ListEditorDialog listEditor;
private List<String> values;
- private JLabel label;
private JTextField field;
@Override
public void setToolTipText(String text) {
super.setToolTipText(text);
- label.setToolTipText(text);
field.setToolTipText(text);
}
- public ListEditor(String labelString, List<String> initialValues) {
- label = new JLabel(labelString);
+ public ListEditor(List<String> initialValues) {
field = new JTextField();
values = initialValues;
field.setText(Strings.toString(initialValues));
@@ -260,7 +292,6 @@
});
this.setLayout(new BoxLayout(this, BoxLayout.X_AXIS));
- this.add(label);
this.add(field);
this.add(listButton);
}
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PreAnnotation.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -15,8 +15,10 @@
import gate.Annotation;
import gate.AnnotationSet;
+import gate.Document;
import gate.Factory;
import gate.FeatureMap;
+import gate.GateConstants;
import gate.util.InvalidOffsetException;
@@ -28,11 +30,16 @@
*/
public class PreAnnotation {
private FeatureMap features;
+ private String asName;
private String type;
private long start, end;
public PreAnnotation(long start, long end, String type, FeatureMap features)
{
+ this(start, end, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME, type,
features);
+ }
+
+ public PreAnnotation(long start, long end, String asName, String type,
FeatureMap features) {
if (features == null) {
this.features = Factory.newFeatureMap();
}
@@ -40,6 +47,7 @@
this.features = features;
}
+ this.asName = asName;
this.type = type;
this.setStart(start);
this.setEnd(end);
@@ -53,6 +61,9 @@
this.setEnd(end);
}
+ public Annotation toAnnotation(Document doc, long startOffset) throws
InvalidOffsetException {
+ return toAnnotation(doc.getAnnotations(asName), startOffset);
+ }
public Annotation toAnnotation(AnnotationSet outputAS, long startOffset)
throws InvalidOffsetException {
long outputStart = this.start + startOffset;
@@ -77,6 +88,10 @@
public void setFeatures(FeatureMap features) {
this.features = features;
}
+
+ public String getASName() {
+ return asName;
+ }
public String getType() {
return this.type;
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2014-10-31 02:20:23 UTC (rev 18421)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2014-10-31 17:22:29 UTC (rev 18422)
@@ -154,6 +154,8 @@
String entityType = entityTypes.next();
JsonNode entitiesOfType = entitiesNode.get(entityType);
if(entitiesOfType != null && entitiesOfType.isArray() &&
entitiesOfType.size() > 0) {
+ // if the entityType is X:Y then assume X is the AS name and Y is the
actual type
+ String[] setAndType = entityType.split(":", 2);
Iterator<JsonNode> it = entitiesOfType.elements();
while(it.hasNext()) {
JsonNode entity = it.next();
@@ -166,8 +168,14 @@
if(indicesList.get(0) instanceof Number && indicesList.get(1)
instanceof Number) {
// finally we know we have a valid entity
features.remove("indices");
- annotations.add(new PreAnnotation(startOffset +
((Number)indicesList.get(0)).longValue(),
- startOffset +
((Number)indicesList.get(1)).longValue(), entityType, features));
+ long annStart = startOffset +
((Number)indicesList.get(0)).longValue();
+ long annEnd = startOffset +
((Number)indicesList.get(1)).longValue();
+ if(setAndType.length == 2) {
+ // explicit annotation set name
+ annotations.add(new PreAnnotation(annStart, annEnd,
setAndType[0], setAndType[1], features));
+ } else {
+ annotations.add(new PreAnnotation(annStart, annEnd,
entityType, features));
+ }
}
}
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs