Revision: 17319
http://sourceforge.net/p/gate/code/17319
Author: valyt
Date: 2014-02-17 10:59:19 +0000 (Mon, 17 Feb 2014)
Log Message:
-----------
A much simpler and cleaner way of dealing with zero-length (or
whitespace-only) tags. This also has the advantage that it is backward
compatible with data stored in existing indexes. All tags stored in existing
indexes will continue to work as they used to, which also means that zero-length
tags in existing indexes will still not be displayed correctly (as their length
is not represented correctly in those indexes).
Modified Paths:
--------------
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java
Modified:
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java
===================================================================
---
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java
2014-02-17 09:27:32 UTC (rev 17318)
+++
mimir/trunk/mimir-core/src/gate/mimir/index/OriginalMarkupMetadataHelper.java
2014-02-17 10:59:19 UTC (rev 17319)
@@ -29,6 +29,7 @@
import java.util.SortedMap;
import java.util.TreeMap;
+
import gate.Annotation;
import gate.AnnotationSet;
import gate.GateConstants;
@@ -46,8 +47,8 @@
* document at search time.
*
* The metadata saved by this class is stored in the main document metadata
map
- * using this class's name as a key. The value saved is a {@link DocumentTags}
- * instance populated with the tags data.
+ * using this class's name as a key. The value saved is itself a Map, with
+ * multiple metadata fields.
*/
public class OriginalMarkupMetadataHelper implements DocumentMetadataHelper,
DocumentRenderer {
@@ -88,22 +89,22 @@
//key = token offset for close tag
//value: list of tag IDs that end at that location
SortedMap<Integer, LinkedList<String>> spansToEnd =
- new TreeMap<Integer, LinkedList<String>>();
- Iterator<int[]> tagIter =
- docTags.tags != null ? docTags.tags.iterator() : null;
+ new TreeMap<Integer, LinkedList<String>>();
+ Iterator<int[]> tagIter = docTags.tags != null ?
+ docTags.tags.iterator() : null;
int[] currentTag = (tagIter != null && tagIter.hasNext()) ?
tagIter.next() : null;
Iterator<Binding> hitIter = hits != null ? hits.iterator() : null;
Binding currentHit = (hitIter != null && hitIter.hasNext()) ?
hitIter.next() : null;
- for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++) {
- if(docTags != null) {
+ for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++){
+ if(docTags != null){
//check if we need to open any tags here
while((currentTag != null && currentTag[1] == tokIdx) ||
- (currentHit != null && currentHit.getTermPosition() == tokIdx)) {
+ (currentHit != null && currentHit.getTermPosition() == tokIdx)){
//we need to open a tag or a hit
if(currentTag != null && currentTag[1] == tokIdx &&
- currentHit != null && currentHit.getTermPosition() == tokIdx) {
+ currentHit != null && currentHit.getTermPosition() == tokIdx){
//we have both a tag and a hit, starting at the same position
//we start the one that ends later, with a preference for a tag
//(as hits should be inner-most)
@@ -111,13 +112,11 @@
//consume the TAG
String openingTag = docTags.tagDescriptors.get(currentTag[0]);
output.append(openingTag);
-
String closingTag = getClosingTag(openingTag);
- if(currentTag[1] == currentTag[2]) {
+ if(currentTag[2] == -1) {
// zero-length tag
output.append(closingTag);
} else {
- // queue the closing tag for later
LinkedList<String> spans = spansToEnd.get(currentTag[2]);
if(spans == null){
spans = new LinkedList<String>();
@@ -131,7 +130,7 @@
}else{
//consume the HIT
output.append(HIT_OPENING_TAG);
- int spanEnd = currentHit.getTermPosition() +
currentHit.getLength();
+ int spanEnd = currentHit.getTermPosition() +
currentHit.getLength() -1;
LinkedList<String> spans = spansToEnd.get(spanEnd);
if(spans == null){
spans = new LinkedList<String>();
@@ -142,16 +141,15 @@
currentHit = (hitIter != null && hitIter.hasNext()) ?
hitIter.next() : null;
}
- } else if(currentTag != null && currentTag[1] == tokIdx) {
+ }else if(currentTag != null && currentTag[1] == tokIdx){
//we only have a TAG to use
String openingTag = docTags.tagDescriptors.get(currentTag[0]);
output.append(openingTag);
String closingTag = getClosingTag(openingTag);
- if(currentTag[1] == currentTag[2]) {
+ if(currentTag[2] == -1) {
// zero-length tag
output.append(closingTag);
} else {
- // queue the closing tag for later
LinkedList<String> spans = spansToEnd.get(currentTag[2]);
if(spans == null){
spans = new LinkedList<String>();
@@ -162,10 +160,10 @@
//consume the tag
currentTag = (tagIter != null && tagIter.hasNext()) ?
tagIter.next() : null;
- } else {
+ }else{
//we only have a HIT to use
output.append(HIT_OPENING_TAG);
- int spanEnd = currentHit.getTermPosition() +
currentHit.getLength();
+ int spanEnd = currentHit.getTermPosition() +
currentHit.getLength() -1;
LinkedList<String> spans = spansToEnd.get(spanEnd);
if(spans == null){
spans = new LinkedList<String>();
@@ -178,10 +176,11 @@
}
}
}
- // write the token
+ //write the token
output.append(tokens[tokIdx]);
- // check if we need to close any tags here
- while(spansToEnd.size() > 0 && spansToEnd.firstKey() == tokIdx + 1){
+
+ //check if we need to close any spans here
+ while(spansToEnd.size() > 0 && spansToEnd.firstKey() == tokIdx){
LinkedList<String> closingTags =
spansToEnd.remove(spansToEnd.firstKey());
for(String aTag : closingTags){
output.append(aTag);
@@ -189,20 +188,8 @@
}
//write the non-token, if any
if(tokIdx < nonTokens.length) output.append(nonTokens[tokIdx]);
+
}
- // write the last nonToken, if any
- if(tokens.length <= nonTokens.length){
- output.append(nonTokens[tokens.length - 1]);
- }
- // any remaining tags, are zero-length, after the last token
- while(currentTag != null) {
- String openingTag = docTags.tagDescriptors.get(currentTag[0]);
- output.append(openingTag);
- output.append(getClosingTag(openingTag));
- // consume the tag
- currentTag = (tagIter != null && tagIter.hasNext()) ?
- tagIter.next() : null;
- }
}
/* (non-Javadoc)
@@ -230,32 +217,31 @@
Annotation currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
long tagStart = currentTag == null ? -1 :
currentTag.getStartNode().getOffset();
long tagEnd = currentTag == null ? -1 :
currentTag.getEndNode().getOffset();
- for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++){
+ for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++) {
long tokStart = tokens[tokIdx].getStartNode().getOffset();
long tokEnd = tokens[tokIdx].getEndNode().getOffset();
- // see if there are any tags to close at this offset
- Long firstTagEnd = tagsToEnd.isEmpty() ? null : tagsToEnd.firstKey();
- while(tagsToEnd.size() > 0 && firstTagEnd <= tokStart) {
+ //see if there are any tags to close at this offset
+ while(tagsToEnd.size() > 0 && tagsToEnd.firstKey() <= tokStart){
//get all tags ending inside the previous token or the space before
the
//current token
- LinkedList<Integer> tags = tagsToEnd.remove(firstTagEnd);
+ LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
for(int aTag : tags){
- documentTags.tags.get(aTag)[2] = tokIdx;
+ documentTags.tags.get(aTag)[2] = tokIdx -1;
}
- firstTagEnd = tagsToEnd.isEmpty() ? null : tagsToEnd.firstKey();
}
//see if we need to save any tags at this offset
while(currentTag != null){
if(tagStart < tokEnd){
//the current tag starts within the current token
int tagDescId = getTagId(currentTag, documentTags);
- int[] newTag = new int[]{tagDescId, tokIdx, -1};
- documentTags.tags.add(newTag);
- // if the new tag is zero-length, we actually know its ending
position
- if(tagEnd <= tokStart) {
- newTag[2] = tokIdx;
+ documentTags.tags.add(new int[]{tagDescId, tokIdx, -1});
+ if(tagEnd <= tokStart){
+ // the tag starts and ends before the current token starts, so it's
+ // either zero-length, or whitespace-only
+ // leave the end position as -1.
} else {
- // we queue it, and we'll find the end position later
+ // not a zero-length tag,
+ // so we'll need to find the closing position later
LinkedList<Integer> tagsEnding = tagsToEnd.get(tagEnd);
if(tagsEnding == null){
tagsEnding = new LinkedList<Integer>();
@@ -275,7 +261,7 @@
}//for tokens
while(tagsToEnd.size() > 0){
//we did not close all tags yet
- int tokIdx = tokens.length;
+ int tokIdx = tokens.length -1;
LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
for(int aTag : tags){
documentTags.tags.get(aTag)[2] = tokIdx;
@@ -283,9 +269,9 @@
}
while(currentTag != null){
- // we did not exhaust all tags, we'll assign all remaining tags as
- // zero-length tags after the last token
- int tokIdx = tokens.length;
+ //we did not exhaust all tags, we'll assign all remaining tags to the
last
+ //token
+ int tokIdx = tokens.length -1;
int tagDescId = getTagId(currentTag, documentTags);
documentTags.tags.add(new int[]{tagDescId, tokIdx, tokIdx});
//update the current tag
@@ -439,10 +425,9 @@
* <ol>
* <li>the index in the {@link #tagDescriptors} array for the tag</li>
* <li>the start offset for the tag (in terms of token position);</li>
- * <li>the end offset for the tag (in terms of token position); This
- * corresponds to the first token that is <strong>not<strong> part of the
- * tag, hence it could point to a non-existent token for tags that include
- * the last token in the document.</li>
+ * <li>the end offset for the tag (in terms of token position); That is the
+ * position of the last token that is part of this tag. Zero-length tags
+ * are represented by setting this position to -1.</li>
* </ol>
*
*/
@@ -476,6 +461,7 @@
}
}
+
@Override
public String toString() {
StringBuffer str = new StringBuffer();
@@ -488,7 +474,7 @@
}
return str.toString();
}
-
+
/**
* A set used internally to ensure uniqueness of the tag descriptors.
*/
@@ -514,6 +500,8 @@
* appear in the correct document order.
*/
private List<int[]> tags;
+
+
}
/**
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience. Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs