Revision: 17236
http://sourceforge.net/p/gate/code/17236
Author: valyt
Date: 2014-01-17 15:31:02 +0000 (Fri, 17 Jan 2014)
Log Message:
-----------
Making progress: we can now index and search tokens, standard annotations, and
document-mode annotations.
Details:
- support for document sizes in MimirIndex.
- support for indexing Document MODE annotations
- we are now actually using the Bloom filters we create for the cluster members.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -28,6 +28,7 @@
import gate.mimir.index.mg4j.zipcollection.DocumentData;
import gate.mimir.search.QueryEngine;
import gate.util.GateRuntimeException;
+import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
import java.io.BufferedInputStream;
@@ -570,6 +571,16 @@
}
/**
+ * Gets the size (number of tokens) for a document.
+ * @param documentId the document being requested.
+ *
+ * @return
+ */
+ public int getDocumentSize(long documentId) {
+ return tokenIndexes[0].getIndex().sizes.get(documentId);
+ }
+
+ /**
* Marks a given document (identified by its ID) as deleted. Deleted documents
* are never returned as search results.
* @param documentId
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
2014-01-17 10:34:59 UTC (rev 17235)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -17,6 +17,10 @@
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
+import gate.FeatureMap;
+import gate.Node;
+import gate.annotation.AnnotationImpl;
+import gate.event.AnnotationListener;
import gate.mimir.IndexConfig;
import gate.mimir.MimirIndex;
import gate.mimir.SemanticAnnotationHelper;
@@ -31,6 +35,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.concurrent.BlockingQueue;
import it.unimi.di.big.mg4j.index.Index;
@@ -47,6 +52,18 @@
private final static Logger logger =
Logger.getLogger(AtomicAnnotationIndex.class);
/**
+ * A simple object of type {@link Annotation} that can be used as a special marker.
+ */
+ private static class ConstAnnotation extends AnnotationImpl {
+ private static final long serialVersionUID = 8224738902788616055L;
+ public ConstAnnotation() {
+ super(null, null, null, null, null);
+ }
+ }
+
+ private static final Annotation DOCUMENT_VIRTUAL_ANN = new ConstAnnotation();
+
+ /**
* The {@link IndexConfig} used by the {@link MimirIndex} that contains this
* mentions index.
*/
@@ -107,6 +124,36 @@
indexingThread.start();
}
+
+
+ @Override
+ protected void documentStarting(GATEDocument gateDocument)
+ throws IndexException {
+ for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
+ aHelper.documentStart(gateDocument.getDocument());
+ }
+ for(SemanticAnnotationHelper aHelper : documentHelpers){
+ aHelper.documentStart(gateDocument.getDocument());
+ }
+ }
+
+ @Override
+ protected void documentEnding(GATEDocument gateDocument)
+ throws IndexException {
+ for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
+ aHelper.documentEnd();
+ }
+ // index the document mode annotations
+ if(!documentHelpers.isEmpty()) {
+ processAnnotation(DOCUMENT_VIRTUAL_ANN, gateDocument);
+ }
+ for(SemanticAnnotationHelper aHelper : documentHelpers){
+ aHelper.documentEnd();
+ }
+ }
+
+
+
/* (non-Javadoc)
* @see
gate.mimir.index.AtomicIndex#getAnnotsToProcess(gate.mimir.index.mg4j.GATEDocument)
*/
@@ -139,7 +186,7 @@
@Override
protected void calculateStartPositionForAnnotation(Annotation ann,
GATEDocument gateDocument) throws IndexException {
- if(ann == null) {
+ if(ann == DOCUMENT_VIRTUAL_ANN) {
// we're supposed index the document metadata
tokenPosition = 0;
} else {
@@ -170,7 +217,7 @@
@Override
protected String[] calculateTermStringForAnnotation(Annotation ann,
GATEDocument gateDocument) throws IndexException {
- if(ann == null) {
+ if(ann == DOCUMENT_VIRTUAL_ANN) {
// obtain the URIs to be indexed for the *document* metadata
List<String> terms = new LinkedList<String>();
for(SemanticAnnotationHelper aHelper : documentHelpers) {
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -52,12 +52,15 @@
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntBigArrayBigList;
+import it.unimi.dsi.fastutil.ints.IntBigList;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
+import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
@@ -131,13 +134,6 @@
}
- private static final Callable<Long> noOpLong = new Callable<Long>() {
- @Override
- public Long call() throws Exception {
- return null;
- }
- };
-
private static final Callable<Void> noOpVoid = new Callable<Void>() {
@Override
public Void call() throws Exception {
@@ -353,16 +349,16 @@
protected File indexDir;
protected Index index;
protected BloomFilter<Void> termFilter;
- long numberOfDocuments;
- long numberOfTerms;
- long numberOfPostings;
- long numberOfOccurences;
- int maxCount;
+ protected long numberOfDocuments;
+ protected long numberOfTerms;
+ protected long numberOfPostings;
+ protected long numberOfOccurences;
+ protected int maxCount;
+
public MG4JIndex(Index index, File indexDir,
BloomFilter<Void> termFilter,
long numberOfDocuments, long numberOfTerms, long numberOfPostings,
long numberOfOccurences, int maxCount) {
- super();
this.index = index;
this.indexDir = indexDir;
this.termFilter = termFilter;
@@ -427,6 +423,8 @@
long numberOfOccurences =-1;
int maxCount =-1;
int indexIdx = 0;
+ IntBigList sizes = new IntBigArrayBigList();
+ BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
for(MG4JIndex aSubIndex : subIndexes) {
indexes[indexIdx] = aSubIndex.index;
@@ -438,12 +436,15 @@
numberOfPostings += aSubIndex.numberOfPostings;
numberOfOccurences += aSubIndex.numberOfOccurences;
if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
+ bloomFilters[indexIdx] = aSubIndex.termFilter;
+ sizes.addAll(aSubIndex.index.sizes);
indexIdx++;
}
+
return new DocumentalConcatenatedCluster(indexes,
new ContiguousDocumentalStrategy(cutPoints),
false, // flat = all component indexes have the same term list
- null, // Bloom Filters
+ bloomFilters, // Bloom Filters
numberOfDocuments,
numberOfTerms,
numberOfPostings,
@@ -454,7 +455,7 @@
true, // hasPositions,
termProcessor,
null, // field
- null, // sizes
+ sizes, // sizes
null // properties
);
}
@@ -1038,19 +1039,20 @@
String mg4jBasename = new File(subIndexDir, name).getAbsolutePath();
try {
try{
- newIndex = Index.getInstance(mg4jBasename + "?" +
- UriKeys.MAPPED.name().toLowerCase() + "=1;");
+ newIndex = Index.getInstance(
+ mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;",
+ true, true);
} catch(IOException e) {
// memory mapping failed
logger.info("Memory mapping failed for index " + mg4jBasename
+ ". Loading as file index instead");
- // now try to just open it as an on-disk index
+ // now try to open it as a plain on-disk index
newIndex = Index.getInstance(mg4jBasename, true, true);
}
} catch(Exception e) {
- throw new IndexException("Could not open the sub-index at" +
- mg4jBasename , e);
+ throw new IndexException("Could not open the sub-index at" +
mg4jBasename , e);
}
+ //read the Bloom filter
File bloomFile = new File(mg4jBasename +
DocumentalCluster.BLOOM_EXTENSION);
BloomFilter<Void> termFilter = null;
try {
@@ -1214,28 +1216,12 @@
try {
//process the annotations one by one.
for(Annotation ann : annotsToProcess){
- // calculate the position and string for this annotation
- calculateStartPositionForAnnotation(ann, gateDocument);
- String[] terms = calculateTermStringForAnnotation(ann, gateDocument);
- if(terms == null){
- //the value was already stored in #currentTerm by the implementation.
- indexCurrentTerm();
- }else if(terms.length == 0){
- //we received an empty array -> we should NOT index the current term
- }else{
- //we have received multiple values from the implementation
- for(String aTerm : terms){
- currentTerm.replace(aTerm == null ? "" : aTerm);
- indexCurrentTerm();
- }
- }
+ processAnnotation(ann, gateDocument);
}
// the current document is finished
int docLength = tokenPosition + 1;
if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
documentSizesInRAM.add(docLength);
- } catch (IOException e) {
- throw new IndexException("IO Exception while indexing", e);
} finally {
documentEnding(gateDocument);
documentPointer++;
@@ -1243,12 +1229,37 @@
}
}
+ /**
+ * Indexes one annotation (either a Token or a semantic annotation).
+ * @param ann the annotation to be indexed
+ * @param gateDocument the GATEDocument containing the annotation
+ * @throws IndexException
+ * @throws IOException
+ */
+ protected void processAnnotation(Annotation ann,
+ GATEDocument gateDocument) throws IndexException {
+ // calculate the position and string for this annotation
+ calculateStartPositionForAnnotation(ann, gateDocument);
+ String[] terms = calculateTermStringForAnnotation(ann, gateDocument);
+ if(terms == null){
+ //the value was already stored in #currentTerm by the implementation.
+ indexCurrentTerm();
+ }else if(terms.length == 0){
+ //we received an empty array -> we should NOT index the current term
+ }else{
+ //we have received multiple values from the implementation
+ for(String aTerm : terms){
+ currentTerm.replace(aTerm == null ? "" : aTerm);
+ indexCurrentTerm();
+ }
+ }
+ }
/**
* Adds the value in {@link #currentTerm} to the index.
* @throws IOException
*/
- protected void indexCurrentTerm() throws IOException {
+ protected void indexCurrentTerm() {
//check if we have seen this mention before
PostingsList termPostings = termMap.get(currentTerm);
if(termPostings == null){
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -103,12 +103,6 @@
* index.
*/
protected IndexReaderPool[] directIndexReaderPools;
-
- /**
- * The document sizes used during search time (if running in document mode) to
- * simulate document-spanning annotations.
- */
- private transient IntBigList documentSizes;
/**
* The maximum size of an index that can be loaded in memory (by default 64
@@ -125,7 +119,7 @@
/**
* The index being searched.
*/
- protected MimirIndex index;
+ protected final MimirIndex index;
/**
* The index configuration this index was built from.
@@ -311,17 +305,8 @@
public IndexReaderPool[] getDirectIndexes() {
return directIndexReaderPools;
}
-
+
/**
- * Gets the list of document sizes from one underlying MG4J index (all
- * sub-indexes should have the same sizes).
- * @return
- */
- public IntBigList getDocumentSizes() {
- return documentSizes;
- }
-
- /**
* Returns the index that stores the data for a particular feature of token
* annotations.
*
@@ -401,7 +386,16 @@
return null;
}
+
/**
+ * Gets the index this query engine is searching.
+ * @return
+ */
+ public MimirIndex getIndex() {
+ return index;
+ }
+
+ /**
* @return the index configuration for this index
*/
public IndexConfig getIndexConfig() {
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
2014-01-17 10:34:59 UTC (rev 17235)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -133,7 +133,7 @@
if(underlyingHit == null) return null;
long doc = underlyingHit.getDocumentId();
if(isInDocumentMode) {
- return new Binding(query, doc, 0,
engine.getDocumentSizes().getInt(doc),
+ return new Binding(query, doc, 0,
engine.getIndex().getDocumentSize(doc),
underlyingHit.getContainedBindings());
} else {
return new Binding(query, doc,
@@ -141,7 +141,6 @@
underlyingHit.getLength(),
underlyingHit.getContainedBindings());
}
-
}
@Override
Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
2014-01-17 15:31:02 UTC (rev 17236)
@@ -19,9 +19,13 @@
import it.unimi.di.big.mg4j.search.score.TfIdfScorer;
import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
import java.lang.reflect.InvocationTargetException;
import java.net.MalformedURLException;
import java.net.URI;
@@ -39,7 +43,10 @@
import org.apache.commons.configuration.ConfigurationException;
import gate.Document;
+import gate.Factory;
import gate.Gate;
+import gate.corpora.DocumentImpl;
+import gate.creole.ResourceData;
import gate.mimir.AbstractSemanticAnnotationHelper;
import gate.mimir.IndexConfig;
import gate.mimir.MimirIndex;
@@ -74,9 +81,9 @@
public class Scratch {
public static void main (String[] args) throws Exception {
- mainIndexer5(args);
+// mainIndexer5(args);
-// mainSimple(args);
+ mainSimple(args);
// mainDirectIndexes(args);
// mainBuildDirectIndex(args);
@@ -173,28 +180,37 @@
new File("../plugins/sparql").toURI().toURL());
MimirIndex mainIndex = new MimirIndex(new File(args[0]));
QueryEngine qEngine = mainIndex.getQueryEngine();
- String query = "electrical";
+
+// String query = "electrical";
// String query = "{Document date > 20070000}";
- QueryNode qNode = QueryParser.parse(query);
+// String query = "{Abstract}";
+
+ String[] queries = new String[]{"electrical", "the", "{Abstract}",
"{Document date > 20070000}"};
long start = System.currentTimeMillis();
NumberFormat nf = NumberFormat.getNumberInstance();
- long startLocal = System.currentTimeMillis();
- QueryExecutor qExecutor = qNode.getQueryExecutor(qEngine);
- long latestDoc = qExecutor.nextDocument(-1);
- int totalHitCount = 0;
- int docCount = 0;
- while(latestDoc >= 0) {
- docCount++;
- int hitCount = 0;
- while(qExecutor.nextHit() != null) hitCount++;
- totalHitCount += hitCount;
- System.out.println("Doc " + latestDoc + ", hits: " + hitCount);
- latestDoc = qExecutor.nextDocument(-1);
+ for(String query : queries) {
+ System.out.println("Query: " + query);
+ QueryNode qNode = QueryParser.parse(query);
+ long startLocal = System.currentTimeMillis();
+ QueryExecutor qExecutor = qNode.getQueryExecutor(qEngine);
+ long latestDoc = qExecutor.nextDocument(-1);
+ int totalHitCount = 0;
+ int docCount = 0;
+ while(latestDoc >= 0) {
+ docCount++;
+ int hitCount = 0;
+ while(qExecutor.nextHit() != null) hitCount++;
+ totalHitCount += hitCount;
+ System.out.println("Doc " + latestDoc + ", hits: " + hitCount);
+ latestDoc = qExecutor.nextDocument(-1);
+ }
+ System.out.println("Found " + nf.format(totalHitCount) + " hits in " +
+ nf.format(docCount) + " documents, in " +
+ nf.format(System.currentTimeMillis() - startLocal) + " ms.\n" +
+ "========================================================\n" +
+ "========================================================");
+ qExecutor.close();
}
- System.out.println("Found " + nf.format(totalHitCount) + " hits in " +
- nf.format(docCount) + " documents, in " +
- nf.format(System.currentTimeMillis() - startLocal) + " ms.");
- qExecutor.close();
System.out.println("Total time " +
nf.format(System.currentTimeMillis() - start) + " ms.");
mainIndex.close();
@@ -224,7 +240,7 @@
MimirIndex mainIndex = new MimirIndex(indexConfig);
- mainIndex.setOccurrencesPerBatch(500000);
+ mainIndex.setOccurrencesPerBatch(1000000);
// index some documents
File zipFile = new File(args[1]);
String fileURI = zipFile.toURI().toString();
@@ -233,6 +249,7 @@
int copies = 100;
boolean compress = false;
+ ResourceData docRd =
Gate.getCreoleRegister().get(DocumentImpl.class.getName());
while(entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
if(entry.isDirectory()) {
@@ -240,7 +257,15 @@
}
URL url = new URL("jar:" + fileURI + "!/" + entry.getName());
Document doc = gate.Factory.newDocument(url, "UTF-8");
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ ObjectOutputStream oos = new ObjectOutputStream(baos);
+ oos.writeObject(doc);
+ oos.close();
+ Factory.deleteResource(doc);
+ byte[] docBytes = baos.toByteArray();
for(int i = 0; i < copies; i++) {
+ doc = (Document) new ObjectInputStream(new
ByteArrayInputStream(docBytes)).readObject();
+ docRd.addInstantiation(doc);
mainIndex.indexDocument(doc);
}
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today.
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
gate-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs