[gate-cvs] SF.net SVN: gate:[17236] mimir/branches/5.0

valyt Fri, 17 Jan 2014 07:37:11 -0800

Revision: 17236
          http://sourceforge.net/p/gate/code/17236
Author:   valyt
Date:     2014-01-17 15:31:02 +0000 (Fri, 17 Jan 2014)
Log Message:
-----------
Making progress: we can now index and search tokens, standard annotations, and 
document-mode annotations.


Details:
- support for document sizes in MimirIndex.
- support for indexing document-mode annotations.
- we are now actually using the Bloom filters we create for the cluster members.

Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
    mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-17 15:31:02 UTC (rev 17236)
@@ -28,6 +28,7 @@
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
 import gate.mimir.search.QueryEngine;
 import gate.util.GateRuntimeException;
+import it.unimi.dsi.fastutil.ints.IntBigList;
 import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
 
 import java.io.BufferedInputStream;
@@ -570,6 +571,16 @@
   }
   
   /**
+   * Gets the size (number of tokens) for a document.
+   * @param documentId the document being requested.
+   * 
+   * @return
+   */
+  public int getDocumentSize(long documentId) {
+    return tokenIndexes[0].getIndex().sizes.get(documentId);
+  }
+  
+  /**
    * Marks a given document (identified by its ID) as deleted. Deleted 
documents
    * are never returned as search results.
    * @param documentId

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java   
    2014-01-17 10:34:59 UTC (rev 17235)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java   
    2014-01-17 15:31:02 UTC (rev 17236)
@@ -17,6 +17,10 @@
 import gate.Annotation;
 import gate.AnnotationSet;
 import gate.Document;
+import gate.FeatureMap;
+import gate.Node;
+import gate.annotation.AnnotationImpl;
+import gate.event.AnnotationListener;
 import gate.mimir.IndexConfig;
 import gate.mimir.MimirIndex;
 import gate.mimir.SemanticAnnotationHelper;
@@ -31,6 +35,7 @@
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 
 import it.unimi.di.big.mg4j.index.Index;
@@ -47,6 +52,18 @@
   private final static Logger logger = 
Logger.getLogger(AtomicAnnotationIndex.class);
   
   /**
+   * A simple object of type {@link Annotation} that can be used as a special 
marker.
+   */
+  private static class ConstAnnotation extends AnnotationImpl {
+    private static final long serialVersionUID = 8224738902788616055L;
+    public ConstAnnotation() {
+      super(null, null, null, null, null);
+    }
+  }
+  
+  private static final Annotation DOCUMENT_VIRTUAL_ANN = new ConstAnnotation();
+  
+  /**
    * The {@link IndexConfig} used by the {@link MimirIndex} that contains this
    * mentions index.
    */
@@ -107,6 +124,36 @@
     indexingThread.start();
   }
 
+  
+  
+  @Override
+  protected void documentStarting(GATEDocument gateDocument)
+      throws IndexException {
+    for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
+      aHelper.documentStart(gateDocument.getDocument());
+    }
+    for(SemanticAnnotationHelper aHelper : documentHelpers){
+      aHelper.documentStart(gateDocument.getDocument());
+    }
+  }
+
+  @Override
+  protected void documentEnding(GATEDocument gateDocument)
+      throws IndexException {
+    for(SemanticAnnotationHelper aHelper : annotationHelpers.values()){
+      aHelper.documentEnd();
+    }
+    // index the document mode annotations
+    if(!documentHelpers.isEmpty()) {
+      processAnnotation(DOCUMENT_VIRTUAL_ANN, gateDocument);
+    }
+    for(SemanticAnnotationHelper aHelper : documentHelpers){     
+      aHelper.documentEnd();
+    }
+  }
+
+
+
   /* (non-Javadoc)
    * @see 
gate.mimir.index.AtomicIndex#getAnnotsToProcess(gate.mimir.index.mg4j.GATEDocument)
    */
@@ -139,7 +186,7 @@
   @Override
   protected void calculateStartPositionForAnnotation(Annotation ann,
       GATEDocument gateDocument) throws IndexException {
-    if(ann == null) {
+    if(ann == DOCUMENT_VIRTUAL_ANN) {
       // we're supposed index the document metadata
       tokenPosition = 0;
     } else {
@@ -170,7 +217,7 @@
   @Override
   protected String[] calculateTermStringForAnnotation(Annotation ann,
       GATEDocument gateDocument) throws IndexException {
-    if(ann == null) {
+    if(ann == DOCUMENT_VIRTUAL_ANN) {
       // obtain the URIs to be indexed for the *document* metadata
       List<String> terms = new LinkedList<String>();
       for(SemanticAnnotationHelper aHelper : documentHelpers) {

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2014-01-17 15:31:02 UTC (rev 17236)
@@ -52,12 +52,15 @@
 import it.unimi.dsi.fastutil.Hash;
 import it.unimi.dsi.fastutil.Swapper;
 import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntBigArrayBigList;
+import it.unimi.dsi.fastutil.ints.IntBigList;
 import it.unimi.dsi.fastutil.ints.IntComparator;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.io.BinIO;
 import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
 import it.unimi.dsi.fastutil.longs.LongBigList;
 import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
+import it.unimi.dsi.io.InputBitStream;
 import it.unimi.dsi.io.OutputBitStream;
 import it.unimi.dsi.lang.MutableString;
 import it.unimi.dsi.logging.ProgressLogger;
@@ -131,13 +134,6 @@
     
   }
   
-  private static final Callable<Long> noOpLong = new Callable<Long>() {
-    @Override
-    public Long call() throws Exception {
-      return null;
-    }
-  };
-  
   private static final Callable<Void> noOpVoid = new Callable<Void>() {
     @Override
     public Void call() throws Exception {
@@ -353,16 +349,16 @@
     protected File indexDir;
     protected Index index;
     protected BloomFilter<Void> termFilter;
-    long numberOfDocuments;
-    long numberOfTerms;
-    long numberOfPostings;
-    long numberOfOccurences;
-    int maxCount;
+    protected long numberOfDocuments;
+    protected long numberOfTerms;
+    protected long numberOfPostings;
+    protected long numberOfOccurences;
+    protected int maxCount;
+    
     public MG4JIndex(Index index, File indexDir, 
         BloomFilter<Void> termFilter,
         long numberOfDocuments, long numberOfTerms, long numberOfPostings,
         long numberOfOccurences, int maxCount) {
-      super();
       this.index = index;
       this.indexDir = indexDir;
       this.termFilter = termFilter;
@@ -427,6 +423,8 @@
     long numberOfOccurences =-1;
     int maxCount =-1;
     int indexIdx = 0;
+    IntBigList sizes = new IntBigArrayBigList();
+    BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
     
     for(MG4JIndex aSubIndex : subIndexes) {
       indexes[indexIdx] = aSubIndex.index;
@@ -438,12 +436,15 @@
       numberOfPostings += aSubIndex.numberOfPostings;
       numberOfOccurences += aSubIndex.numberOfOccurences;
       if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
+      bloomFilters[indexIdx] = aSubIndex.termFilter;
+      sizes.addAll(aSubIndex.index.sizes);
       indexIdx++;
     }
+    
     return new DocumentalConcatenatedCluster(indexes,
           new ContiguousDocumentalStrategy(cutPoints),
           false, // flat = all component indexes have the same term list
-          null, // Bloom Filters
+          bloomFilters, // Bloom Filters
           numberOfDocuments, 
           numberOfTerms, 
           numberOfPostings, 
@@ -454,7 +455,7 @@
           true, // hasPositions, 
           termProcessor, 
           null, // field 
-          null, // sizes
+          sizes, // sizes
           null // properties
           );
   }  
@@ -1038,19 +1039,20 @@
     String mg4jBasename = new File(subIndexDir, name).getAbsolutePath(); 
     try {
       try{
-        newIndex = Index.getInstance(mg4jBasename + "?" +
-            UriKeys.MAPPED.name().toLowerCase() + "=1;");
+        newIndex = Index.getInstance(
+            mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;", 
+            true, true);
       } catch(IOException e) {
         // memory mapping failed
         logger.info("Memory mapping failed for index " + mg4jBasename
                 + ". Loading as file index instead");
-        // now try to just open it as an on-disk index
+        // now try to open it as a plain an on-disk index
         newIndex = Index.getInstance(mg4jBasename, true, true);
       }
     } catch(Exception e) {
-      throw new IndexException("Could not open the sub-index at" +
-         mg4jBasename , e);
+      throw new IndexException("Could not open the sub-index at" + 
mg4jBasename , e);
     }
+    //read the Bloom filter 
     File bloomFile = new File(mg4jBasename + 
DocumentalCluster.BLOOM_EXTENSION);
     BloomFilter<Void> termFilter = null;
     try {
@@ -1214,28 +1216,12 @@
     try {
       //process the annotations one by one.
       for(Annotation ann : annotsToProcess){
-        // calculate the position and string for this annotation
-        calculateStartPositionForAnnotation(ann, gateDocument);
-        String[] terms = calculateTermStringForAnnotation(ann, gateDocument);
-        if(terms == null){
-          //the value was already stored in #currentTerm by the implementation.
-          indexCurrentTerm();
-        }else if(terms.length == 0){
-          //we received an empty array -> we should NOT index the current term
-        }else{
-          //we have received multiple values from the implementation
-          for(String aTerm : terms){
-            currentTerm.replace(aTerm == null ? "" : aTerm);
-            indexCurrentTerm();
-          }
-        }
+        processAnnotation(ann, gateDocument);
       }
       // the current document is finished
       int docLength = tokenPosition + 1;
       if(docLength > maxDocSizeInRAM) maxDocSizeInRAM = docLength;
       documentSizesInRAM.add(docLength);
-    } catch (IOException e) {
-      throw new IndexException("IO Exception while indexing", e);
     } finally {
       documentEnding(gateDocument);
       documentPointer++;
@@ -1243,12 +1229,37 @@
     }
   }
   
+  /**
+   * Indexes one annotation (either a Token or a semantic annotation).
+   * @param ann the annotation to be indexed
+   * @param gateDocument the GATEDocument containing the annotation
+   * @throws IndexException
+   * @throws IOException
+   */
+  protected void processAnnotation(Annotation ann,
+      GATEDocument gateDocument) throws IndexException {
+    // calculate the position and string for this annotation
+    calculateStartPositionForAnnotation(ann, gateDocument);
+    String[] terms = calculateTermStringForAnnotation(ann, gateDocument);
+    if(terms == null){
+      //the value was already stored in #currentTerm by the implementation.
+      indexCurrentTerm();
+    }else if(terms.length == 0){
+      //we received an empty array -> we should NOT index the current term
+    }else{
+      //we have received multiple values from the implementation
+      for(String aTerm : terms){
+        currentTerm.replace(aTerm == null ? "" : aTerm);
+        indexCurrentTerm();
+      }
+    }
+  }
   
   /**
    * Adds the value in {@link #currentTerm} to the index.
    * @throws IOException 
    */
-  protected void indexCurrentTerm() throws IOException {
+  protected void indexCurrentTerm() {
     //check if we have seen this mention before
     PostingsList termPostings = termMap.get(currentTerm);
     if(termPostings == null){

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java        
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/QueryEngine.java        
2014-01-17 15:31:02 UTC (rev 17236)
@@ -103,12 +103,6 @@
    * index.
    */
   protected IndexReaderPool[] directIndexReaderPools;
-
-  /**
-   * The document sizes used during search time (if running in document mode) 
to
-   * simulate document-spanning annotations.
-   */
-  private transient IntBigList documentSizes;
   
   /**
    * The maximum size of an index that can be loaded in memory (by default 64
@@ -125,7 +119,7 @@
   /**
    * The index being searched.
    */
-  protected MimirIndex index;
+  protected final MimirIndex index;
 
   /**
    * The index configuration this index was built from.
@@ -311,17 +305,8 @@
   public IndexReaderPool[] getDirectIndexes() {
     return directIndexReaderPools;
   }  
-  
+
   /**
-   * Gets the list of document sizes from one underlying MG4J index (all 
-   * sub-indexes should have the same sizes).
-   * @return
-   */
-  public IntBigList getDocumentSizes() {
-    return documentSizes;
-  }
-  
-  /**
    * Returns the index that stores the data for a particular feature of token
    * annotations.
    * 
@@ -401,7 +386,16 @@
     return null;
   }
   
+  
   /**
+   * Gets the index this query engine is searching.
+   * @return
+   */
+  public MimirIndex getIndex() {
+    return index;
+  }
+
+  /**
    * @return the index configuration for this index
    */
   public IndexConfig getIndexConfig() {

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java  
    2014-01-17 10:34:59 UTC (rev 17235)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/AnnotationQuery.java  
    2014-01-17 15:31:02 UTC (rev 17236)
@@ -133,7 +133,7 @@
       if(underlyingHit == null) return null;
       long doc = underlyingHit.getDocumentId();
       if(isInDocumentMode) {
-        return new Binding(query, doc, 0, 
engine.getDocumentSizes().getInt(doc),
+        return new Binding(query, doc, 0, 
engine.getIndex().getDocumentSize(doc),
           underlyingHit.getContainedBindings());        
       } else {
         return new Binding(query, doc,
@@ -141,7 +141,6 @@
           underlyingHit.getLength(),
           underlyingHit.getContainedBindings());        
       }
-
     }
    
     @Override

Modified: mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java
===================================================================
--- mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java      
2014-01-17 10:34:59 UTC (rev 17235)
+++ mimir/branches/5.0/mimir-test/src/gate/mimir/test/Scratch.java      
2014-01-17 15:31:02 UTC (rev 17236)
@@ -19,9 +19,13 @@
 import it.unimi.di.big.mg4j.search.score.TfIdfScorer;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
 import java.lang.reflect.InvocationTargetException;
 import java.net.MalformedURLException;
 import java.net.URI;
@@ -39,7 +43,10 @@
 import org.apache.commons.configuration.ConfigurationException;
 
 import gate.Document;
+import gate.Factory;
 import gate.Gate;
+import gate.corpora.DocumentImpl;
+import gate.creole.ResourceData;
 import gate.mimir.AbstractSemanticAnnotationHelper;
 import gate.mimir.IndexConfig;
 import gate.mimir.MimirIndex;
@@ -74,9 +81,9 @@
 public class Scratch {
 
   public static void main (String[] args) throws Exception {
-    mainIndexer5(args);
+//    mainIndexer5(args);
     
-//     mainSimple(args);
+     mainSimple(args);
     
 //     mainDirectIndexes(args);
 //    mainBuildDirectIndex(args);
@@ -173,28 +180,37 @@
       new File("../plugins/sparql").toURI().toURL());
     MimirIndex mainIndex = new MimirIndex(new File(args[0]));
     QueryEngine qEngine = mainIndex.getQueryEngine();
-    String query = "electrical";
+    
+//    String query = "electrical";
 //    String query = "{Document date > 20070000}";
-    QueryNode qNode = QueryParser.parse(query);
+//    String query = "{Abstract}";
+   
+    String[] queries = new String[]{"electrical", "the", "{Abstract}", 
"{Document date > 20070000}"};
     long start = System.currentTimeMillis();
     NumberFormat nf = NumberFormat.getNumberInstance();
-    long startLocal = System.currentTimeMillis();
-    QueryExecutor qExecutor = qNode.getQueryExecutor(qEngine);
-    long latestDoc = qExecutor.nextDocument(-1);
-    int totalHitCount = 0;
-    int docCount = 0;
-    while(latestDoc >= 0) {
-      docCount++;
-      int hitCount = 0;
-      while(qExecutor.nextHit() != null) hitCount++;
-      totalHitCount += hitCount;
-      System.out.println("Doc " + latestDoc + ", hits: " + hitCount);
-      latestDoc = qExecutor.nextDocument(-1);
+    for(String query : queries) {
+      System.out.println("Query: " + query);
+      QueryNode qNode = QueryParser.parse(query);
+      long startLocal = System.currentTimeMillis();
+      QueryExecutor qExecutor = qNode.getQueryExecutor(qEngine);
+      long latestDoc = qExecutor.nextDocument(-1);
+      int totalHitCount = 0;
+      int docCount = 0;
+      while(latestDoc >= 0) {
+        docCount++;
+        int hitCount = 0;
+        while(qExecutor.nextHit() != null) hitCount++;
+        totalHitCount += hitCount;
+        System.out.println("Doc " + latestDoc + ", hits: " + hitCount);
+        latestDoc = qExecutor.nextDocument(-1);
+      }
+      System.out.println("Found " + nf.format(totalHitCount) + " hits in " +
+        nf.format(docCount) + " documents, in " +
+        nf.format(System.currentTimeMillis() - startLocal) + " ms.\n" +
+        "========================================================\n" + 
+        "========================================================");
+      qExecutor.close();      
     }
-    System.out.println("Found " + nf.format(totalHitCount) + " hits in " +
-      nf.format(docCount) + " documents, in " +
-      nf.format(System.currentTimeMillis() - startLocal) + " ms.");
-    qExecutor.close();
     System.out.println("Total time " +
       nf.format(System.currentTimeMillis() - start) + " ms.");
     mainIndex.close();
@@ -224,7 +240,7 @@
     
     
     MimirIndex mainIndex = new MimirIndex(indexConfig);
-    mainIndex.setOccurrencesPerBatch(500000);
+    mainIndex.setOccurrencesPerBatch(1000000);
     // index some documents
     File zipFile = new File(args[1]);
     String fileURI = zipFile.toURI().toString();
@@ -233,6 +249,7 @@
     
     int copies = 100;
     boolean compress = false;
+    ResourceData docRd = 
Gate.getCreoleRegister().get(DocumentImpl.class.getName());
     while(entries.hasMoreElements()) {
       ZipEntry entry = entries.nextElement();
       if(entry.isDirectory()) {
@@ -240,7 +257,15 @@
       }
       URL url = new URL("jar:" + fileURI + "!/" + entry.getName());
       Document doc = gate.Factory.newDocument(url, "UTF-8");
+      ByteArrayOutputStream baos = new ByteArrayOutputStream();
+      ObjectOutputStream oos = new ObjectOutputStream(baos);
+      oos.writeObject(doc);
+      oos.close();
+      Factory.deleteResource(doc);
+      byte[] docBytes = baos.toByteArray();
       for(int i = 0;  i < copies; i++) {
+        doc = (Document) new ObjectInputStream(new 
ByteArrayInputStream(docBytes)).readObject();
+        docRd.addInstantiation(doc);
         mainIndex.indexDocument(doc);
       }
     }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17236] mimir/branches/5.0

Reply via email to