mimir

valyt Wed, 29 Jan 2014 05:17:21 -0800

Revision: 17254
          http://sourceforge.net/p/gate/code/17254
Author:   valyt
Date:     2014-01-29 13:16:11 +0000 (Wed, 29 Jan 2014)
Log Message:
-----------
More work on supporting direct indexes:
- a direct index cluster is now maintained during the indexing process, in a 
fashion similar to the inverted index
- direct indexes are now compacted as well during the overall index 
compaction
- we now use explicit inverted~ and direct~ variable names to distinguish between 
the two types of index.


Modified Paths:
--------------
    
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java   
    2014-01-29 12:01:38 UTC (rev 17253)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java   
    2014-01-29 13:16:11 UTC (rev 17254)
@@ -116,9 +116,6 @@
       theHelper.init(this);
     }
     offsetComparator = new OffsetComparator();
-    // create and save the term processor
-    additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
-        ObjectParser.toSpec(termProcessor));
     // start the indexing thread
     indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
     indexingThread.start();

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2014-01-29 12:01:38 UTC (rev 17253)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java 
2014-01-29 13:16:11 UTC (rev 17254)
@@ -35,8 +35,10 @@
 import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
 import it.unimi.di.big.mg4j.index.Index.UriKeys;
 import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
+import it.unimi.di.big.mg4j.index.cluster.ContiguousLexicalStrategy;
 import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
 import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
+import it.unimi.di.big.mg4j.index.cluster.LexicalCluster;
 import it.unimi.di.big.mg4j.io.IOFactory;
 import it.unimi.di.big.mg4j.tool.Combine;
 import it.unimi.di.big.mg4j.tool.Concatenate;
@@ -388,26 +390,25 @@
    */
   protected static class MG4JIndex {
     protected File indexDir;
-    protected Index index;
-    protected BloomFilter<Void> termFilter;
-    protected long numberOfDocuments;
-    protected long numberOfTerms;
-    protected long numberOfPostings;
-    protected long numberOfOccurences;
-    protected int maxCount;
+    protected Index invertedIndex;
+    protected Index directIndex;
+    protected BloomFilter<Void> invertedTermFilter;
+    protected BloomFilter<Void> directTermFilter;
+
     
-    public MG4JIndex(Index index, File indexDir, 
-        BloomFilter<Void> termFilter,
-        long numberOfDocuments, long numberOfTerms, long numberOfPostings,
-        long numberOfOccurences, int maxCount) {
-      this.index = index;
+    public MG4JIndex(
+        File indexDir,
+        Index invertedIndex,  
+        BloomFilter<Void> invertedTermFilter,
+        Index directIndex,
+        BloomFilter<Void> directTermFilter) {
+      
       this.indexDir = indexDir;
-      this.termFilter = termFilter;
-      this.numberOfDocuments = numberOfDocuments;
-      this.numberOfTerms = numberOfTerms;
-      this.numberOfPostings = numberOfPostings;
-      this.numberOfOccurences = numberOfOccurences;
-      this.maxCount = maxCount;
+      this.invertedIndex = invertedIndex;
+      this.invertedTermFilter = invertedTermFilter;
+      
+      this.directIndex = directIndex;
+      this.directTermFilter = directTermFilter;
     }
   }
   
@@ -452,7 +453,7 @@
       TermProcessor termProcessor){
     
     if(subIndexes == null || subIndexes.size() == 0) return null;
-    if(subIndexes.size() == 1) return subIndexes.get(0).index;
+    if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
     
     // prepare the documental cluster
     Index[] indexes = new Index[subIndexes.size()];
@@ -468,17 +469,20 @@
     BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
     
     for(MG4JIndex aSubIndex : subIndexes) {
-      indexes[indexIdx] = aSubIndex.index;
+      indexes[indexIdx] = aSubIndex.invertedIndex;
       if(indexIdx < cutPoints.length - 1) {
-        cutPoints[indexIdx + 1] = cutPoints[indexIdx] + 
aSubIndex.numberOfDocuments;
+        cutPoints[indexIdx + 1] = cutPoints[indexIdx] + 
+            aSubIndex.invertedIndex.numberOfDocuments;
       }
-      numberOfTerms += aSubIndex.numberOfTerms;
-      numberOfDocuments += aSubIndex.numberOfDocuments;
-      numberOfPostings += aSubIndex.numberOfPostings;
-      numberOfOccurences += aSubIndex.numberOfOccurences;
-      if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
-      bloomFilters[indexIdx] = aSubIndex.termFilter;
-      sizes.addAll(aSubIndex.index.sizes);
+      numberOfTerms += aSubIndex.invertedIndex.numberOfTerms;
+      numberOfDocuments += aSubIndex.invertedIndex.numberOfDocuments;
+      numberOfPostings += aSubIndex.invertedIndex.numberOfPostings;
+      numberOfOccurences += aSubIndex.invertedIndex.numberOfOccurrences;
+      if(maxCount < aSubIndex.invertedIndex.maxCount){
+        maxCount = aSubIndex.invertedIndex.maxCount;
+      }
+      bloomFilters[indexIdx] = aSubIndex.invertedTermFilter;
+      sizes.addAll(aSubIndex.invertedIndex.sizes);
       indexIdx++;
     }
     
@@ -501,6 +505,68 @@
           );
   }  
   
+  
+  protected final static Index openDirectIndexCluster(
+      List<MG4JIndex> subIndexes){
+    
+    if(subIndexes == null || subIndexes.size() == 0) return null;
+    if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
+    
+    // prepare the lexical cluster
+    Index[] indexes = new Index[subIndexes.size()];
+    int[] cutPoints = new int[indexes.length];
+    cutPoints[0] = 0;
+    String[] cutPointTerms = new String[indexes.length];
+    cutPointTerms[0] = longToTerm(0);
+    int numberOfTerms = -1;
+    int numberOfDocuments = -1;
+    long numberOfPostings = -1;
+    long numberOfOccurences =-1;
+    int maxCount =-1;
+    int indexIdx = 0;
+    IntBigList sizes = new IntBigArrayBigList();
+    BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
+    
+    for(MG4JIndex aSubIndex : subIndexes) {
+      indexes[indexIdx] = aSubIndex.directIndex;
+      // we build this based on the inverted index, as the cut-points for the
+      // lexical partitioning are based on document IDs
+      if(indexIdx < cutPoints.length - 1) {
+        cutPoints[indexIdx + 1] = cutPoints[indexIdx] + 
+            (int)aSubIndex.invertedIndex.numberOfDocuments;
+        cutPointTerms[indexIdx + 1] = longToTerm(cutPoints[indexIdx + 1]);
+      }
+      numberOfTerms += aSubIndex.directIndex.numberOfTerms;
+      numberOfDocuments += aSubIndex.directIndex.numberOfDocuments;
+      numberOfPostings += aSubIndex.directIndex.numberOfPostings;
+      numberOfOccurences += aSubIndex.directIndex.numberOfOccurrences;
+      if(maxCount < aSubIndex.directIndex.maxCount){
+        maxCount = aSubIndex.directIndex.maxCount;
+      }
+      bloomFilters[indexIdx] = aSubIndex.directTermFilter;
+      sizes.addAll(aSubIndex.directIndex.sizes);
+      indexIdx++;
+    }
+    cutPointTerms[cutPointTerms.length - 1] = null;
+    
+    return new LexicalCluster(indexes,
+          new ContiguousLexicalStrategy(cutPoints, cutPointTerms),
+          bloomFilters, // Bloom Filters
+          numberOfDocuments, 
+          numberOfTerms, 
+          numberOfPostings, 
+          numberOfOccurences, 
+          maxCount, 
+          null, // payload
+          true, // hasCounts 
+          false, // hasPositions, 
+          NullTermProcessor.getInstance(), 
+          null, // field 
+          sizes, // sizes
+          null // properties
+          );
+  }  
+  
   /**
    * Converts a long value into a String containing a zero-padded Hex 
    * representation of the input value. The lexicographic ordering of the 
@@ -541,7 +607,14 @@
   
   
   public static final String DIRECT_TERMS_FILENAME = "direct.terms";
+  
   /**
+   * Files belonging to the direct index get this suffix added to their 
+   * basename.
+   */
+  public static final String DIRECT_INDEX_NAME_SUFFIX = "-dir";
+  
+  /**
    * The file name (under the current directory for this atomic index) for the
    * directory containing the documents that have been queued for indexing, 
but 
    * not yet indexed. 
@@ -751,10 +824,12 @@
     this.currentTerm = new MutableString();
     
     this.additionalProperties = new Properties();
-    
+    // save the term processor
+    additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
+        ObjectParser.toSpec(termProcessor));
     if(hasDirectIndex) {
       additionalDirectProperties = new Properties();
-      additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
+      additionalDirectProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR, 
           ObjectParser.toSpec(NullTermProcessor.getInstance()));
     }
     initIndex();
@@ -816,8 +891,7 @@
         }
       }
       synchronized(this) {
-        //TODO
-        // open direct index cluster
+        directIndex = openDirectIndexCluster(subIndexes);
       }
     }
        }
@@ -1004,8 +1078,7 @@
         subIndexes.add(openSubIndex(newTailName));
         invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
         if(hasDirectIndex) {
-          // TODO
-          // merge the new direct batch into the direct cluster
+          directIndex = openDirectIndexCluster(subIndexes);
         }
       }
     } catch(Exception e) {
@@ -1080,7 +1153,8 @@
     }
     
     // 2. write the data from RAM
-    String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
+    String mg4jBasename = new File(batchDir, name + 
+        DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
     // copy the default compression flags, and remove positions
     Map<Component, Coding> flags = new HashMap<Component, Coding>(
         CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
@@ -1246,11 +1320,45 @@
     } catch(Exception e) {
       throw new IndexException("Exception while combining sub-indexes", e);
     }
+
+    if(hasDirectIndex()) {
+      codingFlags = new HashMap<Component, Coding>(
+          CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX); 
+      codingFlags.remove(Component.POSITIONS);
+      outputBaseName = new File(headDirNew, name + 
+          DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
+      
+      inputBaseNames = new String[indexesToMerge.size()];
+      for(int i = 0; i < inputBaseNames.length; i++) {
+        inputBaseNames[i] = new File(indexesToMerge.get(i).indexDir, 
+            name + DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath(); 
+      }
+      try {
+        new Concatenate(
+            IOFactory.FILESYSTEM_FACTORY,
+            outputBaseName,
+            inputBaseNames,
+            false, // metadataOnly 
+            Combine.DEFAULT_BUFFER_SIZE, 
+            codingFlags,
+            IndexType.QUASI_SUCCINCT,
+            true, // skips
+            // BitStreamIndex.DEFAULT_QUANTUM,
+            // replaced with optimised automatic calculation
+            -5, 
+            BitStreamIndex.DEFAULT_HEIGHT, 
+            SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE, 
+            ProgressLogger.DEFAULT_LOG_INTERVAL).run();
+        // generate term map
+        generateTermMap(new File(outputBaseName + 
DiskBasedIndex.TERMS_EXTENSION), 
+            new File(outputBaseName +  DiskBasedIndex.TERMMAP_EXTENSION),
+            new File(outputBaseName +  DocumentalCluster.BLOOM_EXTENSION));
+      } catch(Exception e) {
+        throw new IndexException("Exception while combining direct 
sub-indexes", 
+            e);
+      }
+    }    
          
-         if(hasDirectIndex()) {
-           // TODO
-         }
-
          // update the internal state
     synchronized(this) {
       // remove the indexes that were merged
@@ -1262,7 +1370,9 @@
         if(headDirNew.renameTo(headDir)) {
           subIndexes.add(0, openSubIndex(HEAD_FILE_NAME));
           invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
-          
+          if(hasDirectIndex) {
+            directIndex =openDirectIndexCluster(subIndexes);
+          }
           // clean-up: delete old head, used-up tails
           if(!gate.util.Files.rmdir(headDirOld)) {
             throw new IndexException(
@@ -1333,12 +1443,12 @@
         * @throws IndexException 
         */
        protected MG4JIndex openSubIndex(String subIndexDirname) throws 
IOException, IndexException {
-    Index newIndex = null;
+    Index invertedIndex = null;
     File subIndexDir = new File(indexDirectory, subIndexDirname);
     String mg4jBasename = new File(subIndexDir, name).getAbsolutePath(); 
     try {
       try{
-        newIndex = Index.getInstance(
+        invertedIndex = Index.getInstance(
             mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;", 
             true, true);
       } catch(IOException e) {
@@ -1346,28 +1456,58 @@
         logger.info("Memory mapping failed for index " + mg4jBasename
                 + ". Loading as file index instead");
         // now try to open it as a plain an on-disk index
-        newIndex = Index.getInstance(mg4jBasename, true, true);
+        invertedIndex = Index.getInstance(mg4jBasename, true, true);
       }
     } catch(Exception e) {
       throw new IndexException("Could not open the sub-index at" + 
mg4jBasename , e);
     }
     //read the Bloom filter 
     File bloomFile = new File(mg4jBasename + 
DocumentalCluster.BLOOM_EXTENSION);
-    BloomFilter<Void> termFilter = null;
+    BloomFilter<Void> invertedTermFilter = null;
     try {
       if(bloomFile.exists()) {
-        termFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
+        invertedTermFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
       }
     } catch(ClassNotFoundException e) {
       // this should never happen. If it does, it's not fatal
       logger.warn("Exception wile loading stre Bloom Filter", e);
     }
-    MG4JIndex newIndexData = new MG4JIndex(newIndex, subIndexDir, termFilter, 
-        newIndex.numberOfDocuments,
-        newIndex.numberOfTerms,
-        newIndex.numberOfPostings,
-        newIndex.numberOfOccurrences,
-        newIndex.maxCount);
+    
+    // open direct index
+    Index directIndex = null;
+    mg4jBasename = new File(subIndexDir, name + 
+        DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath(); 
+    try {
+      try{
+        directIndex = Index.getInstance(
+            mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;", 
+            true, true);
+      } catch(IOException e) {
+        // memory mapping failed
+        logger.info("Memory mapping failed for index " + mg4jBasename
+                + ". Loading as file index instead");
+        // now try to open it as a plain an on-disk index
+        directIndex = Index.getInstance(mg4jBasename, true, true);
+      }
+    } catch(Exception e) {
+      throw new IndexException("Could not open the sub-index at" + 
mg4jBasename , e);
+    }
+    //read the Bloom filter 
+    bloomFile = new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION);
+    BloomFilter<Void> directTermFilter = null;
+    try {
+      if(bloomFile.exists()) {
+        directTermFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
+      }
+    } catch(ClassNotFoundException e) {
+      // this should never happen. If it does, it's not fatal
+      logger.warn("Exception wile loading stre Bloom Filter", e);
+    }
+    
+    
+    MG4JIndex newIndexData = new MG4JIndex(subIndexDir, 
+        invertedIndex, invertedTermFilter, 
+        directIndex, directTermFilter);
          return newIndexData;
        }
        
@@ -1647,7 +1787,7 @@
   }
   
   public long getDirectTermOccurenceCount(long directTermId) throws 
IOException {
-    //TODO: copy from IndexReaderPool
-    return -1;
+    String termStr = directTerms.get(directTermId);
+    return invertedIndex.documents(termStr).count();
   }
 }

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java    
2014-01-29 12:01:38 UTC (rev 17253)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java    
2014-01-29 13:16:11 UTC (rev 17254)
@@ -107,7 +107,7 @@
     
     /**
      * @param node
-     * @param index
+     * @param invertedIndex
      * @throws IOException if the index files cannot be accessed.
      */
     public TermQueryExecutor(TermQuery node, QueryEngine engine) throws 
IOException {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
WatchGuard Dimension instantly turns raw network data into actionable 
security intelligence. It gives you real-time visual feedback on key
security issues and trends.  Skip the complicated setup - simply import
a virtual appliance and go from zero to informed in seconds.
http://pubads.g.doubleclick.net/gampad/clk?id=123612991&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17254] mimir/branches/5.0/mimir-core/src/gate/mimir

Reply via email to