Revision: 17254
http://sourceforge.net/p/gate/code/17254
Author: valyt
Date: 2014-01-29 13:16:11 +0000 (Wed, 29 Jan 2014)
Log Message:
-----------
More work on supporting direct indexes:
- a direct index cluster is now maintained during the indexing process, in a
fashion similar to the inverted index
- direct indexes are now compacted as well during the overall index
compaction
- we now use explicit inverted~ and direct~ variable name prefixes to
distinguish between the two types of index.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
2014-01-29 12:01:38 UTC (rev 17253)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicAnnotationIndex.java
2014-01-29 13:16:11 UTC (rev 17254)
@@ -116,9 +116,6 @@
theHelper.init(this);
}
offsetComparator = new OffsetComparator();
- // create and save the term processor
- additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
- ObjectParser.toSpec(termProcessor));
// start the indexing thread
indexingThread = new Thread(this, "Mimir-" + name + " indexing thread");
indexingThread.start();
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-29 12:01:38 UTC (rev 17253)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicIndex.java
2014-01-29 13:16:11 UTC (rev 17254)
@@ -35,8 +35,10 @@
import it.unimi.di.big.mg4j.index.CompressionFlags.Component;
import it.unimi.di.big.mg4j.index.Index.UriKeys;
import it.unimi.di.big.mg4j.index.cluster.ContiguousDocumentalStrategy;
+import it.unimi.di.big.mg4j.index.cluster.ContiguousLexicalStrategy;
import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
import it.unimi.di.big.mg4j.index.cluster.DocumentalConcatenatedCluster;
+import it.unimi.di.big.mg4j.index.cluster.LexicalCluster;
import it.unimi.di.big.mg4j.io.IOFactory;
import it.unimi.di.big.mg4j.tool.Combine;
import it.unimi.di.big.mg4j.tool.Concatenate;
@@ -388,26 +390,25 @@
*/
protected static class MG4JIndex {
protected File indexDir;
- protected Index index;
- protected BloomFilter<Void> termFilter;
- protected long numberOfDocuments;
- protected long numberOfTerms;
- protected long numberOfPostings;
- protected long numberOfOccurences;
- protected int maxCount;
+ protected Index invertedIndex;
+ protected Index directIndex;
+ protected BloomFilter<Void> invertedTermFilter;
+ protected BloomFilter<Void> directTermFilter;
+
- public MG4JIndex(Index index, File indexDir,
- BloomFilter<Void> termFilter,
- long numberOfDocuments, long numberOfTerms, long numberOfPostings,
- long numberOfOccurences, int maxCount) {
- this.index = index;
+ public MG4JIndex(
+ File indexDir,
+ Index invertedIndex,
+ BloomFilter<Void> invertedTermFilter,
+ Index directIndex,
+ BloomFilter<Void> directTermFilter) {
+
this.indexDir = indexDir;
- this.termFilter = termFilter;
- this.numberOfDocuments = numberOfDocuments;
- this.numberOfTerms = numberOfTerms;
- this.numberOfPostings = numberOfPostings;
- this.numberOfOccurences = numberOfOccurences;
- this.maxCount = maxCount;
+ this.invertedIndex = invertedIndex;
+ this.invertedTermFilter = invertedTermFilter;
+
+ this.directIndex = directIndex;
+ this.directTermFilter = directTermFilter;
}
}
@@ -452,7 +453,7 @@
TermProcessor termProcessor){
if(subIndexes == null || subIndexes.size() == 0) return null;
- if(subIndexes.size() == 1) return subIndexes.get(0).index;
+ if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
// prepare the documental cluster
Index[] indexes = new Index[subIndexes.size()];
@@ -468,17 +469,20 @@
BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
for(MG4JIndex aSubIndex : subIndexes) {
- indexes[indexIdx] = aSubIndex.index;
+ indexes[indexIdx] = aSubIndex.invertedIndex;
if(indexIdx < cutPoints.length - 1) {
- cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
aSubIndex.numberOfDocuments;
+ cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
+ aSubIndex.invertedIndex.numberOfDocuments;
}
- numberOfTerms += aSubIndex.numberOfTerms;
- numberOfDocuments += aSubIndex.numberOfDocuments;
- numberOfPostings += aSubIndex.numberOfPostings;
- numberOfOccurences += aSubIndex.numberOfOccurences;
- if(maxCount < aSubIndex.maxCount) maxCount = aSubIndex.maxCount;
- bloomFilters[indexIdx] = aSubIndex.termFilter;
- sizes.addAll(aSubIndex.index.sizes);
+ numberOfTerms += aSubIndex.invertedIndex.numberOfTerms;
+ numberOfDocuments += aSubIndex.invertedIndex.numberOfDocuments;
+ numberOfPostings += aSubIndex.invertedIndex.numberOfPostings;
+ numberOfOccurences += aSubIndex.invertedIndex.numberOfOccurrences;
+ if(maxCount < aSubIndex.invertedIndex.maxCount){
+ maxCount = aSubIndex.invertedIndex.maxCount;
+ }
+ bloomFilters[indexIdx] = aSubIndex.invertedTermFilter;
+ sizes.addAll(aSubIndex.invertedIndex.sizes);
indexIdx++;
}
@@ -501,6 +505,68 @@
);
}
+
+ protected final static Index openDirectIndexCluster(
+ List<MG4JIndex> subIndexes){
+
+ if(subIndexes == null || subIndexes.size() == 0) return null;
+ if(subIndexes.size() == 1) return subIndexes.get(0).invertedIndex;
+
+ // prepare the lexical cluster
+ Index[] indexes = new Index[subIndexes.size()];
+ int[] cutPoints = new int[indexes.length];
+ cutPoints[0] = 0;
+ String[] cutPointTerms = new String[indexes.length];
+ cutPointTerms[0] = longToTerm(0);
+ int numberOfTerms = -1;
+ int numberOfDocuments = -1;
+ long numberOfPostings = -1;
+ long numberOfOccurences =-1;
+ int maxCount =-1;
+ int indexIdx = 0;
+ IntBigList sizes = new IntBigArrayBigList();
+ BloomFilter<Void> bloomFilters[] = new BloomFilter[indexes.length];
+
+ for(MG4JIndex aSubIndex : subIndexes) {
+ indexes[indexIdx] = aSubIndex.directIndex;
+ // we build this based on the inverted index, as the cut-points for the
+ // lexical partitioning are based on document IDs
+ if(indexIdx < cutPoints.length - 1) {
+ cutPoints[indexIdx + 1] = cutPoints[indexIdx] +
+ (int)aSubIndex.invertedIndex.numberOfDocuments;
+ cutPointTerms[indexIdx + 1] = longToTerm(cutPoints[indexIdx + 1]);
+ }
+ numberOfTerms += aSubIndex.directIndex.numberOfTerms;
+ numberOfDocuments += aSubIndex.directIndex.numberOfDocuments;
+ numberOfPostings += aSubIndex.directIndex.numberOfPostings;
+ numberOfOccurences += aSubIndex.directIndex.numberOfOccurrences;
+ if(maxCount < aSubIndex.directIndex.maxCount){
+ maxCount = aSubIndex.directIndex.maxCount;
+ }
+ bloomFilters[indexIdx] = aSubIndex.directTermFilter;
+ sizes.addAll(aSubIndex.directIndex.sizes);
+ indexIdx++;
+ }
+ cutPointTerms[cutPointTerms.length - 1] = null;
+
+ return new LexicalCluster(indexes,
+ new ContiguousLexicalStrategy(cutPoints, cutPointTerms),
+ bloomFilters, // Bloom Filters
+ numberOfDocuments,
+ numberOfTerms,
+ numberOfPostings,
+ numberOfOccurences,
+ maxCount,
+ null, // payload
+ true, // hasCounts
+ false, // hasPositions,
+ NullTermProcessor.getInstance(),
+ null, // field
+ sizes, // sizes
+ null // properties
+ );
+ }
+
/**
* Converts a long value into a String containing a zero-padded Hex
* representation of the input value. The lexicographic ordering of the
@@ -541,7 +607,14 @@
public static final String DIRECT_TERMS_FILENAME = "direct.terms";
+
/**
+ * FIles belonging to teh direct index get this suffix added to their
+ * basename.
+ */
+ public static final String DIRECT_INDEX_NAME_SUFFIX = "-dir";
+
+ /**
* The file name (under the current directory for this atomic index) for the
* directory containing the documents that have been queued for indexing,
but
* not yet indexed.
@@ -751,10 +824,12 @@
this.currentTerm = new MutableString();
this.additionalProperties = new Properties();
-
+ // save the term processor
+ additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ ObjectParser.toSpec(termProcessor));
if(hasDirectIndex) {
additionalDirectProperties = new Properties();
- additionalProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
+ additionalDirectProperties.setProperty(Index.PropertyKeys.TERMPROCESSOR,
ObjectParser.toSpec(NullTermProcessor.getInstance()));
}
initIndex();
@@ -816,8 +891,7 @@
}
}
synchronized(this) {
- //TODO
- // open direct index cluster
+ directIndex = openDirectIndexCluster(subIndexes);
}
}
}
@@ -1004,8 +1078,7 @@
subIndexes.add(openSubIndex(newTailName));
invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
if(hasDirectIndex) {
- // TODO
- // merge the new direct batch into the direct cluster
+ directIndex = openDirectIndexCluster(subIndexes);
}
}
} catch(Exception e) {
@@ -1080,7 +1153,8 @@
}
// 2. write the data from RAM
- String mg4jBasename = new File(batchDir, name + "-dir").getAbsolutePath();
+ String mg4jBasename = new File(batchDir, name +
+ DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
// copy the default compression flags, and remove positions
Map<Component, Coding> flags = new HashMap<Component, Coding>(
CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
@@ -1246,11 +1320,45 @@
} catch(Exception e) {
throw new IndexException("Exception while combining sub-indexes", e);
}
+
+ if(hasDirectIndex()) {
+ codingFlags = new HashMap<Component, Coding>(
+ CompressionFlags.DEFAULT_QUASI_SUCCINCT_INDEX);
+ codingFlags.remove(Component.POSITIONS);
+ outputBaseName = new File(headDirNew, name +
+ DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
+
+ inputBaseNames = new String[indexesToMerge.size()];
+ for(int i = 0; i < inputBaseNames.length; i++) {
+ inputBaseNames[i] = new File(indexesToMerge.get(i).indexDir,
+ name + DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
+ }
+ try {
+ new Concatenate(
+ IOFactory.FILESYSTEM_FACTORY,
+ outputBaseName,
+ inputBaseNames,
+ false, // metadataOnly
+ Combine.DEFAULT_BUFFER_SIZE,
+ codingFlags,
+ IndexType.QUASI_SUCCINCT,
+ true, // skips
+ // BitStreamIndex.DEFAULT_QUANTUM,
+ // replaced with optimised automatic calculation
+ -5,
+ BitStreamIndex.DEFAULT_HEIGHT,
+ SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE,
+ ProgressLogger.DEFAULT_LOG_INTERVAL).run();
+ // generate term map
+ generateTermMap(new File(outputBaseName +
DiskBasedIndex.TERMS_EXTENSION),
+ new File(outputBaseName + DiskBasedIndex.TERMMAP_EXTENSION),
+ new File(outputBaseName + DocumentalCluster.BLOOM_EXTENSION));
+ } catch(Exception e) {
+ throw new IndexException("Exception while combining direct
sub-indexes",
+ e);
+ }
+ }
- if(hasDirectIndex()) {
- // TODO
- }
-
// update the internal state
synchronized(this) {
// remove the indexes that were merged
@@ -1262,7 +1370,9 @@
if(headDirNew.renameTo(headDir)) {
subIndexes.add(0, openSubIndex(HEAD_FILE_NAME));
invertedIndex = openInvertedIndexCluster(subIndexes, termProcessor);
-
+ if(hasDirectIndex) {
+ directIndex =openDirectIndexCluster(subIndexes);
+ }
// clean-up: delete old head, used-up tails
if(!gate.util.Files.rmdir(headDirOld)) {
throw new IndexException(
@@ -1333,12 +1443,12 @@
* @throws IndexException
*/
protected MG4JIndex openSubIndex(String subIndexDirname) throws
IOException, IndexException {
- Index newIndex = null;
+ Index invertedIndex = null;
File subIndexDir = new File(indexDirectory, subIndexDirname);
String mg4jBasename = new File(subIndexDir, name).getAbsolutePath();
try {
try{
- newIndex = Index.getInstance(
+ invertedIndex = Index.getInstance(
mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;",
true, true);
} catch(IOException e) {
@@ -1346,28 +1456,58 @@
logger.info("Memory mapping failed for index " + mg4jBasename
+ ". Loading as file index instead");
// now try to open it as a plain an on-disk index
- newIndex = Index.getInstance(mg4jBasename, true, true);
+ invertedIndex = Index.getInstance(mg4jBasename, true, true);
}
} catch(Exception e) {
throw new IndexException("Could not open the sub-index at" +
mg4jBasename , e);
}
//read the Bloom filter
File bloomFile = new File(mg4jBasename +
DocumentalCluster.BLOOM_EXTENSION);
- BloomFilter<Void> termFilter = null;
+ BloomFilter<Void> invertedTermFilter = null;
try {
if(bloomFile.exists()) {
- termFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
+ invertedTermFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
}
} catch(ClassNotFoundException e) {
// this should never happen. If it does, it's not fatal
logger.warn("Exception wile loading stre Bloom Filter", e);
}
- MG4JIndex newIndexData = new MG4JIndex(newIndex, subIndexDir, termFilter,
- newIndex.numberOfDocuments,
- newIndex.numberOfTerms,
- newIndex.numberOfPostings,
- newIndex.numberOfOccurrences,
- newIndex.maxCount);
+
+ // open direct index
+ Index directIndex = null;
+ mg4jBasename = new File(subIndexDir, name +
+ DIRECT_INDEX_NAME_SUFFIX).getAbsolutePath();
+ try {
+ try{
+ directIndex = Index.getInstance(
+ mg4jBasename + "?" + UriKeys.MAPPED.name().toLowerCase() + "=1;",
+ true, true);
+ } catch(IOException e) {
+ // memory mapping failed
+ logger.info("Memory mapping failed for index " + mg4jBasename
+ + ". Loading as file index instead");
+ // now try to open it as a plain an on-disk index
+ directIndex = Index.getInstance(mg4jBasename, true, true);
+ }
+ } catch(Exception e) {
+ throw new IndexException("Could not open the sub-index at" +
mg4jBasename , e);
+ }
+ //read the Bloom filter
+ bloomFile = new File(mg4jBasename + DocumentalCluster.BLOOM_EXTENSION);
+ BloomFilter<Void> directTermFilter = null;
+ try {
+ if(bloomFile.exists()) {
+ directTermFilter = (BloomFilter<Void>) BinIO.loadObject(bloomFile);
+ }
+ } catch(ClassNotFoundException e) {
+ // this should never happen. If it does, it's not fatal
+ logger.warn("Exception wile loading stre Bloom Filter", e);
+ }
+
+
+ MG4JIndex newIndexData = new MG4JIndex(subIndexDir,
+ invertedIndex, invertedTermFilter,
+ directIndex, directTermFilter);
return newIndexData;
}
@@ -1647,7 +1787,7 @@
}
public long getDirectTermOccurenceCount(long directTermId) throws
IOException {
- //TODO: copy from IndexReaderPool
- return -1;
+ String termStr = directTerms.get(directTermId);
+ return invertedIndex.documents(termStr).count();
}
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
2014-01-29 12:01:38 UTC (rev 17253)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
2014-01-29 13:16:11 UTC (rev 17254)
@@ -107,7 +107,7 @@
/**
* @param node
- * @param index
+ * @param invertedIndex
* @throws IOException if the index files cannot be accessed.
*/
public TermQueryExecutor(TermQuery node, QueryEngine engine) throws
IOException {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
WatchGuard Dimension instantly turns raw network data into actionable
security intelligence. It gives you real-time visual feedback on key
security issues and trends. Skip the complicated setup - simply import
a virtual appliance and go from zero to informed in seconds.
http://pubads.g.doubleclick.net/gampad/clk?id=123612991&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs