Revision: 17290
http://sourceforge.net/p/gate/code/17290
Author: valyt
Date: 2014-02-12 12:34:46 +0000 (Wed, 12 Feb 2014)
Log Message:
-----------
- implemented regular dump-to-disk functionality, to ensure recent documents
become searchable no later than a specified timeout after they were indexed
- more synchronization in MimirIndex to keep internal state consistent (this
still needs further review)
- fixed TermQuery to deal with empty indexes.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
2014-02-11 16:46:10 UTC (rev 17289)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
2014-02-12 12:34:46 UTC (rev 17290)
@@ -225,6 +225,11 @@
"gate.mimir.uri";
/**
+ * The default value for {@link #timeBetweenBatches} (1 hour).
+ */
+ public static final int DEFAULT_TIME_BETWEEN_BATCHES = 3600 * 1000;
+
+ /**
* A Map storing values that need to be passed between the various pluggable
* components used by this index (e.g. ORDI-based annotation helpers may
* pass references to the ORDI Factory between each other).
@@ -341,7 +346,15 @@
public SemanticIndexerConfig[] getSemanticIndexers() {
return semanticIndexers;
}
+
+ public int getTimeBetweenBatches() {
+ return timeBetweenBatches;
+ }
+ public void setTimeBetweenBatches(int timeBetweenBatches) {
+ this.timeBetweenBatches = timeBetweenBatches;
+ }
+
/**
* Gets the options map - a Map with arbitrary configuration options, which
* is made available to all sub-elements of this index (e.g. the various
@@ -584,7 +597,15 @@
*/
private String documentUriFeatureName = DOCUMENT_URI_FEATURE_DEFAULT_NAME;
+
/**
+ * The maximum amount of time between dumping batches to disk, i.e. the
+ * maximum amount of time a document may be stored in RAM after having been
+ * submitted for indexing and before it becomes searchable.
+ */
+ private int timeBetweenBatches = DEFAULT_TIME_BETWEEN_BATCHES;
+
+ /**
* A Map with arbitrary configuration options, which is made available to all
* sub-elements of this index (e.g. the various annotation helpers).
*/
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-02-11 16:46:10 UTC (rev 17289)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-02-12 12:34:46 UTC (rev 17290)
@@ -93,8 +93,6 @@
*/
public static final int DEFAULT_INDEXING_QUEUE_SIZE = 30;
-
-
/**
* How many occurrences to be accumulated in RAM before a new tail batch is
* written to disk.
@@ -150,7 +148,7 @@
private class WriteDeletedDocsTask extends TimerTask {
public void run() {
- synchronized(writeDeletedDocsTimer) {
+ synchronized(maintenanceTimer) {
File delFile = new File(indexDirectory,
DELETED_DOCUMENT_IDS_FILE_NAME);
if(delFile.exists()) {
delFile.delete();
@@ -172,6 +170,20 @@
}
}
+ protected class DumpToDiskTask extends TimerTask {
+ @Override
+ public void run() {
+ if(occurrencesInRam > 0) {
+ requestDumpToDisk();
+ }
+ // and re-schedule
+ synchronized(maintenanceTimer) {
+ dumpToDiskTask = new DumpToDiskTask();
+ maintenanceTimer.schedule(dumpToDiskTask, indexConfig.getTimeBetweenBatches());
+ }
+ }
+ }
+
/**
* The {@link IndexConfig} used for this index.
*/
@@ -213,11 +225,11 @@
private transient SortedSet<Long> deletedDocumentIds;
/**
- * A timer used to execute the writing of deleted documents data to disk.
- * This timer is used to create a delay, allowing a batch of writes to be
- * coalesced into a single one.
+ * A timer used to execute various regular index maintenance tasks, such as
+ * the writing of deleted documents data to disk, and making sure regular
+ * dumps to disk are performed.
*/
- private transient Timer writeDeletedDocsTimer;
+ private transient Timer maintenanceTimer;
/**
* The timer task used to write the deleted documents data to disk.
@@ -226,6 +238,13 @@
private volatile transient WriteDeletedDocsTask writeDeletedDocsTask;
/**
+ * Timer task used to schedule regular dumps to disk, making sure recent
+ * documents become searchable after at most {@link #timeBetweenBatches}
+ * milliseconds.
+ */
+ private volatile transient DumpToDiskTask dumpToDiskTask;
+
+ /**
* The token indexes, in the order they are listed in the {@link #indexConfig}.
*/
protected AtomicTokenIndex[] tokenIndexes;
@@ -249,6 +268,7 @@
*/
protected long occurrencesInRam;
+
/**
* The {@link QueryEngine} used to run searches on this index.
*/
@@ -347,14 +367,18 @@
// Prepare for searching
// #####################
readDeletedDocs();
- // start the timer that regularly writes the deleted documents list
- writeDeletedDocsTimer = new Timer("Delete documents writer");
-
+
+ // #####################
+ // Index maintenance
+ // #####################
+ maintenanceTimer = new Timer("Mímir index maintenance timer");
+ synchronized(maintenanceTimer) {
+ dumpToDiskTask = new DumpToDiskTask();
+ maintenanceTimer.schedule(dumpToDiskTask,
+ indexConfig.getTimeBetweenBatches());
+ }
// open the zipped document collection
documentCollection = new DocumentCollection(indexDirectory);
-
-
-
}
/**
@@ -371,13 +395,15 @@
// check if we need to write a new batch:
// we have too many occurrences and
// there are no outstanding batch writing operations
- if(occurrencesInRam > occurrencesPerBatch && dumpsRequested <= 0) {
+ if( occurrencesInRam > occurrencesPerBatch && dumpsRequested <= 0) {
requestDumpToDisk();
}
GATEDocument gDocument = new GATEDocument(document, indexConfig);
- for(AtomicIndex aSubIndex: subIndexes){
- aSubIndex.getInputQueue().put(gDocument);
+ synchronized(subIndexes) {
+ for(AtomicIndex aSubIndex: subIndexes){
+ aSubIndex.getInputQueue().put(gDocument);
+ }
}
}
@@ -390,10 +416,21 @@
*/
public List<Future<Void>> requestDumpToDisk() {
List<Future<Void>> futures = new ArrayList<Future<Void>>();
- for(AtomicIndex aSubIndex : subIndexes) {
- futures.add(aSubIndex.requestDumpToDisk());
- dumpsRequested++;
+ synchronized(subIndexes) {
+ for(AtomicIndex aSubIndex : subIndexes) {
+ futures.add(aSubIndex.requestDumpToDisk());
+ dumpsRequested++;
+ }
}
+
+ synchronized(maintenanceTimer) {
+ if(dumpToDiskTask != null) {
+ dumpToDiskTask.cancel();
+ }
+ dumpToDiskTask = new DumpToDiskTask();
+ maintenanceTimer.schedule(dumpToDiskTask,
+ indexConfig.getTimeBetweenBatches());
+ }
return futures;
}
@@ -406,8 +443,10 @@
*/
public List<Future<Void>> requestCompactIndex() {
List<Future<Void>> futures = new ArrayList<Future<Void>>();
- for(AtomicIndex aSubIndex : subIndexes) {
- futures.add(aSubIndex.requestCompactIndex());
+ synchronized(subIndexes) {
+ for(AtomicIndex aSubIndex : subIndexes) {
+ futures.add(aSubIndex.requestCompactIndex());
+ }
}
return futures;
}
@@ -461,17 +500,23 @@
// close the query engine
if(queryEngine != null) queryEngine.close();
// stop the indexing
- for(AtomicIndex aSubIndex : subIndexes) {
- aSubIndex.getInputQueue().put(GATEDocument.END_OF_QUEUE);
+ synchronized(subIndexes) {
+ for(AtomicIndex aSubIndex : subIndexes) {
+ aSubIndex.getInputQueue().put(GATEDocument.END_OF_QUEUE);
+ }
}
- // write the deleted documents set
- synchronized(writeDeletedDocsTimer) {
+
+ synchronized(maintenanceTimer) {
+ // write the deleted documents set
if(writeDeletedDocsTask != null) {
writeDeletedDocsTask.cancel();
}
- writeDeletedDocsTimer.cancel();
// explicitly call it one last time
new WriteDeletedDocsTask().run();
+ if(dumpToDiskTask != null) {
+ dumpToDiskTask.cancel();
+ }
+ maintenanceTimer.cancel();
}
// wait for indexing to end
@@ -529,6 +574,23 @@
this.occurrencesPerBatch = occurrencesPerBatch;
}
+ public int getTimeBetweenBatches() {
+ return getIndexConfig().getTimeBetweenBatches();
+ }
+
+ public void setTimeBetweenBatches(int timeBetweenBatches) {
+ if(indexConfig.getTimeBetweenBatches() != timeBetweenBatches) {
+ indexConfig.setTimeBetweenBatches(timeBetweenBatches);
+ synchronized(maintenanceTimer) {
+ if(dumpToDiskTask != null) {
+ dumpToDiskTask.cancel();
+ }
+ dumpToDiskTask = new DumpToDiskTask();
+ maintenanceTimer.schedule(dumpToDiskTask, timeBetweenBatches);
+ }
+ }
+ }
+
public DocumentCollection getDocumentCollection() {
return documentCollection;
}
@@ -627,12 +689,12 @@
* will replace it.
*/
protected void writeDeletedDocsLater() {
- synchronized(writeDeletedDocsTimer) {
+ synchronized(maintenanceTimer) {
if(writeDeletedDocsTask != null) {
writeDeletedDocsTask.cancel();
}
writeDeletedDocsTask = new WriteDeletedDocsTask();
- writeDeletedDocsTimer.schedule(writeDeletedDocsTask, 1000);
+ maintenanceTimer.schedule(writeDeletedDocsTask, 1000);
}
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
2014-02-11 16:46:10 UTC (rev 17289)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
2014-02-12 12:34:46 UTC (rev 17290)
@@ -52,12 +52,10 @@
public class DocumentCollection {
/**
- * The maximum number of documents to be stored in the document cache.
+ * The maximum number of documents to be stored in the in-RAM document cache.
*/
protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
-
-
/**
* Class representing one of the collection (zip) files.
*/
@@ -212,7 +210,7 @@
* file but they cannot be read from the file. To account for this, we keep
* them in memory, in the {@link #inputBuffer} structure.
*/
- protected static final int INPUT_BUFFER_SIZE = 100;
+ protected static final int INPUT_BUFFER_SIZE = 1000;
/**
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
2014-02-11 16:46:10 UTC (rev 17289)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/search/query/TermQuery.java
2014-02-12 12:34:46 UTC (rev 17290)
@@ -117,20 +117,27 @@
if(atomicIndex == null) throw new IllegalArgumentException(
"No index provided for field " + node.getIndexName() + "!");
- indexReader = atomicIndex.getIndex().getReader();
- // if we have the term ID, use that
- if(query.termId != DocumentIterator.END_OF_LIST) {
- this.indexIterator = indexReader.documents(query.termId);
- // set the term (used by rankers)
- MutableString mutableString = new MutableString(query.getTerm());
- atomicIndex.getIndex().termProcessor.processTerm(mutableString);
- this.indexIterator.term(mutableString);
+ Index mg4jIndex = atomicIndex.getIndex();
+ if(mg4jIndex != null) {
+ indexReader = mg4jIndex.getReader();
+ // if we have the term ID, use that
+ if(query.termId != DocumentIterator.END_OF_LIST) {
+ this.indexIterator = indexReader.documents(query.termId);
+ // set the term (used by rankers)
+ MutableString mutableString = new MutableString(query.getTerm());
+ atomicIndex.getIndex().termProcessor.processTerm(mutableString);
+ this.indexIterator.term(mutableString);
+ } else {
+ //use the term processor for the query term
+ MutableString mutableString = new MutableString(query.getTerm());
+ atomicIndex.getIndex().termProcessor.processTerm(mutableString);
+ this.indexIterator = indexReader.documents(mutableString.toString());
+ }
} else {
- //use the term processor for the query term
- MutableString mutableString = new MutableString(query.getTerm());
- atomicIndex.getIndex().termProcessor.processTerm(mutableString);
- this.indexIterator = indexReader.documents(mutableString.toString());
+ // the atomic index is empty: we have exhausted the search already
+ latestDocument = -1;
}
+
positionsIterator = null;
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience. Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs