Revision: 17245
http://sourceforge.net/p/gate/code/17245
Author: valyt
Date: 2014-01-23 16:52:34 +0000 (Thu, 23 Jan 2014)
Log Message:
-----------
Started work on moving the DocumentCollectionWriter functionality into
DocumentCollection, which will become a read/write collection. This is required
to support live indexes.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
Removed Paths:
-------------
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -18,18 +18,15 @@
import gate.Gate;
import gate.mimir.IndexConfig.SemanticIndexerConfig;
import gate.mimir.IndexConfig.TokenIndexerConfig;
+import gate.mimir.index.AtomicAnnotationIndex;
import gate.mimir.index.AtomicIndex;
-import gate.mimir.index.AtomicAnnotationIndex;
import gate.mimir.index.AtomicTokenIndex;
import gate.mimir.index.IndexException;
import gate.mimir.index.mg4j.GATEDocument;
import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
import gate.mimir.index.mg4j.zipcollection.DocumentData;
import gate.mimir.search.QueryEngine;
import gate.util.GateRuntimeException;
-import it.unimi.dsi.fastutil.ints.IntBigList;
-import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
@@ -42,15 +39,12 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeSet;
-import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.GZIPInputStream;
@@ -97,10 +91,7 @@
*/
public static final int DEFAULT_INDEXING_QUEUE_SIZE = 30;
- /**
- * The maximum number of documents to be stored in the document cache.
- */
- protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
+
/**
* How many occurrences to be accumulated in RAM before a new tail batch is
@@ -189,24 +180,15 @@
*/
protected File indexDirectory;
+
/**
- * A zip collection builder used to build a zip of the collection
- * if this has been requested.
- */
- protected DocumentCollectionWriter collectionWriter = null;
-
- /**
* The zipped document collection from MG4J (built during the indexing of the
* first token feature). This can be used to obtain the document text and to
* display the content of the hits.
*/
protected DocumentCollection documentCollection;
- /**
- * A cache of {@link DocumentData} values used for returning the various
- * document details (title, URI, text).
- */
- protected Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
+
/**
* The thread used to clean-up GATE documents after they have been indexed.
@@ -359,11 +341,6 @@
documentsCollectorThread = new Thread(new IndexMaintenanceRunner());
documentsCollectorThread.start();
- // prepare the zip collection writer
- logger.info("Creating zipped collection for field \"" +
- tokConfs[0].getFeatureName() + "\"");
- collectionWriter = new DocumentCollectionWriter(indexDirectory);
-
// #####################
// Prepare for searching
// #####################
@@ -374,8 +351,7 @@
// open the zipped document collection
documentCollection = new DocumentCollection(indexDirectory);
- // prepare the document cache
- documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
+
}
@@ -463,7 +439,7 @@
* @throws IndexException
*/
public void writeZipDocumentData(DocumentData docData) throws IndexException
{
- collectionWriter.writeDocument(docData);
+ documentCollection.writeDocument(docData);
}
/**
@@ -471,8 +447,9 @@
* this index from accepting any more queries, finishes indexing all the
* currently queued documents, writes all the files to disk, and returns.
* @throws InterruptedException
+ * @throws IOException
*/
- public void close() throws InterruptedException {
+ public void close() throws InterruptedException, IOException {
if(closed) return;
closed = true;
@@ -495,7 +472,7 @@
// close the document collection
documentCollection.close();
- documentCache.clear();
+
// wait for indexing to end
documentsCollectorThread.join();
// write the config file
@@ -566,16 +543,7 @@
if(isDeleted(documentID)) {
throw new IndexException("Invalid document ID " + documentID);
}
- DocumentData documentData = documentCache.getAndMoveToFirst(documentID);
- if(documentData == null) {
- // cache miss
- documentData = documentCollection.getDocumentData(documentID);
- documentCache.putAndMoveToFirst(documentID, documentData);
- if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
- documentCache.removeLast();
- }
- }
- return documentData;
+ return documentCollection.getDocumentData(documentID);
}
/**
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -17,15 +17,12 @@
import gate.Annotation;
import gate.FeatureMap;
import gate.mimir.DocumentMetadataHelper;
+import gate.mimir.IndexConfig.TokenIndexerConfig;
import gate.mimir.MimirIndex;
-import gate.mimir.IndexConfig.TokenIndexerConfig;
import gate.mimir.index.mg4j.GATEDocument;
import gate.mimir.index.mg4j.GATEDocumentFactory;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
import gate.mimir.index.mg4j.zipcollection.DocumentData;
import it.unimi.di.big.mg4j.index.Index;
-import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.dsi.lang.ObjectParser;
import java.io.File;
@@ -34,7 +31,6 @@
import java.util.List;
import java.util.concurrent.BlockingQueue;
-import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
/**
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -119,7 +119,7 @@
annHelpersClosingProgress = 0;
}
- protected void initMG4J() {
+ protected void initMG4J() throws IndexException {
// make sure the index directory exists
mg4jIndexDir =
new File(config.getIndexDirectory(), Indexer.MG4J_INDEX_DIRNAME);
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -138,8 +138,9 @@
/**
* Initialises the MG4JConnector, based on the index config provided to the
* constructor, and all other options set after construction.
+ * @throws IndexException
*/
- public void init(){
+ public void init() throws IndexException{
gateDocFactory = new GATEDocumentFactory(indexConfig);
inputQueue = new LinkedBlockingQueue<GATEDocument>(documentQueueSize);
//start the sub-indexers for the token features
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
2014-01-22 17:30:47 UTC (rev 17244)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -20,7 +20,7 @@
import gate.mimir.IndexConfig.TokenIndexerConfig;
import gate.mimir.index.IndexException;
import gate.mimir.index.Indexer;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
+import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
import gate.mimir.index.mg4j.zipcollection.DocumentData;
import java.io.IOException;
@@ -52,7 +52,7 @@
* A zip collection builder used to build a zip of the collection
* if this has been requested.
*/
- protected DocumentCollectionWriter collectionWriter = null;
+ protected DocumentCollection collectionWriter = null;
/**
* An array of helpers for creating document metadata.
@@ -101,7 +101,7 @@
BlockingQueue<GATEDocument> outputQueue, Indexer indexer,
GATEDocumentFactory factory, boolean zipCollection,
String baseName,
- TokenIndexerConfig config) {
+ TokenIndexerConfig config) throws IndexException {
super(inputQueue, outputQueue, indexer, baseName,
config.isDirectIndexEnabled());
this.termProcessor = config.getTermProcessor();
@@ -124,7 +124,7 @@
if(zipCollection) {
logger.info("Creating zipped collection for field \"" + featureName +
"\"");
- collectionWriter = new DocumentCollectionWriter(indexer.getIndexDir());
+ collectionWriter = new DocumentCollection(indexer.getIndexDir());
}
}
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
2014-01-22 17:30:47 UTC (rev 17244)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -17,9 +17,13 @@
import gate.mimir.index.IndexException;
import gate.mimir.index.Indexer;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
import java.io.*;
import java.util.*;
+import java.util.concurrent.BlockingQueue;
import java.util.zip.*;
import org.apache.log4j.Logger;
@@ -31,8 +35,12 @@
* serialised {@link DocumentData} values.
*/
public class DocumentCollection {
-
/**
+ * The maximum number of documents to be stored in the document cache.
+ */
+ protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
+
+ /**
* A simple {@link FilenameFilter} that only accepts the zip files that are
* part of a collection.
*
@@ -71,7 +79,7 @@
/**
* The zip files containing the document collection.
*/
- protected ZipFile[] zipFiles = null;
+ protected List<ZipFile> zipFiles = null;
private static Logger logger = Logger.getLogger(DocumentCollection.class);
@@ -87,15 +95,160 @@
* <code>maxEntries[i-1] + 1</code> and <code>maxEntries[i]</code>,
inclusive.
* By convention, <code>maxEntries[-1]=-1</code>.
*/
- protected int[] maxEntries = null;
+ protected IntList maxEntries = null;
/**
+ * A cache of {@link DocumentData} values used for returning the various
+ * document details (title, URI, text).
+ */
+ protected Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
+
+ /**
+ * Document data objects that have been written to the zip file currently
+ * being created and have to be kept in RAM until the file is closed and can
+ * be opened in read mode.
+ */
+ protected Long2ObjectLinkedOpenHashMap<DocumentData> inputBuffer;
+
+ /**
* Flag that gets set to true when the collection is closed (and blocks all
* subsequent operations).
*/
private volatile boolean closed = false;
+
+
/**
+ * The maximum number of bytes to write to a single zip file.
+ */
+ public static final long ZIP_FILE_MAX_SIZE = 2 * 1000 * 1000 * 1000;
+
+ /**
+ * The maximum number of entries to write to a single zip file.
+ * Java 1.5 only supports 2^16 entries, so the default limit is set below that.
+ * If running on Java 1.6, this limit can safely be increased, however, the
+ * total size of the file (as specified by {@link #ZIP_FILE_MAX_SIZE}) should
+ * not be greater than 4GB, in either case.
+ */
+ public static final int ZIP_FILE_MAX_ENTRIES = 65530;
+
+ /**
+ * The zip file managed by this collection.
+ */
+ protected ZipOutputStream zipOuputStream;
+
+ /**
+ * The zip file to which we are currently writing.
+ */
+ protected File zipFile;
+
+ /**
+ * The number of entries written so far to the current zip file.
+ */
+ protected int currentEntries;
+
+ /**
+ * The amount of bytes written so far to the current zip file.
+ */
+ protected long currentLength;
+
+ /**
+ * A {@link ByteArrayOutputStream} used to temporarily store serialised
+ * document data objects.
+ */
+ protected ByteArrayOutputStream byteArrayOS;
+
+ /**
+ * The ID for the next document to be written. This value is initialised to 0
+ * and then is automatically incremented whenever a new document is written.
+ */
+ protected int documentId;
+
+
+ /**
+ * The unique ID of the current zip file.
+ */
+ protected int zipFileId;
+
+
+
+
+ /**
+ * Writes a new document to the underlying zip file. The documents added
+ * through this method will get automatically generated names starting from
+ * "0", and continuing with "1", "2", etc.
+ * @param document
+ * @throws IndexException if there are any problems while accessing the zip
+ * collection file(s).
+ */
+ public void writeDocument(DocumentData document) throws IndexException{
+ if(zipFile == null) openZipFile();
+ try{
+ //write the new document to the byte array
+ ObjectOutputStream objectOutStream = new ObjectOutputStream(byteArrayOS);
+ objectOutStream.writeObject(document);
+ objectOutStream.close();
+
+ //see if we're about to go over the limits
+ if(currentEntries >= ZIP_FILE_MAX_ENTRIES ||
+ currentLength + byteArrayOS.size() >= ZIP_FILE_MAX_SIZE){
+ //move to the next zip file
+ closeZipFile();
+ zipFileId ++;
+ openZipFile();
+ }
+
+ //create a new entry in the current zip file
+ ZipEntry entry = new ZipEntry(Integer.toString(documentId++));
+ zipOuputStream.putNextEntry(entry);
+ //write the data
+ byteArrayOS.writeTo(zipOuputStream);
+ zipOuputStream.closeEntry();
+ currentLength += entry.getCompressedSize();
+ //clean up the byte array for next time
+ byteArrayOS.reset();
+ currentEntries++;
+ }catch(IOException e){
+ throw new IndexException("Problem while accessing the collection file",
e);
+ }
+ }
+
+ /**
+ * Opens the current zip file and sets the {@link #zipFile} and
+ * {@link #zipOuputStream} values accordingly.
+ * @throws IndexException if the collection zip file already exists, or cannot
+ * be opened for writing.
+ */
+ protected void openZipFile() throws IndexException{
+ zipFile = new File(indexDir,
+ Indexer.MIMIR_COLLECTION_BASENAME +
+ "-" + zipFileId +
+ Indexer.MIMIR_COLLECTION_EXTENSION);
+ if(zipFile.exists()) throw new IndexException("Collection zip file (" +
+ zipFile.getAbsolutePath() + ") already exists!");
+
+ try {
+ zipOuputStream = new ZipOutputStream(new BufferedOutputStream(
+ new FileOutputStream(zipFile)));
+ } catch(FileNotFoundException e) {
+ throw new IndexException("Cannot write to collection zip file (" +
+ zipFile.getAbsolutePath() + ")", e);
+ }
+ currentEntries = 0;
+ currentLength = 0;
+ }
+
+ /**
+ * Closes the current zip file.
+ * @throws IOException
+ */
+ protected void closeZipFile() throws IOException{
+ if(zipOuputStream != null) zipOuputStream.close();
+ }
+
+
+
+ /**
* Opens a zip file and creates a DocumentCollection object for accessing the
* document data.
* @param indexDirectory
@@ -103,6 +256,12 @@
*/
public DocumentCollection(File indexDirectory) throws IndexException {
this.indexDir = indexDirectory;
+ // prepare the document cache
+ documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
+
+ byteArrayOS = new ByteArrayOutputStream();
+ documentId = 0;
+ zipFileId = 0;
}
/**
@@ -127,21 +286,23 @@
return getZipFileId(o1.getName()) - getZipFileId(o2.getName());
}
});
- zipFiles = new ZipFile[collectionFiles.length];
- maxEntries = new int[collectionFiles.length];
+ zipFiles = new ArrayList<ZipFile>(collectionFiles.length);
+
+ int[] maxEntriesArr = new int[collectionFiles.length];
for(int i = 0; i < collectionFiles.length; i++){
try {
//for each file, open a ZipFile, parse the entries, set the maxEntry value.
- zipFiles[i] = new ZipFile(collectionFiles[i]);
- Enumeration<? extends ZipEntry> entries = zipFiles[i].entries();
- maxEntries[i] = -1;
+ ZipFile aZipFile = new ZipFile(collectionFiles[i]);
+ zipFiles.add(aZipFile);
+ Enumeration<? extends ZipEntry> entries = aZipFile.entries();
+ maxEntriesArr[i] = -1;
while(entries.hasMoreElements()){
ZipEntry anEntry = entries.nextElement();
String entryName = anEntry.getName();
try {
int entryId = Integer.parseInt(entryName);
//sanity check
- if(i > 0 && entryId <= maxEntries[i-1]){
+ if(i > 0 && entryId <= maxEntriesArr[i-1]){
throw new IndexException(
"Invalid entries distribution: collection file " +
collectionFiles[i].getAbsolutePath() +
@@ -150,7 +311,7 @@
"already seen in a previous collection file!");
}
//update the current maximum
- if(entryId > maxEntries[i]) maxEntries[i] = entryId;
+ if(entryId > maxEntriesArr[i]) maxEntriesArr[i] = entryId;
} catch(NumberFormatException e) {
//not parseable -> we'll ignore this entry.
logger.warn("Unparseable zip entry name: " + entryName);
@@ -164,8 +325,8 @@
collectionFiles[i].getAbsolutePath(), e);
}
}
-
- logger.info("Opened zip collection: maxEntries = " +
Arrays.toString(maxEntries));
+ maxEntries = new IntArrayList(maxEntriesArr);
+ logger.info("Opened zip collection: maxEntries = " +
Arrays.toString(maxEntriesArr));
}
/**
@@ -178,55 +339,68 @@
public DocumentData getDocumentData(long documentID) throws IndexException{
if(closed) throw new IllegalStateException(
"This document collection has already been closed!");
- if(zipFiles == null){
- //open the zip files, parse their catalogues and update the values in
- //maxEntries
- openCollectionFiles();
- }
- //locate the right zip file
- int zipFileId = 0;
- while(zipFileId < maxEntries.length && documentID > maxEntries[zipFileId]){
- zipFileId++;
- }
- if(zipFileId >= maxEntries.length){
- //entry not found (entry number too large)
- throw new NoSuchElementException("No entry found for document ID " +
- documentID + ". Document ID too large for this collection!");
- }
- ZipEntry entry = zipFiles[zipFileId].getEntry(Long.toString(documentID));
- if(entry == null)
- throw new NoSuchElementException("No entry found for document ID " +
documentID);
- try {
- ObjectInputStream ois = new
ObjectInputStream(zipFiles[zipFileId].getInputStream(entry));
+ DocumentData documentData = documentCache.getAndMoveToFirst(documentID);
+ if(documentData == null) {
+ // cache miss
+ if(zipFiles == null){
+ //open the zip files, parse their catalogues and update the values in
+ //maxEntries
+ openCollectionFiles();
+ }
+ //locate the right zip file
+ int zipFileId = 0;
+ while(zipFileId < maxEntries.size() && documentID >
maxEntries.get(zipFileId)){
+ zipFileId++;
+ }
+ if(zipFileId >= maxEntries.size()){
+ //entry not found (entry number too large)
+ throw new NoSuchElementException("No entry found for document ID " +
+ documentID + ". Document ID too large for this collection!");
+ }
- DocumentData docData = (DocumentData) ois.readObject();
- ois.close();
- return docData;
- } catch(ClassNotFoundException e) {
- //invalid data read from the zip file
- throw new IndexException("Invalid data read from zip file!", e);
- } catch(IOException e) {
- throw new IndexException("Exception reading zip file!", e);
+ ZipEntry entry =
zipFiles.get(zipFileId).getEntry(Long.toString(documentID));
+ if(entry == null) throw new NoSuchElementException(
+ "No entry found for document ID " + documentID);
+ try {
+ ObjectInputStream ois = new
ObjectInputStream(zipFiles.get(zipFileId).getInputStream(entry));
+ documentData = (DocumentData) ois.readObject();
+ ois.close();
+ documentCache.putAndMoveToFirst(documentID, documentData);
+ if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
+ documentCache.removeLast();
+ }
+ } catch(ClassNotFoundException e) {
+ //invalid data read from the zip file
+ throw new IndexException("Invalid data read from zip file!", e);
+ } catch(IOException e) {
+ throw new IndexException("Exception reading zip file!", e);
+ }
}
+ return documentData;
}
/**
* Close this document collection and release all allocated resources (such
* as open file handles).
+ * @throws IOException
*/
- public void close() {
+ public void close() throws IOException {
+ // close the writer
+ closeZipFile();
+ // close the reader
closed = true;
if(zipFiles != null){
- for(int i = 0; i < zipFiles.length; i++){
+ for(ZipFile aZipFile : zipFiles){
try {
- zipFiles[i].close();
- zipFiles[i] = null;
+ aZipFile.close();
} catch(IOException e) {
// ignore
}
}
+ zipFiles.clear();
zipFiles = null;
}
+ documentCache.clear();
}
}
Deleted:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
===================================================================
---
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
2014-01-22 17:30:47 UTC (rev 17244)
+++
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
2014-01-23 16:52:34 UTC (rev 17245)
@@ -1,195 +0,0 @@
-/*
- * DocumentCollectionWriter.java
- *
- * Copyright (c) 2007-2011, The University of Sheffield.
- *
- * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
- * and is free software, licenced under the GNU Lesser General Public License,
- * Version 3, June 2007 (also included with this distribution as file
- * LICENCE-LGPL3.html).
- *
- * Valentin Tablan, 15 Apr 2009
- *
- * $Id$
- */
-package gate.mimir.index.mg4j.zipcollection;
-
-
-import gate.mimir.index.IndexException;
-import gate.mimir.index.Indexer;
-
-import java.io.*;
-
-import org.apache.log4j.Logger;
-
-
-import java.util.zip.*;
-
-/**
- * A writer for Mimir zip document collections.
- * A Mimir document collection is a set of serialised {@link DocumentData}
- * objects stored in one or more zip files.
- * To create a Mimir document collection, create a writer (pointing it to the
- * top level index directory), add new documents using the
- * {@link #writeDocument(DocumentData)} method, and close the collection at the
- * end, by calling the {@link #close()} method.
- *
- * This writer will create one or more zip files as required, never writing more
- * than {@link #ZIP_FILE_MAX_ENTRIES} entries, or more than
- * {@link #ZIP_FILE_MAX_SIZE} bytes to a single file.
- */
-public class DocumentCollectionWriter {
-
-
- /**
- * The maximum number of bytes to write to a single zip file.
- */
- public static final long ZIP_FILE_MAX_SIZE = 2 * 1000 * 1000 * 1000;
-
- /**
- * The maximum number of entries to write to a single zip file.
- * Java 1.5 only supports 2^16 entries, so the default limit is set below that.
- * If running on Java 1.6, this limit can safely be increased, however, the
- * total size of the file (as specified by {@link #ZIP_FILE_MAX_SIZE}) should
- * not be greater than 4GB, in either case.
- */
- public static final int ZIP_FILE_MAX_ENTRIES = 65530;
-
- private static Logger logger =
Logger.getLogger(DocumentCollectionWriter.class);
-
- /**
- * The zip file managed by this collection.
- */
- protected ZipOutputStream zipOuputStream;
-
- /**
- * The zip file to which we are currently writing.
- */
- protected File zipFile;
-
- /**
- * The top-level index directory.
- */
- protected File indexDir;
- /**
- * The number of entries written so far to the current zip file.
- */
- protected int currentEntries;
-
- /**
- * The amount of bytes written so far to the current zip file.
- */
- protected long currentLength;
-
- /**
- * A {@link ByteArrayOutputStream} used to temporarily store serialised
- * document data objects.
- */
- protected ByteArrayOutputStream byteArrayOS;
-
- /**
- * The ID for the next document to be written. This value is initialised to 0
- * and then is automatically incremented whenever anew document is written.
- */
- protected int documentId;
-
-
- /**
- * The unique ID of the current zip file.
- */
- protected int zipFileId;
-
- /**
- * Creates a new DocumentCollectionWriter for the specified index.
- * @param indexDir the top level index directory.
- */
- public DocumentCollectionWriter(File indexDir){
- this.indexDir = indexDir;
- byteArrayOS = new ByteArrayOutputStream();
- documentId = 0;
- zipFileId = 0;
- }
-
-
- /**
- * Writes a new document to the underlying zip file. The documents added
- * through this method will get automatically generated names starting from
- * "0", and continuing with "1", "2", etc.
- * @param document
- * @throws IndexException if there are any problems while accessing the zip
- * collection file(s).
- */
- public void writeDocument(DocumentData document) throws IndexException{
- if(zipFile == null) openZipFile();
- try{
- //write the new document to the byte array
- ObjectOutputStream objectOutStream = new ObjectOutputStream(byteArrayOS);
- objectOutStream.writeObject(document);
- objectOutStream.close();
-
- //see if we're about to go over the limits
- if(currentEntries >= ZIP_FILE_MAX_ENTRIES ||
- currentLength + byteArrayOS.size() >= ZIP_FILE_MAX_SIZE){
- //move to the next zip file
- closeZipFile();
- zipFileId ++;
- openZipFile();
- }
-
- //create a new entry in the current zip file
- ZipEntry entry = new ZipEntry(Integer.toString(documentId++));
- zipOuputStream.putNextEntry(entry);
- //write the data
- byteArrayOS.writeTo(zipOuputStream);
- zipOuputStream.closeEntry();
- currentLength += entry.getCompressedSize();
- //clean up the byte array for next time
- byteArrayOS.reset();
- currentEntries++;
- }catch(IOException e){
- throw new IndexException("Problem while accessing the collection file",
e);
- }
- }
-
- /**
- * Opens the current zip file and sets the {@link #zipFile} and
- * {@link #zipOuputStream} values accordingly.
- * @throws IndexException if the collection zip file already exists, or cannot
- * be opened for writing.
- */
- protected void openZipFile() throws IndexException{
- zipFile = new File(indexDir,
- Indexer.MIMIR_COLLECTION_BASENAME +
- "-" + zipFileId +
- Indexer.MIMIR_COLLECTION_EXTENSION);
- if(zipFile.exists()) throw new IndexException("Collection zip file (" +
- zipFile.getAbsolutePath() + ") already exists!");
-
- try {
- zipOuputStream = new ZipOutputStream(new BufferedOutputStream(
- new FileOutputStream(zipFile)));
- } catch(FileNotFoundException e) {
- throw new IndexException("Cannot write to collection zip file (" +
- zipFile.getAbsolutePath() + ")", e);
- }
- currentEntries = 0;
- currentLength = 0;
- }
-
- /**
- * Closes the current zip file.
- * @throws IOException
- */
- protected void closeZipFile() throws IOException{
- if(zipOuputStream != null) zipOuputStream.close();
- }
-
- /**
- * Closes this writer (and the underlying zip file).
- * @throws IOException
- */
- public void close() throws IOException{
- closeZipFile();
- }
-
-}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today.
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs