mimir

valyt Thu, 23 Jan 2014 08:54:53 -0800

Revision: 17245
          http://sourceforge.net/p/gate/code/17245
Author:   valyt
Date:     2014-01-23 16:52:34 +0000 (Thu, 23 Jan 2014)
Log Message:
-----------
Started work on moving the DocumentCollectionWriter functionality into 
DocumentCollection, which will become a read/write collection. This is required 
to support live indexes.


Modified Paths:
--------------
    mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
    mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
    
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
    
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java

Removed Paths:
-------------
    
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/MimirIndex.java        
2014-01-23 16:52:34 UTC (rev 17245)
@@ -18,18 +18,15 @@
 import gate.Gate;
 import gate.mimir.IndexConfig.SemanticIndexerConfig;
 import gate.mimir.IndexConfig.TokenIndexerConfig;
+import gate.mimir.index.AtomicAnnotationIndex;
 import gate.mimir.index.AtomicIndex;
-import gate.mimir.index.AtomicAnnotationIndex;
 import gate.mimir.index.AtomicTokenIndex;
 import gate.mimir.index.IndexException;
 import gate.mimir.index.mg4j.GATEDocument;
 import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
 import gate.mimir.search.QueryEngine;
 import gate.util.GateRuntimeException;
-import it.unimi.dsi.fastutil.ints.IntBigList;
-import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
@@ -42,15 +39,12 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.Timer;
 import java.util.TimerTask;
 import java.util.TreeSet;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.zip.GZIPInputStream;
@@ -97,10 +91,7 @@
    */
   public static final int DEFAULT_INDEXING_QUEUE_SIZE = 30;
   
-  /**
-   * The maximum number of documents to be stored in the document cache.
-   */
-  protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
+
   
   /**
    * How many occurrences to be accumulated in RAM before a new tail batch is
@@ -189,24 +180,15 @@
    */
   protected File indexDirectory;
   
+ 
   /**
-   * A zip collection builder used to build a zip of the collection
-   * if this has been requested.
-   */
-  protected DocumentCollectionWriter collectionWriter = null;
-  
-  /**
    * The zipped document collection from MG4J (built during the indexing of the
    * first token feature). This can be used to obtain the document text and to
    * display the content of the hits.
    */
   protected DocumentCollection documentCollection;
   
-  /**
-   * A cache of {@link DocumentData} values used for returning the various
-   * document details (title, URI, text).
-   */
-  protected Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
+
   
   /**
    * The thread used to clean-up GATE documents after they have been indexed.
@@ -359,11 +341,6 @@
     documentsCollectorThread = new Thread(new IndexMaintenanceRunner());
     documentsCollectorThread.start();
 
-    // prepare the zip collection writer
-    logger.info("Creating zipped collection for field \"" + 
-        tokConfs[0].getFeatureName() + "\"");
-    collectionWriter = new DocumentCollectionWriter(indexDirectory);
-
     // #####################
     // Prepare for searching
     // #####################
@@ -374,8 +351,7 @@
     // open the zipped document collection
     documentCollection = new DocumentCollection(indexDirectory);
     
-    // prepare the document cache
-    documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
+
     
   }
   
@@ -463,7 +439,7 @@
    * @throws IndexException 
    */
   public void writeZipDocumentData(DocumentData docData) throws IndexException 
{
-    collectionWriter.writeDocument(docData);
+    documentCollection.writeDocument(docData);
   }
   
   /**
@@ -471,8 +447,9 @@
    * this index from accepting any more queries, finishes indexing all the 
    * currently queued documents, writes all the files to disk, and returns.
    * @throws InterruptedException 
+   * @throws IOException 
    */
-  public void close() throws InterruptedException {
+  public void close() throws InterruptedException, IOException {
     if(closed) return;
     closed = true;
 
@@ -495,7 +472,7 @@
     // close the document collection
     documentCollection.close();
 
-    documentCache.clear();
+
     // wait for indexing to end
     documentsCollectorThread.join();
     // write the config file
@@ -566,16 +543,7 @@
     if(isDeleted(documentID)) {
       throw new IndexException("Invalid document ID " + documentID);
     }
-    DocumentData documentData = documentCache.getAndMoveToFirst(documentID);
-    if(documentData == null) {
-      // cache miss
-      documentData = documentCollection.getDocumentData(documentID);
-      documentCache.putAndMoveToFirst(documentID, documentData);
-      if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
-        documentCache.removeLast();
-      }
-    }
-    return documentData;
+    return  documentCollection.getDocumentData(documentID);
   }
   
   /**

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/AtomicTokenIndex.java    
2014-01-23 16:52:34 UTC (rev 17245)
@@ -17,15 +17,12 @@
 import gate.Annotation;
 import gate.FeatureMap;
 import gate.mimir.DocumentMetadataHelper;
+import gate.mimir.IndexConfig.TokenIndexerConfig;
 import gate.mimir.MimirIndex;
-import gate.mimir.IndexConfig.TokenIndexerConfig;
 import gate.mimir.index.mg4j.GATEDocument;
 import gate.mimir.index.mg4j.GATEDocumentFactory;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
 import it.unimi.di.big.mg4j.index.Index;
-import it.unimi.di.big.mg4j.index.TermProcessor;
 import it.unimi.dsi.lang.ObjectParser;
 
 import java.io.File;
@@ -34,7 +31,6 @@
 import java.util.List;
 import java.util.concurrent.BlockingQueue;
 
-import org.apache.commons.configuration.ConfigurationException;
 import org.apache.log4j.Logger;
 
 /**

Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java     
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/Indexer.java     
2014-01-23 16:52:34 UTC (rev 17245)
@@ -119,7 +119,7 @@
     annHelpersClosingProgress = 0;
   }
 
-  protected void initMG4J() {
+  protected void initMG4J() throws IndexException {
     // make sure the index directory exists
     mg4jIndexDir =
       new File(config.getIndexDirectory(), Indexer.MG4J_INDEX_DIRNAME);

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java    
2014-01-22 17:30:47 UTC (rev 17244)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/MG4JIndexer.java    
2014-01-23 16:52:34 UTC (rev 17245)
@@ -138,8 +138,9 @@
   /**
    * Initialises the MG4JConnector, based on the index config provided to the 
    * constructor, and all other options set after construction.
+   * @throws IndexException 
    */
-  public void init(){
+  public void init() throws IndexException{
     gateDocFactory = new GATEDocumentFactory(indexConfig);
     inputQueue =  new LinkedBlockingQueue<GATEDocument>(documentQueueSize);
     //start the sub-indexers for the token features

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java  
    2014-01-22 17:30:47 UTC (rev 17244)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/TokenIndexBuilder.java  
    2014-01-23 16:52:34 UTC (rev 17245)
@@ -20,7 +20,7 @@
 import gate.mimir.IndexConfig.TokenIndexerConfig;
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
-import gate.mimir.index.mg4j.zipcollection.DocumentCollectionWriter;
+import gate.mimir.index.mg4j.zipcollection.DocumentCollection;
 import gate.mimir.index.mg4j.zipcollection.DocumentData;
 
 import java.io.IOException;
@@ -52,7 +52,7 @@
    * A zip collection builder used to build a zip of the collection
    * if this has been requested.
    */
-  protected DocumentCollectionWriter collectionWriter = null;
+  protected DocumentCollection collectionWriter = null;
   
   /**
    * An array of helpers for creating document metadata. 
@@ -101,7 +101,7 @@
           BlockingQueue<GATEDocument> outputQueue, Indexer indexer,
           GATEDocumentFactory factory, boolean zipCollection,
           String baseName,
-          TokenIndexerConfig config) {
+          TokenIndexerConfig config) throws IndexException {
     super(inputQueue, outputQueue, indexer, baseName, 
         config.isDirectIndexEnabled());
     this.termProcessor = config.getTermProcessor();
@@ -124,7 +124,7 @@
     
     if(zipCollection) {
       logger.info("Creating zipped collection for field \"" + featureName + 
"\"");
-      collectionWriter = new DocumentCollectionWriter(indexer.getIndexDir());
+      collectionWriter = new DocumentCollection(indexer.getIndexDir());
     }
     
   }

Modified: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
       2014-01-22 17:30:47 UTC (rev 17244)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollection.java
       2014-01-23 16:52:34 UTC (rev 17245)
@@ -17,9 +17,13 @@
 
 import gate.mimir.index.IndexException;
 import gate.mimir.index.Indexer;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.longs.Long2ObjectLinkedOpenHashMap;
 
 import java.io.*;
 import java.util.*;
+import java.util.concurrent.BlockingQueue;
 import java.util.zip.*;
 
 import org.apache.log4j.Logger;
@@ -31,8 +35,12 @@
  * serialised {@link DocumentData} values.
  */
 public class DocumentCollection {
-
   /**
+   * The maximum number of documents to be stored in the document cache.
+   */
+  protected static final int DOCUMENT_DATA_CACHE_SIZE = 100;
+  
+  /**
    * A simple {@link FilenameFilter} that only accepts the zip files that are
    * part of a collection.
    * 
@@ -71,7 +79,7 @@
   /**
    * The zip files containing the document collection.
    */
-  protected ZipFile[] zipFiles = null;
+  protected List<ZipFile> zipFiles = null;
   
   private static Logger logger = Logger.getLogger(DocumentCollection.class);
   
@@ -87,15 +95,160 @@
    * <code>maxEntries[i-1] + 1</code> and <code>maxEntries[i]</code>, 
inclusive.
    * By convention, <code>maxEntries[-1]=-1</code>.
    */
-  protected int[] maxEntries = null;
+  protected IntList maxEntries = null;
   
   /**
+   * A cache of {@link DocumentData} values used for returning the various
+   * document details (title, URI, text).
+   */
+  protected Long2ObjectLinkedOpenHashMap<DocumentData> documentCache;
+  
+  /**
+   * Document data objects that have been written to the zip file currently 
+   * being created and have to be kept in RAM until the file is closed and can 
+   * be opened in read mode. 
+   */
+  protected Long2ObjectLinkedOpenHashMap<DocumentData> inputBuffer;
+  
+  /**
    * Flag that gets set to true when the collection is closed (and blocks all 
    * subsequent operations).
    */
   private volatile boolean closed = false; 
   
+
+  
   /**
+   * The maximum number of bytes to write to a single zip file.
+   */
+  public static final long ZIP_FILE_MAX_SIZE = 2 * 1000 * 1000 * 1000; 
+    
+  /**
+   * The maximum number of entries to write to a single zip file.
+   * Java 1.5 only supports 2^16 entries, so the default limit is set below 
that.
+   * If running on Java 1.6, this limit can safely be increased, however, the
+   * total size of the file (as specified by {@link #ZIP_FILE_MAX_SIZE}) 
should 
+   * not be greater than 4GB, in either case.
+   */
+  public static final int ZIP_FILE_MAX_ENTRIES = 65530;
+  
+  /**
+   * The zip file managed by this collection.
+   */
+  protected ZipOutputStream zipOuputStream;
+  
+  /**
+   * The zip file to which we are currently writing.
+   */
+  protected File zipFile;
+  
+  /**
+   * The number of entries written so far to the current zip file.
+   */
+  protected int currentEntries;
+  
+  /**
+   * The amount of bytes written so far to the current zip file.
+   */
+  protected long currentLength;
+  
+  /**
+   * A {@link ByteArrayOutputStream} used to temporarily store serialised 
+   * document data objects.
+   */
+  protected ByteArrayOutputStream byteArrayOS;
+  
+  /**
+   * The ID for the next document to be written. This value is initialised to 0
+   * and then is automatically incremented whenever a new document is written.
+   */
+  protected int documentId;
+  
+
+  /**
+   * The unique ID of the current zip file.
+   */
+  protected int zipFileId;
+  
+
+
+  
+  /**
+   * Writes a new document to the underlying zip file. The documents added 
+   * through this method will get automatically generated names starting from 
+   * &quot;0&quot;, and continuing with &quot;1&quot;, &quot;2&quot;, etc.   
+   * @param document
+   * @throws IndexException if there are any problems while accessing the zip 
+   * collection file(s).
+   */
+  public void writeDocument(DocumentData document) throws IndexException{
+    if(zipFile == null) openZipFile();
+    try{
+      //write the new document to the byte array
+      ObjectOutputStream objectOutStream = new ObjectOutputStream(byteArrayOS);
+      objectOutStream.writeObject(document);
+      objectOutStream.close();
+
+      //see if we're about to go over the limits
+      if(currentEntries >= ZIP_FILE_MAX_ENTRIES || 
+         currentLength + byteArrayOS.size()  >= ZIP_FILE_MAX_SIZE){
+        //move to the next zip file
+        closeZipFile();
+        zipFileId ++;
+        openZipFile();
+      }
+
+      //create a new entry in the current zip file
+      ZipEntry entry = new ZipEntry(Integer.toString(documentId++));
+      zipOuputStream.putNextEntry(entry);
+      //write the data
+      byteArrayOS.writeTo(zipOuputStream);
+      zipOuputStream.closeEntry();
+      currentLength += entry.getCompressedSize();
+      //clean up the byte array for next time
+      byteArrayOS.reset();
+      currentEntries++;
+    }catch(IOException e){
+      throw new IndexException("Problem while accessing the collection file", 
e);
+    }
+  }
+  
+  /**
+   * Opens the current zip file and sets the {@link #zipFile} and 
+   * {@link #zipOuputStream} values accordingly. 
+   * @throws IndexException if the collection zip file already exists, or 
cannot
+   * be opened for writing.
+   */
+  protected void openZipFile() throws IndexException{
+    zipFile = new File(indexDir, 
+            Indexer.MIMIR_COLLECTION_BASENAME + 
+            "-" + zipFileId +
+            Indexer.MIMIR_COLLECTION_EXTENSION);
+    if(zipFile.exists()) throw new IndexException("Collection zip file (" + 
+            zipFile.getAbsolutePath() + ") already exists!");
+    
+    try {
+      zipOuputStream = new ZipOutputStream(new BufferedOutputStream(
+              new  FileOutputStream(zipFile)));
+    } catch(FileNotFoundException e) {
+      throw new IndexException("Cannot write to collection zip file (" + 
+              zipFile.getAbsolutePath() + ")", e);
+    }
+    currentEntries = 0;
+    currentLength = 0;
+  }
+  
+  /**
+   * Closes the current zip file.
+   * @throws IOException 
+   */
+  protected void closeZipFile() throws IOException{
+    if(zipOuputStream != null) zipOuputStream.close();
+  }
+  
+
+  
+  /**
    * Opens a zip file and creates a DocumentCollection object for accessing 
the 
    * document data.
    * @param indexDirectory
@@ -103,6 +256,12 @@
    */
   public DocumentCollection(File indexDirectory) throws IndexException {
     this.indexDir = indexDirectory;
+    // prepare the document cache
+    documentCache = new Long2ObjectLinkedOpenHashMap<DocumentData>();
+    
+    byteArrayOS = new ByteArrayOutputStream();
+    documentId = 0;
+    zipFileId = 0;
   }
   
   /**
@@ -127,21 +286,23 @@
         return getZipFileId(o1.getName()) - getZipFileId(o2.getName());
       }
     });
-    zipFiles = new ZipFile[collectionFiles.length];
-    maxEntries = new int[collectionFiles.length];
+    zipFiles = new ArrayList<ZipFile>(collectionFiles.length);
+    
+    int[] maxEntriesArr = new int[collectionFiles.length];
     for(int  i = 0; i  < collectionFiles.length; i++){
       try {
         //for each file, open a ZipFile, parse the entries, set the maxEntry 
value.
-        zipFiles[i] = new ZipFile(collectionFiles[i]);
-        Enumeration<? extends ZipEntry> entries = zipFiles[i].entries();
-        maxEntries[i] = -1;
+        ZipFile aZipFile = new ZipFile(collectionFiles[i]); 
+        zipFiles.add(aZipFile);
+        Enumeration<? extends ZipEntry> entries = aZipFile.entries();
+        maxEntriesArr[i] = -1;
         while(entries.hasMoreElements()){
           ZipEntry anEntry = entries.nextElement();
           String entryName = anEntry.getName();
           try {
             int entryId = Integer.parseInt(entryName);
             //sanity check
-            if(i > 0 && entryId <= maxEntries[i-1]){
+            if(i > 0 && entryId <= maxEntriesArr[i-1]){
               throw new IndexException(
                       "Invalid entries distribution: collection file " + 
                       collectionFiles[i].getAbsolutePath() + 
@@ -150,7 +311,7 @@
                       "already seen in a previous collection file!");
             }
             //update the current maximum
-            if(entryId > maxEntries[i]) maxEntries[i] = entryId;
+            if(entryId > maxEntriesArr[i]) maxEntriesArr[i] = entryId;
           } catch(NumberFormatException e) {
             //not parseable -> we'll ignore this entry.
             logger.warn("Unparseable zip entry name: " + entryName);
@@ -164,8 +325,8 @@
                 collectionFiles[i].getAbsolutePath(), e);
       }
     }
-    
-    logger.info("Opened zip collection: maxEntries = " + 
Arrays.toString(maxEntries));
+    maxEntries = new IntArrayList(maxEntriesArr);
+    logger.info("Opened zip collection: maxEntries = " + 
Arrays.toString(maxEntriesArr));
   }
   
   /**
@@ -178,55 +339,68 @@
   public DocumentData getDocumentData(long documentID) throws IndexException{
     if(closed) throw new IllegalStateException(
             "This document collection has already been closed!");
-    if(zipFiles == null){
-      //open the zip files, parse their catalogues and update the values in 
-      //maxEntries
-      openCollectionFiles();
-    }
-    //locate the right zip file
-    int zipFileId = 0;
-    while(zipFileId < maxEntries.length && documentID > maxEntries[zipFileId]){
-      zipFileId++;
-    }
-    if(zipFileId >= maxEntries.length){
-      //entry not found (entry number too large)
-      throw new NoSuchElementException("No entry found for document ID " + 
-              documentID + ". Document ID too large for this collection!");
-    }
     
-    ZipEntry entry = zipFiles[zipFileId].getEntry(Long.toString(documentID));
-    if(entry == null) 
-      throw new NoSuchElementException("No entry found for document ID " + 
documentID);
-    try {
-      ObjectInputStream ois = new 
ObjectInputStream(zipFiles[zipFileId].getInputStream(entry));
+    DocumentData documentData = documentCache.getAndMoveToFirst(documentID);
+    if(documentData == null) {
+      // cache miss
+      if(zipFiles == null){
+        //open the zip files, parse their catalogues and update the values in 
+        //maxEntries
+        openCollectionFiles();
+      }
+      //locate the right zip file
+      int zipFileId = 0;
+      while(zipFileId < maxEntries.size() && documentID > 
maxEntries.get(zipFileId)){
+        zipFileId++;
+      }
+      if(zipFileId >= maxEntries.size()){
+        //entry not found (entry number too large)
+        throw new NoSuchElementException("No entry found for document ID " + 
+                documentID + ". Document ID too large for this collection!");
+      }
       
-      DocumentData docData = (DocumentData) ois.readObject();
-      ois.close();
-      return docData;
-    } catch(ClassNotFoundException e) {
-      //invalid data read from the zip file
-      throw new IndexException("Invalid data read from zip file!", e);
-    } catch(IOException e) {
-      throw new IndexException("Exception reading zip file!", e);
+      ZipEntry entry = 
zipFiles.get(zipFileId).getEntry(Long.toString(documentID));
+      if(entry == null) throw new NoSuchElementException(
+          "No entry found for document ID " + documentID);
+      try {
+        ObjectInputStream ois = new 
ObjectInputStream(zipFiles.get(zipFileId).getInputStream(entry));
+        documentData = (DocumentData) ois.readObject();
+        ois.close();
+        documentCache.putAndMoveToFirst(documentID, documentData);
+        if(documentCache.size() > DOCUMENT_DATA_CACHE_SIZE) {
+          documentCache.removeLast();
+        }
+      } catch(ClassNotFoundException e) {
+        //invalid data read from the zip file
+        throw new IndexException("Invalid data read from zip file!", e);
+      } catch(IOException e) {
+        throw new IndexException("Exception reading zip file!", e);
+      }
     }
+    return documentData;  
   }
   
   /**
    * Close this document collection and release all allocated resources (such 
    * as open file handles). 
+   * @throws IOException 
    */
-  public void close() {
+  public void close() throws IOException {
+    // close the writer
+    closeZipFile();
+    // close the reader
     closed = true;
     if(zipFiles != null){
-      for(int i = 0; i < zipFiles.length; i++){
+      for(ZipFile aZipFile : zipFiles){
         try {
-          zipFiles[i].close();
-          zipFiles[i] = null;
+          aZipFile.close();
         } catch(IOException e) {
           // ignore
         }
       }
+      zipFiles.clear();
       zipFiles = null;      
     }
+    documentCache.clear();
   }
 }

Deleted: 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
===================================================================
--- 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
 2014-01-22 17:30:47 UTC (rev 17244)
+++ 
mimir/branches/5.0/mimir-core/src/gate/mimir/index/mg4j/zipcollection/DocumentCollectionWriter.java
 2014-01-23 16:52:34 UTC (rev 17245)
@@ -1,195 +0,0 @@
-/*
- *  DocumentCollectionWriter.java
- *
- *  Copyright (c) 2007-2011, The University of Sheffield.
- *
- *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
- *  and is free software, licenced under the GNU Lesser General Public License,
- *  Version 3, June 2007 (also included with this distribution as file
- *  LICENCE-LGPL3.html).
- *
- *  Valentin Tablan, 15 Apr 2009
- *
- *  $Id$
- */
-package gate.mimir.index.mg4j.zipcollection;
-
-
-import gate.mimir.index.IndexException;
-import gate.mimir.index.Indexer;
-
-import java.io.*;
-
-import org.apache.log4j.Logger;
-
-
-import java.util.zip.*;
-
-/**
- * A writer for Mimir zip document collections.
- * A Mimir document collection is a set of serialised {@link DocumentData} 
- * objects stored in one or more zip files.
- * To create a Mimir document collection, create a writer (pointing it to the 
- * top level index directory), add new documents using the 
- * {@link #writeDocument(DocumentData)} method, and close the collection at the
- * end, by calling the {@link #close()} method.
- * 
- * This writer will create one or more zip files as required, never writing 
more
- * than {@link #ZIP_FILE_MAX_ENTRIES} entries, or more than 
- * {@link #ZIP_FILE_MAX_SIZE} bytes to a single file.
- */
-public class DocumentCollectionWriter {
-
-  
-  /**
-   * The maximum number of bytes to write to a single zip file.
-   */
-  public static final long ZIP_FILE_MAX_SIZE = 2 * 1000 * 1000 * 1000; 
-    
-  /**
-   * The maximum number of entries to write to a single zip file.
-   * Java 1.5 only support 2^16 entries, so the default limit is set below 
that.
-   * If running on Java 1.6, this limit can safely be increased, however, the
-   * total size of the file (as specified by {@link #ZIP_FILE_MAX_SIZE}) 
should 
-   * not be greater than 4GB, in either case.
-   */
-  public static final int ZIP_FILE_MAX_ENTRIES = 65530;
-  
-  private static Logger logger = 
Logger.getLogger(DocumentCollectionWriter.class);
-  
-  /**
-   * The zip file managed by this collection.
-   */
-  protected ZipOutputStream zipOuputStream;
-  
-  /**
-   * The zip file to which we are currently writing.
-   */
-  protected File zipFile;
-  
-  /**
-   * The top-level index directory.
-   */
-  protected File indexDir;
-  /**
-   * The number of entries written so far to the current zip file.
-   */
-  protected int currentEntries;
-  
-  /**
-   * The amount of bytes written so far to the current zip file.
-   */
-  protected long currentLength;
-  
-  /**
-   * A {@link ByteArrayOutputStream} used to temporarily store serialised 
-   * document data objects.
-   */
-  protected ByteArrayOutputStream byteArrayOS;
-  
-  /**
-   * The ID for the next document to be written. This value is initialised to 0
-   * and then is automatically incremented whenever anew document is written.
-   */
-  protected int documentId;
-  
-
-  /**
-   * The unique ID of the current zip file.
-   */
-  protected int zipFileId;
-  
-  /**
-   * Creates a new DocumentCollectionWriter for the specified index.
-   * @param indexDir the top level index directory.
-   */
-  public DocumentCollectionWriter(File indexDir){
-    this.indexDir = indexDir;
-    byteArrayOS = new ByteArrayOutputStream();
-    documentId = 0;
-    zipFileId = 0;
-  }
-
-  
-  /**
-   * Writes a new document to the underlying zip file. The documents added 
-   * through this method will get automatically generated names starting from 
-   * &quot;0&quot;, and continuing with &quot;1&quot;, &quot;2&quot;, etc.   
-   * @param document
-   * @throws IndexException if there are any problems while accessing the zip 
-   * collection file(s).
-   */
-  public void writeDocument(DocumentData document) throws IndexException{
-    if(zipFile == null) openZipFile();
-    try{
-      //write the new document to the byte array
-      ObjectOutputStream objectOutStream = new ObjectOutputStream(byteArrayOS);
-      objectOutStream.writeObject(document);
-      objectOutStream.close();
-
-      //see if we're about to go over the limits
-      if(currentEntries >= ZIP_FILE_MAX_ENTRIES || 
-         currentLength + byteArrayOS.size()  >= ZIP_FILE_MAX_SIZE){
-        //move to the next zip file
-        closeZipFile();
-        zipFileId ++;
-        openZipFile();
-      }
-
-      //create a new entry in the current zip file
-      ZipEntry entry = new ZipEntry(Integer.toString(documentId++));
-      zipOuputStream.putNextEntry(entry);
-      //write the data
-      byteArrayOS.writeTo(zipOuputStream);
-      zipOuputStream.closeEntry();
-      currentLength += entry.getCompressedSize();
-      //clean up the byte array for next time
-      byteArrayOS.reset();
-      currentEntries++;
-    }catch(IOException e){
-      throw new IndexException("Problem while accessing the collection file", 
e);
-    }
-  }
-  
-  /**
-   * Opens the current zip file and sets the {@link #zipFile} and 
-   * {@link #zipOuputStream} values accordingly. 
-   * @throws IndexException if the collection zip file already exists, or 
cannot
-   * be opened for writing.
-   */
-  protected void openZipFile() throws IndexException{
-    zipFile = new File(indexDir, 
-            Indexer.MIMIR_COLLECTION_BASENAME + 
-            "-" + zipFileId +
-            Indexer.MIMIR_COLLECTION_EXTENSION);
-    if(zipFile.exists()) throw new IndexException("Collection zip file (" + 
-            zipFile.getAbsolutePath() + ") already exists!");
-    
-    try {
-      zipOuputStream = new ZipOutputStream(new BufferedOutputStream(
-              new  FileOutputStream(zipFile)));
-    } catch(FileNotFoundException e) {
-      throw new IndexException("Cannot write to collection zip file (" + 
-              zipFile.getAbsolutePath() + ")", e);
-    }
-    currentEntries = 0;
-    currentLength = 0;
-  }
-  
-  /**
-   * Closes the current zip file.
-   * @throws IOException 
-   */
-  protected void closeZipFile() throws IOException{
-    if(zipOuputStream != null) zipOuputStream.close();
-  }
-  
-  /**
-   * Closes this writer (and the underlying zip file).
-   * @throws IOException 
-   */
-  public void close() throws IOException{
-    closeZipFile();
-  }
-  
-}

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[17245] mimir/branches/5.0/mimir-core/src/gate/mimir

Reply via email to