Revision: 17466
http://sourceforge.net/p/gate/code/17466
Author: valyt
Date: 2014-02-27 12:48:30 +0000 (Thu, 27 Feb 2014)
Log Message:
-----------
First bash at an automatic index upgrade routine.
Modified Paths:
--------------
mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
Added Paths:
-----------
mimir/branches/5.0/mimir-core/src/gate/mimir/util/IndexUpgrader.java
Modified: mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
2014-02-27 09:44:39 UTC (rev 17465)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/IndexConfig.java
2014-02-27 12:48:30 UTC (rev 17466)
@@ -299,7 +299,24 @@
this.options = new HashMap<String, String>();
}
+
+
/**
+ * @return the value of the formatVersion field. See {@link #FORMAT_VERSION}.
+ */
+ public int getFormatVersion() {
+ return formatVersion;
+ }
+
+ /**
+ * Sets the value of the formatVersion field. See {@link #FORMAT_VERSION}.
+ * @param formatVersion the new value for the formatVersion field.
+ */
+ public void setFormatVersion(int formatVersion) {
+ this.formatVersion = formatVersion;
+ }
+
+ /**
* Gets the top level directory of an index.
*
* @return a {@link File} object.
Modified:
mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
2014-02-27 09:44:39 UTC (rev 17465)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/index/DocumentCollection.java
2014-02-27 12:48:30 UTC (rev 17466)
@@ -62,7 +62,7 @@
/**
* Class representing one of the collection (zip) files.
*/
- protected static class CollectionFile implements Comparable<CollectionFile> {
+ public static class CollectionFile implements Comparable<CollectionFile> {
/**
* The filename for the zip collection.
*/
@@ -87,7 +87,7 @@
"\\Q" + MIMIR_COLLECTION_BASENAME +
"\\E((\\d+)(?:-([-0-9a-zA-Z]+))?)\\Q"+
MIMIR_COLLECTION_EXTENSION + "\\E");
- protected static FilenameFilter FILENAME_FILTER = new FilenameFilter() {
+ public static FilenameFilter FILENAME_FILTER = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return MIMIR_COLLECTION_PATTERN.matcher(name).matches();
Added: mimir/branches/5.0/mimir-core/src/gate/mimir/util/IndexUpgrader.java
===================================================================
--- mimir/branches/5.0/mimir-core/src/gate/mimir/util/IndexUpgrader.java
(rev 0)
+++ mimir/branches/5.0/mimir-core/src/gate/mimir/util/IndexUpgrader.java
2014-02-27 12:48:30 UTC (rev 17466)
@@ -0,0 +1,286 @@
+/*
+ * IndexUpgrader.java
+ *
+ * Copyright (c) 2007-2014, The University of Sheffield.
+ *
+ * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
+ * and is free software, licenced under the GNU Lesser General Public License,
+ * Version 3, June 2007 (also included with this distribution as file
+ * LICENCE-LGPL3.html).
+ *
+ * Valentin Tablan, 27 Feb 2014
+ *
+ * $Id$
+ */
+package gate.mimir.util;
+
+import gate.Gate;
+import gate.mimir.IndexConfig;
+import gate.mimir.IndexConfig.SemanticIndexerConfig;
+import gate.mimir.MimirIndex;
+import gate.mimir.SemanticAnnotationHelper;
+import gate.mimir.index.AtomicIndex;
+import gate.mimir.index.DocumentCollection;
+import gate.mimir.index.IndexException;
+import it.unimi.di.big.mg4j.index.DiskBasedIndex;
+import it.unimi.di.big.mg4j.index.cluster.DocumentalCluster;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.util.BloomFilter;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Implementation of an algorithm to upgrade a 4.x Mímir index to the
+ * format used by version 5.0.
+ *
+ * The old-format files are taken from the <code>mg4j</code> and
+ * <code>db</code> sub-directories of the top level index directory and are
+ * moved (renamed) into one <code>token-N</code> or <code>mention-N</code>
+ * directory per sub-index. Files are moved, not copied, and there is no
+ * roll-back: a failure part-way through leaves the index partially
+ * converted.
+ */
+public class IndexUpgrader {
+
+  protected static Logger logger = Logger.getLogger(IndexUpgrader.class);
+
+  /**
+   * A minimal set of files required for a valid index.
+   */
+  protected static final String[] REQUIRED_INDEX_FILE_EXTENSIONS =
+      new String[] {
+        DiskBasedIndex.INDEX_EXTENSION,
+        DiskBasedIndex.POSITIONS_EXTENSION,
+        DiskBasedIndex.TERMS_EXTENSION,
+        DiskBasedIndex.OFFSETS_EXTENSION
+      };
+
+  /**
+   * A minimal set of files required for a valid direct index.
+   */
+  protected static final String[] REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS =
+      new String[] {
+        DiskBasedIndex.INDEX_EXTENSION,
+        DiskBasedIndex.TERMS_EXTENSION,
+        DiskBasedIndex.OFFSETS_EXTENSION
+      };
+
+  /**
+   * Name of the sub-directory (of the top level index directory) holding the
+   * old-format collection and index files.
+   */
+  private static final String SOURCE_DIR_NAME = "mg4j";
+
+  /**
+   * Upgrades the index found in the given directory. All the checks are
+   * performed before any file is moved; after that the collection files, the
+   * token sub-indexes and the mention sub-indexes are converted, in that
+   * order.
+   *
+   * @param indexDirectory the top level directory of the index to upgrade.
+   * @throws IOException if reading the index configuration or copying a
+   *           file fails.
+   * @throws IndexException if the index format version is not between 4 and
+   *           6 (inclusive), if a location needed by the upgraded index
+   *           already exists, if the source directory is missing or
+   *           unreadable, if a mentions index uses a helper other than the
+   *           DB H2 one, or if a file cannot be moved.
+   */
+  public void upgradeIndex(File indexDirectory) throws IOException,
+      IndexException {
+    IndexConfig indexConfig = IndexConfig.readConfigFromFile(new File(
+        indexDirectory, MimirIndex.INDEX_CONFIG_FILENAME));
+    // test the version
+    int formatVersion = indexConfig.getFormatVersion();
+    if(formatVersion > 6 || formatVersion < 4) {
+      throw new IndexException(
+          "Unsupported index version: " + formatVersion);
+    }
+    int tokenIndexCount = indexConfig.getTokenIndexers().length;
+    SemanticIndexerConfig[] semanticIndexers =
+        indexConfig.getSemanticIndexers();
+
+    // check that none of the directories to be created exist already
+    for(int i = 0; i < tokenIndexCount; i++) {
+      checkDoesNotExist(new File(indexDirectory, "token-" + i));
+    }
+    for(int i = 0; i < semanticIndexers.length; i++) {
+      checkDoesNotExist(new File(indexDirectory, "mention-" + i));
+    }
+
+    // check access
+    File sourceDir = new File(indexDirectory, SOURCE_DIR_NAME);
+    if(!sourceDir.isDirectory()) {
+      throw new IndexException(
+          "Invalid index: could not find source directory at " +
+          sourceDir.getAbsolutePath());
+    }
+    if(!sourceDir.canRead()) {
+      throw new IndexException("Could not read source directory at " +
+          sourceDir.getAbsolutePath());
+    }
+
+    // check that we know how to deal with the S-A-H implementations
+    Class<? extends SemanticAnnotationHelper> dbSahClass;
+    try {
+      dbSahClass = Class.forName(
+          "gate.mimir.db.DBSemanticAnnotationHelper",
+          true, Gate.getClassLoader()).asSubclass(
+          SemanticAnnotationHelper.class);
+    } catch(ClassNotFoundException e) {
+      throw new IndexException("Could not find the DB S-A-H class. "
+          + "Is the 'db-h2' plugin loaded?", e);
+    }
+    for(int subIndexIdx = 0; subIndexIdx < semanticIndexers.length;
+        subIndexIdx++) {
+      SemanticIndexerConfig sic = semanticIndexers[subIndexIdx];
+      for(SemanticAnnotationHelper sah : sic.getHelpers()) {
+        // look through any chain of delegating helpers to the real one
+        while(sah instanceof DelegatingSemanticAnnotationHelper) {
+          sah = ((DelegatingSemanticAnnotationHelper)sah).getDelegate();
+        }
+        if(!dbSahClass.isAssignableFrom(sah.getClass())) {
+          throw new IndexException("Cannot convert mentions index mentions-" +
+              subIndexIdx + " because it does not use the DB H2 " +
+              "Annotation Helper, which is the only one supported by " +
+              "this automatic upgrade process");
+        }
+      }
+    }
+
+    // move files
+    // collection files
+    File[] collectionFiles = listFilesOrFail(sourceDir,
+        DocumentCollection.CollectionFile.FILENAME_FILTER);
+    for(File aColFile : collectionFiles) {
+      moveFile(aColFile, new File(indexDirectory, aColFile.getName()));
+    }
+    // token indexes
+    for(int subIndexIdx = 0; subIndexIdx < tokenIndexCount; subIndexIdx++) {
+      upgradeSubIndex(indexDirectory, subIndexIdx,
+          indexConfig.getTokenIndexers()[subIndexIdx].isDirectIndexEnabled(),
+          null);
+    }
+    // mention indexes
+    for(int subIndexIdx = 0; subIndexIdx < semanticIndexers.length;
+        subIndexIdx++) {
+      SemanticIndexerConfig sic = semanticIndexers[subIndexIdx];
+      upgradeSubIndex(indexDirectory, subIndexIdx,
+          sic.isDirectIndexEnabled(), sic);
+    }
+  }
+
+  /**
+   * Moves the files belonging to one sub-index and generates the Bloom
+   * filter file(s) for it.
+   *
+   * @param indexDirectory the top level index directory for the Mímir
+   *          index being upgraded.
+   * @param subIndexIdx the index (position) of the sub-index.
+   * @param direct does this sub-index have a direct index also?
+   * @param mentionsConfig if this is a mentions index, then this parameter
+   *          contains the mentions indexer config, null otherwise.
+   * @throws IndexException if the source directory cannot be listed, if
+   *           some required index files are missing, if a destination
+   *           directory cannot be created, or if a file cannot be moved.
+   * @throws IOException if copying the terms file fails.
+   */
+  protected void upgradeSubIndex(File indexDirectory, int subIndexIdx,
+      final boolean direct, SemanticIndexerConfig mentionsConfig)
+      throws IndexException, IOException {
+    File sourceDir = new File(indexDirectory, SOURCE_DIR_NAME);
+    final String inputFilePrefix =
+        (mentionsConfig != null ? "mimir-mentions-" : "mimir-token-") +
+        subIndexIdx;
+    final String directInputFilePrefix =
+        inputFilePrefix + AtomicIndex.DIRECT_INDEX_NAME_SUFFIX;
+
+    // the trailing "." stops e.g. prefix "...-1" from matching "...-10.*"
+    File[] atomicIndexFiles = listFilesOrFail(sourceDir,
+        new FilenameFilter() {
+          @Override
+          public boolean accept(File dir, String name) {
+            return name.startsWith(inputFilePrefix + ".") ||
+                (direct && name.startsWith(directInputFilePrefix + "."));
+          }
+        });
+
+    // sanity checks: tick off each required extension as we see its file
+    Set<String> requiredExtensions = new HashSet<String>(
+        Arrays.asList(REQUIRED_INDEX_FILE_EXTENSIONS));
+    Set<String> requiredDirectExtensions = new HashSet<String>(
+        Arrays.asList(REQUIRED_DIRECT_INDEX_FILE_EXTENSIONS));
+    for(File aFile : atomicIndexFiles) {
+      String extension = aFile.getName().substring(inputFilePrefix.length());
+      if(direct &&
+          extension.startsWith(AtomicIndex.DIRECT_INDEX_NAME_SUFFIX)) {
+        requiredDirectExtensions.remove(extension.substring(
+            AtomicIndex.DIRECT_INDEX_NAME_SUFFIX.length()));
+      } else {
+        requiredExtensions.remove(extension);
+      }
+    }
+    // check that we've seen all files we wanted
+    if(!requiredExtensions.isEmpty() ||
+        (direct && !requiredDirectExtensions.isEmpty())) {
+      // not all required files were found
+      StringBuilder str = new StringBuilder(
+          "Some required files were not found for index '");
+      str.append(inputFilePrefix).append("': ");
+      for(String extension : requiredExtensions) {
+        str.append(new File(sourceDir,
+            inputFilePrefix + extension).getAbsolutePath());
+        str.append("\n");
+      }
+      if(direct) {
+        for(String extension : requiredDirectExtensions) {
+          // direct index files carry the direct suffix in their name
+          str.append(new File(sourceDir,
+              directInputFilePrefix + extension).getAbsolutePath());
+          str.append("\n");
+        }
+      }
+      throw new IndexException(str.toString());
+    }
+
+    // all tests passed - start creating the new directories
+    String outputFilePrefix =
+        (mentionsConfig != null ? "mention-" : "token-") + subIndexIdx;
+    File atomicIndexDir = new File(indexDirectory, outputFilePrefix);
+    File headDir = new File(atomicIndexDir, AtomicIndex.HEAD_FILE_NAME);
+    if(!headDir.mkdirs()) {
+      throw new IndexException(
+          "Location required by upgraded index could not be created: " +
+          headDir.getAbsolutePath());
+    }
+    for(File sourceFile : atomicIndexFiles) {
+      // the "extension" keeps the direct suffix, if any, so direct index
+      // files stay distinguishable after the move
+      String extension =
+          sourceFile.getName().substring(inputFilePrefix.length());
+      moveFile(sourceFile, new File(headDir, outputFilePrefix + extension));
+    }
+
+    // create Bloom filter
+    // the terms file is guaranteed to exist, as tested already
+    File termsFile = new File(headDir, outputFilePrefix +
+        DiskBasedIndex.TERMS_EXTENSION);
+    // output file for the generated Bloom filter
+    File bloomFile = new File(headDir, outputFilePrefix +
+        DocumentalCluster.BLOOM_EXTENSION);
+    AtomicIndex.generateTermMap(termsFile, null, bloomFile);
+
+    if(direct) {
+      // create the direct.terms file by copying the terms file from
+      // the **inverted** index in head
+      File dest = new File(atomicIndexDir, AtomicIndex.DIRECT_TERMS_FILENAME);
+      Files.copy(termsFile.toPath(), dest.toPath(),
+          StandardCopyOption.COPY_ATTRIBUTES);
+      // create direct Bloom filter
+      // the direct terms file is guaranteed to exist, as tested already
+      File dirTermsFile = new File(headDir, outputFilePrefix +
+          AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
+          DiskBasedIndex.TERMS_EXTENSION);
+      // output file for the generated direct Bloom filter
+      File dirBloomFile = new File(headDir, outputFilePrefix +
+          AtomicIndex.DIRECT_INDEX_NAME_SUFFIX +
+          DocumentalCluster.BLOOM_EXTENSION);
+      AtomicIndex.generateTermMap(dirTermsFile, null, dirBloomFile);
+    }
+
+    // move the DB files
+    if(mentionsConfig != null) {
+      // We know that the DB-H2 S-A-H was used, as upgradeIndex has already
+      // tested for that
+      File sourceDBDir = new File(indexDirectory, "db");
+      File destDBDir = new File(atomicIndexDir, "db");
+      if(!destDBDir.mkdirs()) {
+        throw new IndexException(
+            "Location required by upgraded index could not be created: " +
+            destDBDir.getAbsolutePath());
+      }
+      for(String annType : mentionsConfig.getAnnotationTypes()) {
+        // DB file name: annotation type with every character other than
+        // alphanumerics and '_' replaced by '_'
+        String tableBaseName = annType.replaceAll("[^\\p{Alnum}_]", "_");
+        moveFile(new File(sourceDBDir, tableBaseName + ".h2.db"),
+            new File(destDBDir, tableBaseName + ".h2.db"));
+      }
+    }
+  }
+
+  /**
+   * Fails if the given location, needed by the upgraded index, is already
+   * present on disk.
+   */
+  private static void checkDoesNotExist(File location)
+      throws IndexException {
+    if(location.exists()) {
+      throw new IndexException(
+          "Location required by upgraded index already exists: " +
+          location.getAbsolutePath());
+    }
+  }
+
+  /**
+   * Lists the files in a directory that are accepted by a filter, failing
+   * with an {@link IndexException} (instead of returning null) when the
+   * directory cannot be listed.
+   */
+  private static File[] listFilesOrFail(File directory,
+      FilenameFilter filter) throws IndexException {
+    File[] files = directory.listFiles(filter);
+    if(files == null) {
+      throw new IndexException("Could not list the contents of directory " +
+          directory.getAbsolutePath());
+    }
+    return files;
+  }
+
+  /**
+   * Renames a file to its new location, failing with an
+   * {@link IndexException} if the rename is not successful.
+   */
+  private static void moveFile(File source, File dest)
+      throws IndexException {
+    if(!source.renameTo(dest)) {
+      throw new IndexException("Could not rename " +
+          source.getAbsolutePath() + " to " + dest.getAbsolutePath());
+    }
+  }
+
+}
Property changes on:
mimir/branches/5.0/mimir-core/src/gate/mimir/util/IndexUpgrader.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Flow-based real-time traffic analytics software. Cisco certified tool.
Monitor traffic, SLAs, QoS, Medianet, WAAS etc. with NetFlow Analyzer
Customize your own dashboards, set traffic alerts and generate reports.
Network behavioral analysis & security monitoring. All-in-one tool.
http://pubads.g.doubleclick.net/gampad/clk?id=126839071&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs