Revision: 20207
http://sourceforge.net/p/gate/code/20207
Author: johann_p
Date: 2017-04-18 15:04:37 +0000 (Tue, 18 Apr 2017)
Log Message:
-----------
Make the -i option understand both directories and files.
If the path given for the -i option is a directory, the behaviour
is unchanged: the documents within that directory and all
subdirectories are processed. If the path is not a directory, it
is assumed to be a list file containing one relative document path
per line, where the relative paths are interpreted as
relative to the directory containing the list file.
The same relative paths are used for creating the output
documents relative to the output directory, if specified.
Modified Paths:
--------------
gcp/trunk/doc/gcp-guide.pdf
gcp/trunk/doc/install-and-run.tex
gcp/trunk/gcp-direct.sh
gcp/trunk/src/gate/cloud/batch/BatchRunner.java
Modified: gcp/trunk/doc/gcp-guide.pdf
===================================================================
(Binary files differ)
Modified: gcp/trunk/doc/install-and-run.tex
===================================================================
--- gcp/trunk/doc/install-and-run.tex 2017-04-11 15:11:32 UTC (rev 20206)
+++ gcp/trunk/doc/install-and-run.tex 2017-04-18 15:04:37 UTC (rev 20207)
@@ -140,11 +140,16 @@
(GATE XML format) or ``finf'' (FastInfoset format). To use FastInfoset the
GATE \verb!Format_FastInfoset! plugin must be loaded by the saved
application.
-\item[-i] the directory in which to look for the input files. All files in
+\item[-i] the directory in which to look for the input files or a file that
contains
+ relative path names to the input files. If this points to a directory, all
files in
this directory and any subdirectories will be processed (except for standard
backup and temporary file name patterns and source control metadata -- see
\url{http://ant.apache.org/manual/dirtasks.html#defaultexcludes} for
- details).
+ details). If this points to a file, the content of the file is expected to
be
+ one relative file path per line, using UTF-8 encoding. The file paths are
+ interpreted to be relative to the directory that contains the list file.
+ If processed documents are written, then this will also be their relative
+ path to the output directory.
\item[-o] (optional) the directory in which to place the output files. Each
input file
will generate an output file with the same name in the output directory.
If this option is missing, and the option \texttt{-b} is missing as well,
Modified: gcp/trunk/gcp-direct.sh
===================================================================
--- gcp/trunk/gcp-direct.sh 2017-04-11 15:11:32 UTC (rev 20206)
+++ gcp/trunk/gcp-direct.sh 2017-04-18 15:04:37 UTC (rev 20207)
@@ -81,5 +81,6 @@
fi
shift
done
-
+echo JVM parameters used ${jvmparams[@]}
+echo GCP parameters used ${gcpparams[@]}
"$JAVA_HOME/bin/java" -Dgcp.home="${SCRIPTDIR}"
-Djava.protocol.handler.pkgs=gate.cloud.util.protocols -cp "${GCP_CLASSPATH}"
"${jvmparams[@]}" gate.cloud.batch.BatchRunner "${gcpparams[@]}"
Modified: gcp/trunk/src/gate/cloud/batch/BatchRunner.java
===================================================================
--- gcp/trunk/src/gate/cloud/batch/BatchRunner.java 2017-04-11 15:11:32 UTC
(rev 20206)
+++ gcp/trunk/src/gate/cloud/batch/BatchRunner.java 2017-04-18 15:04:37 UTC
(rev 20207)
@@ -56,6 +56,8 @@
import org.apache.log4j.Logger;
import com.sun.jna.Platform;
+import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION;
+import gate.cloud.io.ListDocumentEnumerator;
import static gate.cloud.io.IOConstants.PARAM_COMPRESSION;
import static gate.cloud.io.IOConstants.PARAM_DOCUMENT_ROOT;
import static gate.cloud.io.IOConstants.PARAM_ENCODING;
@@ -64,6 +66,7 @@
import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP;
import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE;
import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_SNAPPY;
+import static gate.cloud.io.ListDocumentEnumerator.PARAM_FILE_NAME;
import gate.cloud.io.file.JSONOutputHandler;
import static
gate.cloud.io.file.JSONOutputHandler.PARAM_ANNOTATION_TYPE_PROPERTY;
import static gate.cloud.io.file.JSONOutputHandler.PARAM_GROUP_ENTITIES_BY;
@@ -520,7 +523,7 @@
// TODO: may be useful to be able to override the default user config and
// session files here?
options.addOption("b","batchFile",true,"Batch file (required, replaces -i,
-o, -x, -r, -I)");
- options.addOption("i","inputDirectory",true,"Input directory (required,
unless -b given)");
+ options.addOption("i","inputDirectoryOrFile",true,"Input directory or file
listing document IDs (required, unless -b given)");
options.addOption("f","outputFormat",true,"Output format, optional, one of
'xml'|'gatexml', 'finf', 'ser', 'json', default is 'finf'");
options.addOption("o","outputDirectory",true,"Output directory (not output
if missing)");
options.addOption("x","executePipeline",true,"Pipeline/application file to
execute (required, unless -b given)");
@@ -710,10 +713,24 @@
} else {
aBatch.setBatchId("GcpBatchId");
}
- // set the input Handler
+ // set the input Handler, depending on the value of the option "i":
+ // If this points to a directory, we process all matching files in
that
+ // directory, if it points to a file we process all files listed in
+ // that file by interpreting each line as a file path relative to
+ // the directory where the specified file is located in.
+ String fileOrDir = line.getOptionValue('i');
+ File fileOrDirFile = new File(fileOrDir);
+ if(!fileOrDirFile.exists()) {
+ throw new RuntimeException("ERROR file or directory does not
exist: "+fileOrDirFile.getAbsolutePath());
+ }
String inputHandlerClassName = "gate.cloud.io.file.FileInputHandler";
Map<String,String> configData = new HashMap<String, String>();
- configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));
+ if(fileOrDirFile.isDirectory()) {
+ configData.put(PARAM_DOCUMENT_ROOT, fileOrDir);
+ } else {
+ // if we have a file, use the parent directory
+ configData.put(PARAM_DOCUMENT_ROOT, fileOrDirFile.getParent());
+ }
if(line.hasOption("ci")) {
configData.put(PARAM_COMPRESSION,VALUE_COMPRESSION_GZIP);
} else if(line.hasOption("si")) {
@@ -781,18 +798,33 @@
outHandler.init();
// log.info("Have output handler: "+outHandler);
outHandlers.add(outHandler);
- } // if option -o is given
+ } else { // if option -o is given
+ log.info("WARNING: no option -o, processed documents are
discarded!");
+ }
aBatch.setOutputHandlers(outHandlers);
- String enumeratorClassName =
"gate.cloud.io.file.FileDocumentEnumerator";
+ String enumeratorClassName = null;
+ configData = new HashMap<String, String>();
+ if(fileOrDirFile.isDirectory()) {
+ log.info("Enumerating all file IDs in directory:
"+fileOrDirFile.getAbsolutePath());
+ enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator";
+ configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));
+ } else {
+ log.info("Reading file IDs from file:
"+fileOrDirFile.getAbsolutePath());
+ enumeratorClassName = "gate.cloud.io.ListDocumentEnumerator";
+ configData.put(PARAM_BATCH_FILE_LOCATION,new
File(".").getAbsolutePath());
+ configData.put(PARAM_FILE_NAME, fileOrDir);
+ configData.put(PARAM_ENCODING,"UTF-8");
+ }
Class<? extends DocumentEnumerator> enumeratorClass =
Class.forName(enumeratorClassName, true, Gate.getClassLoader())
.asSubclass(DocumentEnumerator.class);
- configData = new HashMap<String, String>();
- configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));
- List<DocumentID> docIds = new LinkedList<DocumentID>();
DocumentEnumerator enumerator = enumeratorClass.newInstance();
enumerator.config(configData);
enumerator.init();
+ // TODO: this should really not be done like this!
+ // Instead of reading the docIds in all at once, they should
+ // get streamed to the workers on demand, if at all possible?
+ List<DocumentID> docIds = new LinkedList<DocumentID>();
while(enumerator.hasNext()) {
DocumentID id = enumerator.next();
// log.info("Adding document: "+id);
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs