Further refactoring of tdbloader2 scripts (JENA-977)

- Proper usage summaries in all scripts
- -k/--keep-work option instead of hidden environment variable
  for keeping work
- Short forms for all options


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/a96b0164
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/a96b0164
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/a96b0164

Branch: refs/heads/JENA-977
Commit: a96b0164c43142791ac030e5332b3f54df6fb4ba
Parents: 7b61a14
Author: Rob Vesse <[email protected]>
Authored: Fri Jun 26 12:25:57 2015 +0100
Committer: Rob Vesse <[email protected]>
Committed: Fri Jun 26 16:30:53 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      |  72 ++++++++++++++++------
 apache-jena/bin/tdbloader2data  |  82 ++++++++++++++++++++-----
 apache-jena/bin/tdbloader2index | 116 +++++++++++++++++++++++++----------
 3 files changed, 204 insertions(+), 66 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 34ee029..9ff2727 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -17,24 +17,53 @@
 
 function printUsage() {
   cat << EOF
-Usage: tdbloader2 <Options> <Data>
+tdbloader2 - TDB Bulk Loader
 
-Options are as follows:
+Usage: tdbloader2 --loc <Directory> [Options] <Data> ...
 
+Bulk loader for TDB which manipulates the data files directly and so
+can only be used to create new databases.  This command relies on
+POSIX utilities so will only work on POSIX operating systems.
+
+If you wish to bulk load to an existing database please use tdbloader
+instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be a directory and must be empty, if a
+    non-existent path is specified it will be created as a new
+    directory.
+
+  <Data>
+    Specifies the path to one/more data files to load
+
+Common additional options are as follows:
+
+  -h
   --help
     Prints this help summary and exits
 
-  --loc <DatabaseDirectory>
-    Sets the location in which the database should be created
+Advanced additional options are as follows:
 
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+  -p <Phase>
   --phase <Phase>
     Sets the phase of the build to run, supported values are:
 
-      all    Full bulk load
-      data   Data phase only
-      index  Index phase only, requires the data phase to previously have been run
+      all      Full bulk load
+      data     Data phase only
+      index    Index phase only, requires the data phase to
+               previously have been run
 
-    When not specified defaults to all
+    When no phase is specified it defaults to all
 
 EOF
 }
@@ -86,12 +115,18 @@ export SORT_ARGS
 # Process arguments
 LOC=
 PHASE=
+KEEP_WORK=0
 
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -102,13 +137,13 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --phase)
+    -p|--phase)
       # Phase space separated
       shift
       PHASE="$1"
       shift
       ;;
-    --help)
+    -h|--help)
       # Help
       printUsage
       exit 0
@@ -123,9 +158,10 @@ done
 if [ -z "$PHASE" ]; then
   PHASE="all"
 fi
-
-#echo "Location is '$LOC'"
-#echo "Phase is '$PHASE'"
+COMMON_ARGS=
+if [ "$KEEP_WORK" = 1 ]; then  # forward the flag only when -k/--keep-work was given
+  COMMON_ARGS="--keep-work"
+fi
 
 log() { echo " $(date $DATE)" "$@" ; }
 
@@ -138,14 +174,14 @@ TIME1="$(date +%s)"
 
 case "$PHASE" in
   all)
-    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
-    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" || exit $?  # no exec: must return so the index phase runs
+    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   data)
-    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
     ;;
   index)
-    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   *)
     echo "Unrecognized phase $PHASE" 1>&2

http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 90200e4..5aceb27 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -18,6 +18,48 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+function printUsage() {  # Prints the help summary for tdbloader2data to stdout
+  cat << EOF
+tdbloader2data - TDB Bulk Loader - Data Phase
+
+Usage: tdbloader2data --loc <Directory> [Options] <Data> ...
+
+Bulk Loader for TDB which generates the Node Table.  This command
+relies on POSIX utilities so will only work on POSIX operating
+systems.
+
+This command can only be used to create new databases. If you wish to
+bulk load to an existing database please use tdbloader instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be a directory and must be empty, if a
+    non-existent path is specified it will be created as a new
+    directory.
+
+  <Data>
+    Specifies the path to one/more data files to load
+
+Common additional options are as follows:
+
+  -h
+  --help
+    Prints this help summary and exits
+
+Advanced additional options are as follows:
+
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+EOF
+}
+
 # Exit on error.
 set -e
 
@@ -29,24 +71,24 @@ log() { echo " $(date $DATE)" "$@" ; }
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
-## JVM Arguments
-JVM_ARGS=${JVM_ARGS:--Xmx1200M}
-
-# Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
-fi
-
-USAGE="Usage: tdbloader2data --loc location datafile ..."
 PKG=org.apache.jena.tdb.store.bulkloader2
 
+# Process Arguments
+LOC=
+KEEP_WORK=0
+
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      # This option is actually not used by this script but may be passed in
+      # by the parent tdbloader2 script
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -57,8 +99,8 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --help)
-      echo $USAGE
+    -h|--help)
+      printUsage
       exit 0
       ;;
     *)
@@ -91,8 +133,16 @@ fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
 FILES="$@"
-## Stdin?
-KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+## JVM Arguments
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
 
 # ---- Data loading phase
 log "Data Load Phase"

http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 5624854..2730af1 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -18,6 +18,45 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+function printUsage() {  # Prints the help summary for tdbloader2index to stdout
+  cat << EOF
+tdbloader2index - TDB Bulk Loader - Index Phase
+
+Usage: tdbloader2index --loc <Directory> [Options]
+
+Bulk Loader for TDB which generates the Index files based upon the
+temporary data files generated by tdbloader2data.  This command relies
+on POSIX utilities so will only work on POSIX operating systems.
+
+This command can only be used to create new databases. If you wish to
+bulk load to an existing database please use tdbloader instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be an existing directory containing the
+    temporary data files generated by tdbloader2data
+    (data-triples.tmp and data-quads.tmp).
+
+Common additional options are as follows:
+
+  -h
+  --help
+    Prints this help summary and exits
+
+Advanced additional options are as follows:
+
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+EOF
+}
+
 # Exit on error.
 set -e
 
@@ -30,25 +69,22 @@ TMP=$$
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
-##--parallel is not always available.
-SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
-JVM_ARGS=${JVM_ARGS:--Xmx1200M}
-
-# Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
-fi
-
-USAGE="Usage: tdbloader2index --loc location"
 PKG=org.apache.jena.tdb.store.bulkloader2
 
+# Process Arguments
+LOC=
+KEEP_WORK=0
+
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -59,8 +95,8 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --help)
-      echo $USAGE
+    -h|--help)
+      printUsage
       exit 0
       ;;
     *)
@@ -75,8 +111,6 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
 if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
-KEEPWORKFILES="${KEEPWORKFILES:-}"
-
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
@@ -89,14 +123,29 @@ if [ ! -e "$DATA_QUADS" ]; then
   exit 1
 fi
 
+##--parallel is not always available.
+SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
 # ---- Index intermediates
 ## All files are writtern S P O / G S P O columns per row but in different sort orders.
 log "Index Building Phase"
 
+# Check whether Pipe Viewer is available
+# Needs to temporarily disable exit on error
+set +e
 which pv >/dev/null 2>&1
 HAS_PV=$?
+set -e
 
-process_rows()
+generate_index()
 {
     local KEYS="$1"
     local DATA="$2"
@@ -109,6 +158,8 @@ process_rows()
          fi
 
     log "Creating Index $IDX"
+
+    # Sort the input data
     log "  Sort $IDX"
     if [ $HAS_PV = 0 ]; then
       # Use pv (pipe viewer) to monitor sort progress
@@ -120,14 +171,16 @@ process_rows()
       sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
     fi
     log "  Sort $IDX Completed"
+
+    # Build into an index
     log "  Build $IDX"
     rm -f "$LOC/$IDX.dat"
     rm -f "$LOC/$IDX.idn"
     java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
     log "  Build $IDX Completed"
-    # Remove intermediary file.
-    if [ "$KEEPWORKFILES" != "yes" ] 
-    then
+
+    # Remove work file unless keeping
+    if [ "$KEEP_WORK" = 0 ]; then
            rm "$WORK"
     fi
 }
@@ -137,28 +190,27 @@ K2="-k 2,2"
 K3="-k 3,3"
 K4="-k 4,4"
 
-process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
+generate_index "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
 
-process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
+generate_index "$K2 $K3 $K1" "$DATA_TRIPLES" POS
 
-process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
+generate_index "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
 
-process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
+generate_index "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
 
-process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
+generate_index "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
 
-process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
+generate_index "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
 
-process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
+generate_index "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
 
-process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
+generate_index "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
 
-process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
+generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
 
 log "Index Building Phase Completed"
 
 # ---- Clean up.
-if [ "$KEEPWORKFILES" != "yes" ] 
-then
+if [ "$KEEP_WORK" = 0 ]; then  # delete the data-phase temp files unless -k/--keep-work was given
     rm -f "$DATA_TRIPLES" "$DATA_QUADS" 
 fi

Reply via email to