Further refactoring of tdbloader2 scripts (JENA-977)

- Proper usage summaries in all scripts
- New -k/--keep-work option, replacing the hidden KEEPWORKFILES environment variable, for keeping the temporary work files
- Short forms for all options
Project: http://git-wip-us.apache.org/repos/asf/jena/repo Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/a96b0164 Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/a96b0164 Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/a96b0164 Branch: refs/heads/JENA-977 Commit: a96b0164c43142791ac030e5332b3f54df6fb4ba Parents: 7b61a14 Author: Rob Vesse <[email protected]> Authored: Fri Jun 26 12:25:57 2015 +0100 Committer: Rob Vesse <[email protected]> Committed: Fri Jun 26 16:30:53 2015 +0100 ---------------------------------------------------------------------- apache-jena/bin/tdbloader2 | 72 ++++++++++++++++------ apache-jena/bin/tdbloader2data | 82 ++++++++++++++++++++----- apache-jena/bin/tdbloader2index | 116 +++++++++++++++++++++++++---------- 3 files changed, 204 insertions(+), 66 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2 ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2 index 34ee029..9ff2727 100755 --- a/apache-jena/bin/tdbloader2 +++ b/apache-jena/bin/tdbloader2 @@ -17,24 +17,53 @@ function printUsage() { cat << EOF -Usage: tdbloader2 <Options> <Data> +tdbloader2 - TDB Bulk Loader -Options are as follows: +Usage: tdbloader2 --loc <Directory> [Options] <Data> ... +Bulk loader for TDB which manipulates the data files directly and so +can only be used to create new databases. This command relies on +POSIX utilities so will only work on POSIX operating systems. + +If you wish to bulk load to an existing database please use tdbloader +instead. + +Required options are as follows: + + -l <DatabaseDirectory> + --loc <DatabaseDirectory> + Sets the location in which the database should be created. + + This location must be a directory and must be empty, if a + non-existent path is specified it will be created as a new + directory. 
+ + <Data> + Specifies the path to one/more data files to load + +Common additional options are as follows: + + -h --help Prints this help summary and exits - --loc <DatabaseDirectory> - Sets the location in which the database should be created +Advanced additional options are as follows: + -k + --keep-work + Keeps the temporary work files around after they are no longer + needed. May be useful for debugging. + + -p <Phase> --phase <Phase> Sets the phase of the build to run, supported values are: - all Full bulk load - data Data phase only - index Index phase only, requires the data phase to previously have been run + all Full bulk load + data Data phase only + index Index phase only, requires the data phase to + previously have been run - When not specified defaults to all + When no phase is specified it defaults to all EOF } @@ -86,12 +115,18 @@ export SORT_ARGS # Process arguments LOC= PHASE= +KEEP_WORK=0 while [ $# -gt 0 ] do ARG=$1 case "$ARG" in - --loc|-loc) + -k|--keep-work) + # Keep work files + shift + KEEP_WORK=1 + ;; + -l|--loc|-loc) # Location space separated shift LOC="$1" @@ -102,13 +137,13 @@ do LOC=${ARG/-*loc=/} shift ;; - --phase) + -p|--phase) # Phase space separated shift PHASE="$1" shift ;; - --help) + -h|--help) # Help printUsage exit 0 @@ -123,9 +158,10 @@ done if [ -z "$PHASE" ]; then PHASE="all" fi - -#echo "Location is '$LOC'" -#echo "Phase is '$PHASE'" +COMMON_ARGS= +if [ $KEEP_WORK = 0 ]; then + COMMON_ARGS="--keep-work" +fi log() { echo " $(date $DATE)" "$@" ; } @@ -138,14 +174,14 @@ TIME1="$(date +%s)" case "$PHASE" in all) - exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@" - exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC" + exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" + exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC" ;; data) - exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@" + exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@" ;; index) - exec 
"$JENA_HOME/bin/tdbloader2index" --loc "$LOC" + exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC" ;; *) echo "Unrecognized phase $PHASE" 1>&2 http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2data ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data index 90200e4..5aceb27 100755 --- a/apache-jena/bin/tdbloader2data +++ b/apache-jena/bin/tdbloader2data @@ -18,6 +18,48 @@ # The environment for this sub-script is setup by "tdbloader2" +function printUsage() { + cat << EOF +tdbloader2data - TDB Bulk Loader - Data Phase + +Usage tdbloader2data --loc <Directory> [Options] <Data> ... + +Bulk Loader for TDB which generates the Node Table. This command +relies on POSIX utilities so will only work on POSIX operating +systems. + +This command can only be used to create new database. If you wish to +bulk load to an existing database please use tdbloader instead. + +Required options are as follows: + + -l <DatabaseDirectory> + --loc <DatabaseDirectory> + Sets the location in which the database should be created. + + This location must be a directory and must be empty, if a + non-existent path is specified it will be created as a new + directory. + + <Data> + Specifies the path to one/more data files to load + +Common additional options are as follows: + + -h + --help + Prints this help summary and exits + +Advanced additional options are as follows: + + -k + --keep-work + Keeps the temporary work files around after they are no longer + needed. May be useful for debugging. + +EOF +} + # Exit on error. 
set -e @@ -29,24 +71,24 @@ log() { echo " $(date $DATE)" "$@" ; } #DATE="+%Y-%m-%dT%H:%M:%S%:z" DATE="+%H:%M:%S" -## JVM Arguments -JVM_ARGS=${JVM_ARGS:--Xmx1200M} - -# Classpath set in "tdbloader2" -if [ -z "$JENA_CP" ] -then - echo "Classpath not provided : set JENA_CP" 1>&2 - exit 1 -fi - -USAGE="Usage: tdbloader2data --loc location datafile ..." PKG=org.apache.jena.tdb.store.bulkloader2 +# Process Arguments +LOC= +KEEP_WORK=0 + while [ $# -gt 0 ] do ARG=$1 case "$ARG" in - --loc|-loc) + -k|--keep-work) + # Keep work files + # This option is actually not used by this script but may be passed in + # by the parent tdbloader2 script + shift + KEEP_WORK=1 + ;; + -l|--loc|-loc) # Location space separated shift LOC="$1" @@ -57,8 +99,8 @@ do LOC=${ARG/-*loc=/} shift ;; - --help) - echo $USAGE + -h|--help) + printUsage exit 0 ;; *) @@ -91,8 +133,16 @@ fi if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi FILES="$@" -## Stdin? -KEEPWORKFILES="${KEEPWORKFILES:-}" + +## JVM Arguments +JVM_ARGS=${JVM_ARGS:--Xmx1200M} + +# Classpath set in "tdbloader2" +if [ -z "$JENA_CP" ] +then + echo "Classpath not provided : set JENA_CP" 1>&2 + exit 1 +fi # ---- Data loading phase log "Data Load Phase" http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2index ---------------------------------------------------------------------- diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index index 5624854..2730af1 100755 --- a/apache-jena/bin/tdbloader2index +++ b/apache-jena/bin/tdbloader2index @@ -18,6 +18,45 @@ # The environment for this sub-script is setup by "tdbloader2" +function printUsage() { + cat << EOF +tdbloader2index - TDB Bulk Loader - Index Phase + +Usage: tdbloader2index --loc <Directory> [Options] + +Bulk Loader for TDB which generates the Index files based upon the +temporary data files generated by tdbloader2data. 
This command relies +on POSIX utilities so will only work on POSIX operating systems. + +This command can only be used to create new database. If you wish to +bulk load to an existing database please use tdbloader instead. + +Required options are as follows: + + -l <DatabaseDirectory> + --loc <DatabaseDirectory> + Sets the location in which the database should be created. + + This location must be a directory and must be empty, if a + non-existent path is specified it will be created as a new + directory. + +Common additional options are as follows: + + -h + --help + Prints this help summary and exits + +Advanced additional options are as follows: + + -k + --keep-work + Keeps the temporary work files around after they are no longer + needed. May be useful for debugging. + +EOF +} + # Exit on error. set -e @@ -30,25 +69,22 @@ TMP=$$ #DATE="+%Y-%m-%dT%H:%M:%S%:z" DATE="+%H:%M:%S" -##--parallel is not always available. -SORT_ARGS="${SORT_ARGS:---buffer-size=50%}" -JVM_ARGS=${JVM_ARGS:--Xmx1200M} - -# Classpath set in "tdbloader2" -if [ -z "$JENA_CP" ] -then - echo "Classpath not provided : set JENA_CP" 1>&2 - exit 1 -fi - -USAGE="Usage: tdbloader2index --loc location" PKG=org.apache.jena.tdb.store.bulkloader2 +# Process Arguments +LOC= +KEEP_WORK=0 + while [ $# -gt 0 ] do ARG=$1 case "$ARG" in - --loc|-loc) + -k|--keep-work) + # Keep work files + shift + KEEP_WORK=1 + ;; + -l|--loc|-loc) # Location space separated shift LOC="$1" @@ -59,8 +95,8 @@ do LOC=${ARG/-*loc=/} shift ;; - --help) - echo $USAGE + -h|--help) + printUsage exit 0 ;; *) @@ -75,8 +111,6 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi -KEEPWORKFILES="${KEEPWORKFILES:-}" - DATA_TRIPLES="$LOC/data-triples.tmp" DATA_QUADS="$LOC/data-quads.tmp" @@ -89,14 +123,29 @@ if [ ! 
-e "$DATA_QUADS" ]; then exit 1 fi +##--parallel is not always available. +SORT_ARGS="${SORT_ARGS:---buffer-size=50%}" +JVM_ARGS=${JVM_ARGS:--Xmx1200M} + +# Classpath set in "tdbloader2" +if [ -z "$JENA_CP" ] +then + echo "Classpath not provided : set JENA_CP" 1>&2 + exit 1 +fi + # ---- Index intermediates ## All files are writtern S P O / G S P O columns per row but in different sort orders. log "Index Building Phase" +# Check whether Pipe Viewer is available +# Needs to temporarily disable exit on error +set +e which pv >/dev/null 2>&1 HAS_PV=$? +set -e -process_rows() +generate_index() { local KEYS="$1" local DATA="$2" @@ -109,6 +158,8 @@ process_rows() fi log "Creating Index $IDX" + + # Sort the input data log " Sort $IDX" if [ $HAS_PV = 0 ]; then # Use pv (pipe viewer) to monitor sort progress @@ -120,14 +171,16 @@ process_rows() sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK fi log " Sort $IDX Completed" + + # Build into an index log " Build $IDX" rm -f "$LOC/$IDX.dat" rm -f "$LOC/$IDX.idn" java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK" log " Build $IDX Completed" - # Remove intermediary file. 
- if [ "$KEEPWORKFILES" != "yes" ] - then + + # Remove work file unless keeping + if [ $KEEP_WORK = 1 ]; then rm "$WORK" fi } @@ -137,28 +190,27 @@ K2="-k 2,2" K3="-k 3,3" K4="-k 4,4" -process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO +generate_index "$K1 $K2 $K3" "$DATA_TRIPLES" SPO -process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS +generate_index "$K2 $K3 $K1" "$DATA_TRIPLES" POS -process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP +generate_index "$K3 $K1 $K2" "$DATA_TRIPLES" OSP -process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO +generate_index "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO -process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS +generate_index "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS -process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP +generate_index "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP -process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG +generate_index "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG -process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG +generate_index "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG -process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG +generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG log "Index Building Phase Completed" # ---- Clean up. -if [ "$KEEPWORKFILES" != "yes" ] -then +if [ $KEEP_WORK = 1 ]; then rm -f "$DATA_TRIPLES" "$DATA_QUADS" fi
