[09/50] [abbrv] brooklyn-client git commit: optimize creation of whitelist files

heneveld Mon, 01 Feb 2016 10:03:26 -0800

optimize creation of whitelist files

About 50x faster, I'd say. Now it gets the commit IDs and parses them for files,
rather than following every file. Output is similar:
imperfect because of how git infers moves, with slightly different mistakes but
nothing egregious,
and if anything this is slightly better.



Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo
Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/7b346367
Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/7b346367
Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/7b346367

Branch: refs/heads/master
Commit: 7b3463671f58d349c024e64a2c8d6021ab6ea5e2
Parents: 4904ee0
Author: Alex Heneveld <[email protected]>
Authored: Wed Dec 16 12:28:47 2015 +0000
Committer: Alex Heneveld <[email protected]>
Committed: Wed Dec 16 14:31:22 2015 +0000

----------------------------------------------------------------------
 3-create-full-whitelists.sh |  8 ++---
 4-make-new-repos.sh         |  2 +-
 make-whitelist.sh           | 67 ++++++++++++++++++++++++++--------------
 uber-repo-whitelist.txt     |  2 ++
 4 files changed, 50 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/3-create-full-whitelists.sh
----------------------------------------------------------------------
diff --git a/3-create-full-whitelists.sh b/3-create-full-whitelists.sh
index 4b3bf50..337e242 100755
--- a/3-create-full-whitelists.sh
+++ b/3-create-full-whitelists.sh
@@ -5,12 +5,12 @@ set -e
 
 . env.sh
 
-for x in $PROJS ; do
-  ./make-whitelist.sh incubator-brooklyn/ "brooklyn-$x $(cat 
common-whitelist.txt) $(cat $x-whitelist.txt)" $x-whitelist.full.gen.txt
+for x in $PROJS uber-repo ; do
+  echo brooklyn-$x | cat - $x-whitelist.txt common-whitelist.txt > 
TMP-whitelist-$x.gen.txt
+  ./make-whitelist.sh incubator-brooklyn/ TMP-whitelist-$x.gen.txt 
$x-whitelist.full.gen.txt
+  rm TMP-whitelist-$x.gen.txt
 done
 
-./make-whitelist.sh incubator-brooklyn/ "brooklyn $(cat common-whitelist.txt) 
README.md" brooklyn-uber-repo-whitelist.full.gen.txt
-
 # finally anything which isn't in any full whitelist, put into 
unclaimed-whitelist.gen.txt
 
 pushd incubator-brooklyn

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/4-make-new-repos.sh
----------------------------------------------------------------------
diff --git a/4-make-new-repos.sh b/4-make-new-repos.sh
index ccfca3e..267427f 100755
--- a/4-make-new-repos.sh
+++ b/4-make-new-repos.sh
@@ -90,5 +90,5 @@ for x in $PROJS ; do
   do_repo_w_whitelist brooklyn-$x $x-whitelist.full.gen.txt
 done
 
-do_repo_w_whitelist brooklyn brooklyn-uber-repo-whitelist.gen.txt
+do_repo_w_whitelist brooklyn uber-repo-whitelist.full.gen.txt
 

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/make-whitelist.sh
----------------------------------------------------------------------
diff --git a/make-whitelist.sh b/make-whitelist.sh
index b03adac..84e6ed8 100755
--- a/make-whitelist.sh
+++ b/make-whitelist.sh
@@ -1,12 +1,10 @@
 
 # inputs
 
-# TODO take inputs, including OUTPUT_FILENAME
-
-if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR DIRS_TO_FOLLOW 
OUTPUT_FILENAME" ; exit 1 ; fi
+if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR PATH_PREFIX_FILE 
OUTPUT_FILENAME" ; exit 1 ; fi
 
 export REPO=$1
-export DIRS=$2
+export PREFIX_FILE=$2
 export OUTPUT_FILENAME=$3
 
 # output
@@ -18,39 +16,60 @@ export OUTPUT=${ORIG_DIR}/${OUTPUT_FILENAME}
 # working
 
 # file/paths we have left to look at, built up for the next cycle on one cycle,
-# starting with the DIRS
+# starting with the PREFIX_FILE
 export TODO_REMAINING=${ORIG_DIR}/TODO-remaining
 
 # file/paths encountered on one cycle
 export TODO_HERE=${ORIG_DIR}/TODO-here
 
-
-rm $OUTPUT
-for x in $DIRS ; do echo $x >> $OUTPUT ; done
+sort -u -o $OUTPUT $PREFIX_FILE
 cp $OUTPUT $TODO_REMAINING
+SAMPLE_PATHS=`head -4 $PREFIX_FILE`" and "`( gshuf $PREFIX_FILE 2> /dev/null 
|| echo "maybe others" ) | head -4`
 
 pushd $REPO > /dev/null
 
-echo scanning $REPO for all files
+echo scanning $REPO for relevant files in history for $OUTPUT_FILENAME 
starting with `cat $TODO_REMAINING | wc -l` paths including $SAMPLE_PATHS
 
 while [ -s $TODO_REMAINING ] ; do
 
-  echo current scan has `wc $TODO_REMAINING | awk '{print $1}'` paths 
including `head -1 $TODO_REMAINING`
-  rm -f $TODO_HERE
-  touch $TODO_HERE
+  echo current pass has `cat $TODO_REMAINING | wc -l` paths including `( gshuf 
$TODO_REMAINING 2> /dev/null || cat $TODO_REMAINING ) | head -4`
 
-  for x in `cat $TODO_REMAINING` ; do
-    # NB: this doesn't work with spsces in the filename; we just have a few 
though and they're manually added
-    git log --format='%H' --name-status --follow -- $x | awk '{if ($3) print 
$3; if ($2) print $2;}' | sort -u | cat $TODO_HERE - > ${TODO_HERE}2
-    mv ${TODO_HERE}2 ${TODO_HERE}
-  done
-  cat ${TODO_HERE} | sort -u > ${TODO_HERE}2
-  mv ${TODO_HERE}2 ${TODO_HERE}
+#  echo PICKED UP for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
+#  cat $TODO_REMAINING >> ${ORIG_DIR}/log
+
+  rm -f $TODO_HERE
 
-  diff --new-line-format="" --unchanged-line-format="" ${TODO_HERE} $OUTPUT > 
${TODO_HERE}_new
-  cat $OUTPUT ${TODO_HERE}_new | sort -u > ${OUTPUT}2
-  mv ${OUTPUT}2 ${OUTPUT}
-  mv ${TODO_HERE}_new $TODO_REMAINING
+  echo collecting relevant commits...
+  cat $TODO_REMAINING | xargs -L -n100 git log --format='%H' --diff-filter=A 
-- >> ${TODO_HERE}_ids
+
+  sort -u ${TODO_HERE}_ids -o ${TODO_HERE}_ids
+#  echo IDS | cat - ${TODO_HERE}_ids >> ${ORIG_DIR}/log
+
+  rm -f ${TODO_HERE}_allpaths
+  echo gathering files from `cat ${TODO_HERE}_ids | wc -l` commits...
+  # 50% match is a bit low but better safe than sorry for moves; for copies we 
go higher
+  cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 
--name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print 
$3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths
+
+  echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate files against 
paths...
+  cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - 
${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged
+  cat ${TODO_HERE}_merged | awk -F $'\t' '{ 
+    if ($2=="MATCH_THIS") { 
+      if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } 
+      if (last1==patt) { print last1; if (last2) print last2; } 
+      last1=""; 
+    } else { 
+      last1=$1; last2=$2; 
+      if (patt && substr(last1,0,length(patt))==patt) { print last1; if 
(last2) print last2; } 
+    } }' | sort -u -o ${TODO_HERE}
+   # logging for the above, if needed
+#  echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log
+#  cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt 
|| substr($1,0,length(patt))!=patt) { patt=$1; } 
+#      if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } 
last1=""; }
+#    else { last1=$1; last2=$2; if (patt && 
substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" 
"last2; } } }' >> ${ORIG_DIR}/log
+
+  comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING}
+  cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT}
+  rm ${TODO_HERE}_*
 
 done
 
@@ -59,5 +78,5 @@ popd > /dev/null
 rm ${TODO_REMAINING}
 rm ${TODO_HERE}
 
-echo completed scan of $REPO, history has `wc ${OUTPUT} | awk '{print $1}'` 
files
+echo completed scan of $REPO in $OUTPUT_FILENAME, relevant history has `wc 
${OUTPUT} | awk '{print $1}'` files
 

http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/uber-repo-whitelist.txt
----------------------------------------------------------------------
diff --git a/uber-repo-whitelist.txt b/uber-repo-whitelist.txt
new file mode 100644
index 0000000..f2a1baf
--- /dev/null
+++ b/uber-repo-whitelist.txt
@@ -0,0 +1,2 @@
+README.md
+brooklyn/

[09/50] [abbrv] brooklyn-client git commit: optimize creation of whitelist files

Reply via email to