Optimize creation of whitelist files; about 50x faster, I'd say. It now gets the commit IDs and parses them for files, rather than following every file. Output is similar: imperfect because of how git infers moves, with slightly different mistakes but nothing egregious, and if anything this is slightly better.
Project: http://git-wip-us.apache.org/repos/asf/brooklyn-client/repo Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-client/commit/7b346367 Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-client/tree/7b346367 Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-client/diff/7b346367 Branch: refs/heads/master Commit: 7b3463671f58d349c024e64a2c8d6021ab6ea5e2 Parents: 4904ee0 Author: Alex Heneveld <[email protected]> Authored: Wed Dec 16 12:28:47 2015 +0000 Committer: Alex Heneveld <[email protected]> Committed: Wed Dec 16 14:31:22 2015 +0000 ---------------------------------------------------------------------- 3-create-full-whitelists.sh | 8 ++--- 4-make-new-repos.sh | 2 +- make-whitelist.sh | 67 ++++++++++++++++++++++++++-------------- uber-repo-whitelist.txt | 2 ++ 4 files changed, 50 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/3-create-full-whitelists.sh ---------------------------------------------------------------------- diff --git a/3-create-full-whitelists.sh b/3-create-full-whitelists.sh index 4b3bf50..337e242 100755 --- a/3-create-full-whitelists.sh +++ b/3-create-full-whitelists.sh @@ -5,12 +5,12 @@ set -e . 
env.sh -for x in $PROJS ; do - ./make-whitelist.sh incubator-brooklyn/ "brooklyn-$x $(cat common-whitelist.txt) $(cat $x-whitelist.txt)" $x-whitelist.full.gen.txt +for x in $PROJS uber-repo ; do + echo brooklyn-$x | cat - $x-whitelist.txt common-whitelist.txt > TMP-whitelist-$x.gen.txt + ./make-whitelist.sh incubator-brooklyn/ TMP-whitelist-$x.gen.txt $x-whitelist.full.gen.txt + rm TMP-whitelist-$x.gen.txt done -./make-whitelist.sh incubator-brooklyn/ "brooklyn $(cat common-whitelist.txt) README.md" brooklyn-uber-repo-whitelist.full.gen.txt - # finally anything which isn't in any full whitelist, put into unclaimed-whitelist.gen.txt pushd incubator-brooklyn http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/4-make-new-repos.sh ---------------------------------------------------------------------- diff --git a/4-make-new-repos.sh b/4-make-new-repos.sh index ccfca3e..267427f 100755 --- a/4-make-new-repos.sh +++ b/4-make-new-repos.sh @@ -90,5 +90,5 @@ for x in $PROJS ; do do_repo_w_whitelist brooklyn-$x $x-whitelist.full.gen.txt done -do_repo_w_whitelist brooklyn brooklyn-uber-repo-whitelist.gen.txt +do_repo_w_whitelist brooklyn uber-repo-whitelist.full.gen.txt http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/make-whitelist.sh ---------------------------------------------------------------------- diff --git a/make-whitelist.sh b/make-whitelist.sh index b03adac..84e6ed8 100755 --- a/make-whitelist.sh +++ b/make-whitelist.sh @@ -1,12 +1,10 @@ # inputs -# TODO take inputs, including OUTPUT_FILENAME - -if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR DIRS_TO_FOLLOW OUTPUT_FILENAME" ; exit 1 ; fi +if [ -z "$3" ]; then echo "Usage: make-whitelist.sh REPO_DIR PATH_PREFIX_FILE OUTPUT_FILENAME" ; exit 1 ; fi export REPO=$1 -export DIRS=$2 +export PREFIX_FILE=$2 export OUTPUT_FILENAME=$3 # output @@ -18,39 +16,60 @@ export OUTPUT=${ORIG_DIR}/${OUTPUT_FILENAME} # working # file/paths we have left to look at, built up for the next 
cycle on one cycle, -# starting with the DIRS +# starting with the PREFIX_FILE export TODO_REMAINING=${ORIG_DIR}/TODO-remaining # file/paths encountered on one cycle export TODO_HERE=${ORIG_DIR}/TODO-here - -rm $OUTPUT -for x in $DIRS ; do echo $x >> $OUTPUT ; done +sort -u -o $OUTPUT $PREFIX_FILE cp $OUTPUT $TODO_REMAINING +SAMPLE_PATHS=`head -4 $PREFIX_FILE`" and "`( gshuf $PREFIX_FILE 2> /dev/null || echo "maybe others" ) | head -4` pushd $REPO > /dev/null -echo scanning $REPO for all files +echo scanning $REPO for relevant files in history for $OUTPUT_FILENAME starting with `cat $TODO_REMAINING | wc -l` paths including $SAMPLE_PATHS while [ -s $TODO_REMAINING ] ; do - echo current scan has `wc $TODO_REMAINING | awk '{print $1}'` paths including `head -1 $TODO_REMAINING` - rm -f $TODO_HERE - touch $TODO_HERE + echo current pass has `cat $TODO_REMAINING | wc -l` paths including `( gshuf $TODO_REMAINING 2> /dev/null || cat $TODO_REMAINING ) | head -4` - for x in `cat $TODO_REMAINING` ; do - # NB: this doesn't work with spsces in the filename; we just have a few though and they're manually added - git log --format='%H' --name-status --follow -- $x | awk '{if ($3) print $3; if ($2) print $2;}' | sort -u | cat $TODO_HERE - > ${TODO_HERE}2 - mv ${TODO_HERE}2 ${TODO_HERE} - done - cat ${TODO_HERE} | sort -u > ${TODO_HERE}2 - mv ${TODO_HERE}2 ${TODO_HERE} +# echo PICKED UP for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log +# cat $TODO_REMAINING >> ${ORIG_DIR}/log + + rm -f $TODO_HERE - diff --new-line-format="" --unchanged-line-format="" ${TODO_HERE} $OUTPUT > ${TODO_HERE}_new - cat $OUTPUT ${TODO_HERE}_new | sort -u > ${OUTPUT}2 - mv ${OUTPUT}2 ${OUTPUT} - mv ${TODO_HERE}_new $TODO_REMAINING + echo collecting relevant commits... 
+ cat $TODO_REMAINING | xargs -L -n100 git log --format='%H' --diff-filter=A -- >> ${TODO_HERE}_ids + + sort -u ${TODO_HERE}_ids -o ${TODO_HERE}_ids +# echo IDS | cat - ${TODO_HERE}_ids >> ${ORIG_DIR}/log + + rm -f ${TODO_HERE}_allpaths + echo gathering files from `cat ${TODO_HERE}_ids | wc -l` commits... + # 50% match is a bit low but better safe than sorry for moves; for copies we go higher + cat ${TODO_HERE}_ids | xargs -L -n100 git show -l99999 -M50 -C90 --name-status --format="ID: %H" | grep -v ^ID: | awk -F $'\t' '{ if ($3) print $3"\t"$2; else print $2; }' | sort -u >> ${TODO_HERE}_allpaths + + echo comparing `cat ${TODO_HERE}_allpaths | wc -l` candidate files against paths... + cat $TODO_REMAINING | awk '{print $0"\tMATCH_THIS" }' | cat - ${TODO_HERE}_allpaths | sort -u > ${TODO_HERE}_merged + cat ${TODO_HERE}_merged | awk -F $'\t' '{ + if ($2=="MATCH_THIS") { + if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } + if (last1==patt) { print last1; if (last2) print last2; } + last1=""; + } else { + last1=$1; last2=$2; + if (patt && substr(last1,0,length(patt))==patt) { print last1; if (last2) print last2; } + } }' | sort -u -o ${TODO_HERE} + # logging for the above, if needed +# echo MATCHING for $OUTPUT_FILENAME : >> ${ORIG_DIR}/log +# cat ${TODO_HERE}_merged | awk -F $'\t' '{ if ($2=="MATCH_THIS") { if (!patt || substr($1,0,length(patt))!=patt) { patt=$1; } +# if (last1==patt) { print "MATCH LAST on "patt" ADDS "last1" "last2; } last1=""; } +# else { last1=$1; last2=$2; if (patt && substr(last1,0,length(patt))==patt) { print "MATCH NEXT on "patt" ADDS "last1" "last2; } } }' >> ${ORIG_DIR}/log + + comm -23 ${TODO_HERE} $OUTPUT > ${TODO_REMAINING} + cat $OUTPUT ${TODO_HERE} | sort -u -o ${OUTPUT} + rm ${TODO_HERE}_* done @@ -59,5 +78,5 @@ popd > /dev/null rm ${TODO_REMAINING} rm ${TODO_HERE} -echo completed scan of $REPO, history has `wc ${OUTPUT} | awk '{print $1}'` files +echo completed scan of $REPO in $OUTPUT_FILENAME, relevant history has `wc 
${OUTPUT} | awk '{print $1}'` files http://git-wip-us.apache.org/repos/asf/brooklyn-client/blob/7b346367/uber-repo-whitelist.txt ---------------------------------------------------------------------- diff --git a/uber-repo-whitelist.txt b/uber-repo-whitelist.txt new file mode 100644 index 0000000..f2a1baf --- /dev/null +++ b/uber-repo-whitelist.txt @@ -0,0 +1,2 @@ +README.md +brooklyn/
