[GitHub] chrismattmann closed pull request #155: Working dratstat

GitBox Tue, 14 Aug 2018 06:51:55 -0700

chrismattmann closed pull request #155: Working dratstat
URL: https://github.com/apache/drat/pull/155


This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

Because this pull request comes from a fork, GitHub does not display its
diff once it has been merged, so the diff is reproduced below:

diff --git a/distribution/src/main/resources/bin/drat 
b/distribution/src/main/resources/bin/drat
index 8081a3e0..f20d116b 100755
--- a/distribution/src/main/resources/bin/drat
+++ b/distribution/src/main/resources/bin/drat
@@ -243,28 +243,28 @@ function reset {
 }
 
 # Clears the Solr Statistics repo for aggregating crawl stats. Expects no 
arguments.
-function delstats{
-    check_services_running "running"
-    check_num_args "delstats" $# 0
-    echo "This will remove any previous or current Solr DRAT stats 
information."
-    read -p "Do you wish to continue? [yN] " yn
-        case $yn in
-            [Yy]*)
-                echo "rm -rf $DRAT_HOME/solr/statistics/data"
-                rm -rf $DRAT_HOME/solr/statistics/data
-                echo "Please restart OODT with '\$DRAT_HOME/bin/oodt start' if 
you wish to run another crawl."
-            ;;
-            [Nn]*)
-                echo "Reset cancelled. Exiting..."
-                exit 0
-            ;;
-            *)
-                echo "Aborting..."
-                exit 1
-            ;;
-        esac
-
-}
+#function delstats{
+#    check_services_running "running"
+#    check_num_args "delstats" $# 0
+#    echo "This will remove any previous or current Solr DRAT stats 
information."
+#    read -p "Do you wish to continue? [yN] " yn
+#        case $yn in
+#            [Yy]*)
+#                echo "rm -rf $DRAT_HOME/solr/statistics/data"
+#                rm -rf $DRAT_HOME/solr/statistics/data
+#                echo "Please restart OODT with '\$DRAT_HOME/bin/oodt start' 
if you wish to run another crawl."
+#            ;;
+#            [Nn]*)
+#                echo "Reset cancelled. Exiting..."
+#                exit 0
+#            ;;
+#            *)
+#                echo "Aborting..."
+#                exit 1
+#            ;;
+#        esac
+#
+#}
 
 # Start parsing the arguments.
 case $1 in
diff --git a/distribution/src/main/resources/bin/dratstats.py 
b/distribution/src/main/resources/bin/dratstats.py
index cae5c897..4648c682 100644
--- a/distribution/src/main/resources/bin/dratstats.py
+++ b/distribution/src/main/resources/bin/dratstats.py
@@ -147,7 +147,7 @@ def drat_process(command, repository):
                elif command == "index":
                        retcode = subprocess.call("${DRAT_HOME}/bin/drat" + " " 
+ command + " " + repository, shell=True)
                elif command == "map" or command == "reduce":
-                       retcode = subprocess.call("nohup ${DRAT_HOME}/bin/drat" 
+ " " + command + " &", shell=True)
+                       retcode = subprocess.call("${DRAT_HOME}/bin/drat" + " " 
+ command + " &", shell=True)
                if retcode < 0:
                        print >>sys.stderr, "DRAT " + command + " process was 
terminated by signal", -retcode, ". Aborting..."
                        retval = False
@@ -180,12 +180,19 @@ def drat_reset():
 def job_in_queue(job_name):
        status = "PGE EXEC"
        server = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False)
-       response = server.workflowmgr.getWorkflowInstancesByStatus(status)
+       
+
+       for x in range(0,6):
+               response = 
server.workflowmgr.getWorkflowInstancesByStatus(status)
 
-       for i in range(0, len(response)):
+               for i in range(0, len(response)):
                #print response[i]["sharedContext"]["TaskId"]
-               if response[i]["sharedContext"]["TaskId"][0] == job_name:
-                       return True
+                       if response[i]["sharedContext"]["TaskId"][0] == 
job_name:
+                               return True
+
+               time.sleep(10)          
+
+       
 
        return False
 
@@ -268,7 +275,7 @@ def run(repos_list, output_dir):
                        print("\nOODT Started: OK\n")
 
                        print('Adding repository: '+str(rep)+' to Solr')
-                       index_solr(json.dumps([rep]))
+                       # index_solr(json.dumps([rep]))
 
 
                        print("\nRunning DRAT on " + rep["repo"] + " ...\n")
@@ -281,6 +288,12 @@ def run(repos_list, output_dir):
                        retval = drat_process("crawl", rep["repo"])
                        stats['crawl_end'] = current_datetime()
 
+                       rep["id"] = "id:"+os.path.normpath(rep["repo"])
+                       outputfile = os.getenv("DRAT_HOME") + "/data/repo"
+                       file = open(outputfile,"w")
+                       file.write(json.dumps(rep))
+                       file.close()
+
                        if retval:
                                time.sleep(5)
                                stats['index_start'] = current_datetime()
@@ -296,177 +309,14 @@ def run(repos_list, output_dir):
                                        wait_for_job("urn:drat:RatCodeAudit")
                                        stats['map_end'] = current_datetime()
 
-                                       if retval:
-                                               time.sleep(5)
-                                               stats['reduce_start'] = 
current_datetime()
-                                               
-                                               # Extract data from 
RatAggregate File
-                                               totalNotes = 0
-                                               totalBinaries = 0
-                                               totalArchives = 0
-                                               totalStandards = 0
-                                               totalApache = 0
-                                               totalGenerated = 0
-                                               totalUnknown = 0
-
-                                               rat_dir = 
os.getenv("DRAT_HOME") + "/data/archive/rat"
-
-                                               # Iterate over all RAT log 
files 
-                                               for root, dirs, files in 
os.walk(rat_dir):
-                                                       for filename in files:
-                                                               if 
filename.endswith(".log"):
-                                                                       (notes, 
binaries, archives,standards,apachelicensed,generated,unknown) = 
parseFile(os.path.join(root, filename))
-                                                                       
totalNotes = totalNotes + notes
-                                                                       
totalBinaries = totalBinaries + binaries
-                                                                       
totalArchives = totalArchives + archives
-                                                                       
totalStandards = totalStandards + standards
-                                                                       
totalApache = totalApache + apachelicensed
-                                                                       
totalGenerated = totalGenerated + generated
-                                                                       
totalUnknown = totalUnknown + unknown
-
-                                               stats["license_Notes"] = 
totalNotes
-                                               stats["license_Binaries"] = 
totalBinaries
-                                               stats["license_Archives"] = 
totalArchives
-                                               stats["license_Standards"] = 
totalStandards
-                                               stats["license_Apache"] = 
totalApache
-                                               stats["license_Generated"] = 
totalGenerated
-                                               stats["license_Unknown"] = 
totalUnknown
-
-                                               stats['reduce_end'] = 
current_datetime()
-                                               print "\nDRAT Scan Completed: 
OK\n"
-
-                       time.sleep(5)
-
-                       if retval:
-                               # Copy Data with datetime variables above, 
extract output from RatAggregate file, extract data from Solr Core
-                               printnow ("\nCopying data to Solr and Output 
Directory...\n")
-
-                               # Extract data from Solr
-                               neg_mimetype = ["image", "application", "text", 
"video", "audio", "message", "multipart"]
-                               connection = 
urllib2.urlopen(os.getenv("SOLR_URL") + 
"/drat/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true")
-                               
-                               response = eval(connection.read())
-                               mime_count = 
response["facet_counts"]["facet_fields"]["mimetype"]
-
-                               for i in range(0, len(mime_count), 2):
-                                       if mime_count[i].split("/")[0] not in 
neg_mimetype:
-                                               stats["mime_" + mime_count[i]] 
= mime_count[i + 1]
-
-
-                               # Count the number of files
-                               stats["files"] = count_num_files(rep["repo"], 
".git")
-
-                               # Write data into Solr
-                               stats["type"] = 'software'
-                               stats_data = []
-                               stats_data.append(stats)
-                               json_data = json.dumps(stats_data)
-                               index_solr(json_data)
-
-                               # Parse RAT logs
-                               rat_logs_dir = os.getenv("DRAT_HOME") + 
"/data/archive/rat/*/*.log"
-                               rat_license = {}
-                               rat_header = {}
-                               for filename in glob.glob(rat_logs_dir):
-                                       #print('=' * 20)
-                                       l = 0
-                                       h = 0
-                                       cur_file = ''
-                                       cur_header = ''
-                                       cur_section = ''
-                                       parsedHeaders = False
-                                       parsedLicenses = False
-                                       
-                                       with open(filename, 'rb') as f:
-                                               printnow('Parsing rat log: 
['+filename+']')
-                                               for line in f:
-                                                       if 
'*****************************************************' in line:
-                                                               l = 0
-                                                               h = 0
-                                                               if cur_section 
== 'licenses':
-                                                                       
parsedLicenses = True
-                                                               if cur_section 
== 'headers':
-                                                                       
parsedHeaders = True
-                                                                       
-                                                               cur_file = ''
-                                                               cur_header = ''
-                                                               cur_section = ''
-                                                       if line.startswith('  
Files with Apache') and not parsedLicenses:
-                                                               cur_section = 
'licenses'
-                                                       if line.startswith(' 
Printing headers for ') and not parsedHeaders:
-                                                               cur_section = 
'headers'
-                                                       if cur_section == 
'licenses':
-                                                               l += 1
-                                                               if l > 4:
-                                                                       line = 
line.strip()
-                                                                       if line:
-                                                                               
print("File: %s with License Line: %s" % (filename, line))
-                                                                               
li = parse_license(line)
-                                                                               
rat_license[li[0]] = li[1]
-                                                                               
print(li)
-                                                       if cur_section == 
'headers':
-                                                               if 
'=====================================================' in line or '== File:' 
in line:
-                                                                       h += 1
-                                                               if h == 2:
-                                                                       
cur_file = line.split("/")[-1].strip()
-                                                               if h == 3:
-                                                                       
cur_header += line
-                                                               if h == 4:
-                                                                       
rat_header[cur_file] = cur_header.split("\n", 1)[1]
-                                                                       
cur_file = ''
-                                                                       
cur_header = ''
-                                                                       h = 1
-                                       if h == 3:
-                                               rat_header[cur_file] = 
cur_header.split("\n", 1)[1]
-                                       parsedHeaders = True
-                                       parsedLicenses = True
-
-                               # Index RAT logs into Solr
-                               connection = 
urllib2.urlopen(os.getenv("SOLR_URL") +
-                                                                               
         
"/drat/select?q=*%3A*&fl=filename%2Cfilelocation%2Cmimetype&wt=python&rows="
-                                                                               
         + str(stats["files"]) +"&indent=true")
-                               response = eval(connection.read())
-                               docs = response['response']['docs']
-                               file_data = []
-                               batch = 100
-                               dc = 0
-                               
-                               for doc in docs:
-                                       fdata = {}
-                                       fdata['id'] = 
os.path.join(doc['filelocation'][0], doc['filename'][0])
-                                       m = md5.new()
-                                       m.update(fdata['id'])
-                                       hashId = m.hexdigest()
-                                       fileId = hashId+"-"+doc['filename'][0]
-
-                                       if fileId not in rat_license:
-                                               print "File: 
"+str(fdata['id'])+": ID: ["+fileId+"] not present in parsed licenses => Likely 
file copying issue. Skipping."
-                                               continue #handle issue with 
DRAT #93
-                                       
-                                       fdata["type"] = 'file'
-                                       fdata['parent'] = rep["repo"]
-                                       fdata['mimetype'] = doc['mimetype'][0]
-                                       fdata['license'] = rat_license[fileId]
-                                       if fileId in rat_header:
-                                               fdata['header'] = 
rat_header[fileId]
-                                       file_data.append(fdata)
-                                       dc += 1
-                                       if dc % batch == 0:
-                                               json_data = 
json.dumps(file_data)
-                                               index_solr(json_data)
-                                               file_data = []
-                               if dc % batch != 0:
-                                       json_data = json.dumps(file_data)
-                                       index_solr(json_data)
-
-                               # Copying data to Output Directory
-                               repos_out = output_dir + "/" + 
normalize_path(rep["repo"])
-                               shutil.copytree(os.getenv("DRAT_HOME") + 
"/data", repos_out)
-                               print("\nData copied to Solr and Output 
Directory: OK\n")
-
-                       else:
-                               print ("\nDRAT Scan Completed: Resulted in 
Error\n")
-
+                                       if(retval):
+                                               
wait_for_job("urn:drat:RatAggregator")
+                                               time.sleep(10)
+                                               retval = 
drat_process("reduce",None)
+                                               time.sleep(10)
+                                               print ("\nwaiting for Rat 
Aggregator...\n")
+                                               
wait_for_job("urn:drat:RatAggregator")
+                       
 
                        time.sleep(5)
                        print ("\nStopping OODT...\n")
diff --git a/nohup.out b/nohup.out
new file mode 100644
index 00000000..bb32b5eb
--- /dev/null
+++ b/nohup.out
@@ -0,0 +1,3 @@
+Started dynamic workflow with id '6453cca6-9f30-11e8-b99d-f5018c8e9233'
+
+Navigate to http://localhost:8080/opsui/ to view the OODT browser and 
http://localhost:8080/solr to view the Solr catalog.
diff --git a/webapps/proteus-new/src/main/webapp/resources/public/logo.png 
b/webapps/proteus-new/src/main/webapp/resources/public/logo.png
index 445d7d7e..7800df90 100644
Binary files a/webapps/proteus-new/src/main/webapp/resources/public/logo.png 
and b/webapps/proteus-new/src/main/webapp/resources/public/logo.png differ
diff --git a/webapps/proteus-new/src/main/webapp/resources/src/App.vue 
b/webapps/proteus-new/src/main/webapp/resources/src/App.vue
index d071508f..b952de13 100644
--- a/webapps/proteus-new/src/main/webapp/resources/src/App.vue
+++ b/webapps/proteus-new/src/main/webapp/resources/src/App.vue
@@ -149,7 +149,7 @@ the License.
     
     <v-spacer/>
     <v-card id="footercard">
-      <img height="60px"  src="logo.png">
+      <img height="80px"  src="logo.png">
     </v-card>
        
     </v-app>
diff --git 
a/webapps/proteus-new/src/main/webapp/resources/src/components/auditsummarycomp.vue
 
b/webapps/proteus-new/src/main/webapp/resources/src/components/auditsummarycomp.vue
index 345dfdcf..eec59a9b 100644
--- 
a/webapps/proteus-new/src/main/webapp/resources/src/components/auditsummarycomp.vue
+++ 
b/webapps/proteus-new/src/main/webapp/resources/src/components/auditsummarycomp.vue
@@ -59,7 +59,7 @@ the License.
               
 
               var docs = response.data.response.docs;
-            // var docs = 
[{"license_Generated":0,"license_Archives":1,"license_Standards":108,"license_Unknown":12,"license_Binaries":139,"license_Apache":96,"id":"/media/Workings/GSOC-2018/drat/deploy/data/clones/android-UniversalMusicPlayer","license_Notes":1}];
+        
               for(var i = 0; i < docs.length; i++) {
                   var doc = docs[i];
                   var repo = doc.id.split("/");
@@ -241,7 +241,7 @@ the License.
                         var offset = -gapBetweenGroups/2;
                         var horz = spaceForLabels + chartWidth + 40 - 
legendRectSize;
                         var vert = i * height - offset;
-                        return 'translate(' + horz + ',' + vert + ')';
+                        return 'translate(' + 0+ ',' + vert + ')';
                     });
 
                 legend.append('rect')
diff --git a/webapps/proteus-new/src/main/webapp/resources/src/logo.png 
b/webapps/proteus-new/src/main/webapp/resources/src/logo.png
index 445d7d7e..7800df90 100644
Binary files a/webapps/proteus-new/src/main/webapp/resources/src/logo.png and 
b/webapps/proteus-new/src/main/webapp/resources/src/logo.png differ


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

[GitHub] chrismattmann closed pull request #155: Working dratstat

Reply via email to