This is an automated email from the ASF dual-hosted git repository.

magicaltrout pushed a commit to branch python3 in repository https://gitbox.apache.org/repos/asf/drat.git
commit 0b66c27510e4617b05d6028768c6acc8a2480226 Author: Tom Barber <t...@spicule.co.uk> AuthorDate: Tue Mar 5 23:23:42 2019 +0000 python 3 help --- .../src/main/resources/bin/dump_repo_details.py | 8 ++-- distribution/src/main/resources/bin/dratstats.py | 24 +++++------- .../bin/mime_partitioner/mime_rat_partitioner.py | 41 ++++++++++---------- .../resources/bin/rat_aggregator/rat_aggregator.py | 44 +++++++++------------- .../resources/config/PgeConfig_RatAggregator.xml | 2 +- 5 files changed, 54 insertions(+), 65 deletions(-) diff --git a/crawler/src/main/resources/bin/dump_repo_details.py b/crawler/src/main/resources/bin/dump_repo_details.py index ee68f4e..bd17d65 100755 --- a/crawler/src/main/resources/bin/dump_repo_details.py +++ b/crawler/src/main/resources/bin/dump_repo_details.py @@ -21,18 +21,18 @@ import json def main(argv=None): if len(argv) == 0: - print "No Repo details to dump" + print("No Repo details to dump") sys.exit() if os.getenv("DRAT_HOME")==None: - print "Please add DRAT_HOME environment variable and try again"; + print("Please add DRAT_HOME environment variable and try again"); sys.exit() default_repo_file_url = os.getenv("DRAT_HOME") + "/conf/repo.default.txt" with open(default_repo_file_url,'rb')as repoFile: data = '' for line in repoFile: - data+=line + data+=line.strip().decode('utf-8') rep = eval(data) reponame = os.path.basename(os.path.normpath(argv[0])) @@ -45,7 +45,7 @@ def main(argv=None): file.write(json.dumps(rep)) file.close() - print rep + print(rep) if __name__ == "__main__": main(sys.argv[1:]) diff --git a/distribution/src/main/resources/bin/dratstats.py b/distribution/src/main/resources/bin/dratstats.py index c4dd0c5..0ac89ab 100755 --- a/distribution/src/main/resources/bin/dratstats.py +++ b/distribution/src/main/resources/bin/dratstats.py @@ -26,27 +26,23 @@ import subprocess import time import shutil import datetime -import csv -import urllib2 +from urllib.request import urlopen, Request import json -import xmlrpclib 
-import getopt -import glob -import md5 +import xmlrpc # Check for environment variables def check_env_var(): if os.getenv("DRAT_HOME") == None: - print "Environment variable $DRAT_HOME is not set." + print("Environment variable $DRAT_HOME is not set.") sys.exit(1) if os.getenv("JAVA_HOME") == None: - print "Environment variable $JAVA_HOME is not set." + print("Environment variable $JAVA_HOME is not set.") sys.exit(1) if os.getenv("SOLR_DRAT_URL") == None: - print "Environment variable $SOLR_DRAT_URL is not set." + print("Environment variable $SOLR_DRAT_URL is not set.") sys.exit(1) if os.getenv("WORKFLOW_URL") == None: - print "Environment variable $WORKFLOW_URL is not set." + print("Environment variable $WORKFLOW_URL is not set.") sys.exit(1) @@ -83,7 +79,7 @@ def help(): # Printing out on Console def printnow(string): - print string + print(string) sys.stdout.flush() @@ -176,7 +172,7 @@ def drat_reset(): # Check if there are any pending PGE jobs in the queue def job_in_queue(job_name): status = "PGE EXEC" - server = xmlrpclib.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False) + server = xmlrpc.client.ServerProxy(os.getenv("WORKFLOW_URL"), verbose=False) for x in range(0,6): @@ -225,9 +221,9 @@ def parse_license(s): # Index into Solr def index_solr(json_data): printnow(json_data) - request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true") + request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true") request.add_header('Content-type', 'application/json') - urllib2.urlopen(request, json_data) + urlopen(request, json_data) # Run DRAT and collect statistics diff --git a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py index 87d52b6..c6cd1ff 100755 --- a/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py +++ b/pge/src/main/resources/bin/mime_partitioner/mime_rat_partitioner.py @@ -24,11 +24,12 @@ import sys 
import json -import os import getopt -import urllib2 -import xmlrpclib -urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1)) +import urllib +from urllib.request import urlopen, Request +import xmlrpc + +#urllib.request.build_opener(urllib.HTTPHandler(debuglevel=1)) solrPostfix = "/select/?q=mimetype:$type&version=2.2&start=0&rows=10&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename" solrPostfixByPage = "/select/?q=mimetype:$type&version=2.2&start=$i&rows=$num&indent=on&facet=on&facet.field=mimetype&wt=json&fl=filelocation,filename" @@ -41,25 +42,25 @@ def executeRatJobs(url, num, type, workflowUrl, taskIds): if not url.endswith("/"): url = url + "/" solrUrl = url+solrPostfix.replace("$type", type) - print "GET "+solrUrl + print("GET "+solrUrl) numFound = 0 - req = urllib2.Request(solrUrl) + req = Request(solrUrl) try: - f = urllib2.urlopen(req) + f = urlopen(req) jsonResp = json.loads(f.read()) numFound = int(jsonResp["response"]["numFound"]) - except urllib2.HTTPError, (err): - print "HTTP error(%s)" % (err) - print "Aborting RAT execution" + except urllib.error.HTTPError as err: + print("HTTP error(%s)" % (err)) + print("Aborting RAT execution") return - wm = xmlrpclib.Server(workflowUrl) + wm = xmlrpc.client.Server(workflowUrl) for i in range(0, numFound, num): ratSolrUrl = url + solrPostfixByPage.replace("$type", type).replace("$i", str(i)).replace("$num",str(num)) - req = urllib2.Request(ratSolrUrl) - f = urllib2.urlopen(req) + req = Request(ratSolrUrl) + f = urlopen(req) jsonResp = json.loads(f.read()) docs = jsonResp["response"]["docs"] metadata = {} @@ -75,13 +76,13 @@ def executeRatJobs(url, num, type, workflowUrl, taskIds): metadata["InputFiles"] = [] metadata["InputFiles"].append(fullpath) - print "Metadata is "+str(metadata) + print("Metadata is "+str(metadata)) wm.workflowmgr.executeDynamicWorkflow([taskIds], metadata) def get_mime_types(solrUrl): neg_mimetype = ["image", "application", "text", "video", "audio", "message", 
"multipart"] - connection = urllib2.urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true") + connection = urlopen(solrUrl + "/select?q=*%3A*&rows=0&facet=true&facet.field=mimetype&wt=python&indent=true") response = eval(connection.read()) mime_count = response["facet_counts"]["facet_fields"]["mimetype"] stats = {} @@ -101,11 +102,11 @@ def main(argv): try: opts, args = getopt.getopt(argv,"hu:c:w:t:",["solrUrl=", "numFilesPerJob=", "workflowUrl=", "ratTaskId="]) except getopt.GetoptError: - print usage + print(usage) sys.exit(2) for opt, arg in opts: if opt == '-h': - print usage + print(usage) sys.exit() elif opt in ("-u", "--solrUrl"): solrUrl = arg @@ -117,15 +118,15 @@ def main(argv): ratTaskId = arg if solrUrl == "" or numFilesPerJob == 0 or workflowUrl == "" or ratTaskId == "": - print usage + print(usage) sys.exit() - print "Configured SOLR url: ["+solrUrl+"]" + print("Configured SOLR url: ["+solrUrl+"]") mimeTypes = get_mime_types(solrUrl) for type in mimeTypes: - print "Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]" + print("Executing RAT for MIME: ["+type+"]: num files per job: ["+str(numFilesPerJob)+"]") executeRatJobs(solrUrl, numFilesPerJob, type, workflowUrl, ratTaskId) if __name__ == "__main__": diff --git a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py index 0b1b04d..e7969b8 100755 --- a/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py +++ b/pge/src/main/resources/bin/rat_aggregator/rat_aggregator.py @@ -25,31 +25,24 @@ import sys import os -import getopt -import subprocess -import time -import shutil -import datetime -import csv -import urllib2 + +from urllib.request import urlopen, Request import json -import xmlrpclib -import getopt import glob -import md5 +import hashlib import requests def parse_license(s): li_dict = {'N': 'Notes', 'B': 'Binaries', 'A': 'Archives', 'AL': 'Apache', 
'!?????': 'Unknown'} if s and not s.isspace(): - arr = s.split("/", 1) + arr = s.split(b"/", 1) li = arr[0].strip() if li in li_dict: li = li_dict[li] - if len(arr) > 1 and len(arr[1].split("/")) > 0: - return [arr[1].split("/")[-1], li] + if len(arr) > 1 and len(arr[1].split(b"/")) > 0: + return [arr[1].split(b"/")[-1], li] else: #print('split not correct during license parsing '+str(arr)) return ["/dev/null", li_dict['!?????']] @@ -98,9 +91,9 @@ def count_num_files(path, exclude): def index_solr(json_data): #print(json_data) - request = urllib2.Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true") + request = Request(os.getenv("SOLR_URL") + "/statistics/update/json?commit=true") request.add_header('Content-type', 'application/json') - urllib2.urlopen(request, json_data) + urlopen(request, json_data.encode('utf-8')) def main(argv=None): usage = 'rat_aggregator.py logfile1 logfile2 ... logfileN' @@ -110,13 +103,13 @@ def main(argv=None): with open(repo_file_url,'rb')as repoFile: data = '' for line in repoFile: - data+=line + data+=line.decode('utf-8') rep = eval(data) index_solr(json.dumps([rep])) if len(argv) == 0: - print usage + print(usage) sys.exit() totalNotes = 0 @@ -193,7 +186,7 @@ def main(argv=None): with open(filename, 'rb') as f: for line in f: - if '*****************************************************' in line: + if b'*****************************************************' in line: l = 0 h = 0 if cur_section == 'licenses': @@ -204,9 +197,9 @@ def main(argv=None): cur_file = '' cur_header = '' cur_section = '' - if line.startswith(' Files with Apache') and not parsedLicenses: + if line.startswith(b' Files with Apache') and not parsedLicenses: cur_section = 'licenses' - if line.startswith(' Printing headers for ') and not parsedHeaders: + if line.startswith(b' Printing headers for ') and not parsedHeaders: cur_section = 'headers' if cur_section == 'licenses': l += 1 @@ -218,12 +211,12 @@ def main(argv=None): rat_license[li[0]] = li[1] 
#print(li) if cur_section == 'headers': - if '=====================================================' in line or '== File:' in line: + if b'=====================================================' in line or b'== File:' in line: h += 1 if h == 2: - cur_file = line.split("/")[-1].strip() + cur_file = line.split(b"/")[-1].strip() if h == 3: - cur_header += line + cur_header += line.decode('utf-8') if h == 4: rat_header[cur_file] = cur_header.split("\n", 1)[1] cur_file = '' @@ -248,8 +241,7 @@ def main(argv=None): for doc in docs: fdata = {} fdata['id'] = os.path.join(doc['filelocation'][0], doc['filename'][0]) - m = md5.new() - m.update(fdata['id']) + m = hashlib.md5(fdata['id'].encode('utf-8')) hashId = m.hexdigest() fileId = hashId+"-"+doc['filename'][0] @@ -275,7 +267,7 @@ def main(argv=None): # Copying data to Output Directory print ("Notes,Binaries,Archives,Standards,Apache,Generated,Unknown") - print str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown) + print(str(totalNotes)+","+str(totalBinaries)+","+str(totalArchives)+","+str(totalStandards)+","+str(totalApache)+" ,"+str(totalGenerated)+","+str(totalUnknown)) #print("\nData copied to Solr and Output Directory: OK\n") diff --git a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml index d9cafec..73c3768 100644 --- a/pge/src/main/resources/config/PgeConfig_RatAggregator.xml +++ b/pge/src/main/resources/config/PgeConfig_RatAggregator.xml @@ -8,7 +8,7 @@ <cmd>echo "Creating working dirs"</cmd> <cmd>mkdir [JobInputDir] ; mkdir [JobOutputDir]; mkdir [JobLogDir]</cmd> <cmd>echo "Running RAT aggregator"</cmd> - <cmd>[RatAggregatorScript] `python -c "print ' '.join('[InputFiles]'.split(','))"` > [JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd> + <cmd>[RatAggregatorScript] `python -c "print(' '.join('[InputFiles]'.split(',')))"` > 
[JobOutputDir]/rat_aggregate_stats_[DateMilis].csv</cmd> </exe> <!-- Files to ingest -->