Author: robweir Date: Thu Mar 13 13:42:09 2014 New Revision: 1577158 URL: http://svn.apache.org/r1577158 Log: Script to download attachments from BZ
Added: openoffice/devtools/bz-tools/ openoffice/devtools/bz-tools/bz-attachment-extract.py Added: openoffice/devtools/bz-tools/bz-attachment-extract.py URL: http://svn.apache.org/viewvc/openoffice/devtools/bz-tools/bz-attachment-extract.py?rev=1577158&view=auto ============================================================================== --- openoffice/devtools/bz-tools/bz-attachment-extract.py (added) +++ openoffice/devtools/bz-tools/bz-attachment-extract.py Thu Mar 13 13:42:09 2014 @@ -0,0 +1,138 @@ +################################################################ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +################################################################ + + +# This script reads a text file containing AOO Bugzilla ID's and uses +# the Bugzilla REST API to download the XML data for each BZ issue, +# extract and decode and save any attachments. The XML for each issue +# is cached, so repeated runs can avoid redundant hits to the server. + +import urllib +import sys +import datetime +import time + +import base64 + +import xml.etree.ElementTree as ET + +from urllib import urlencode + +#Smarter caching would look at the modify date of the issue and invalidate the cache if the issue had changed... 
# Working directories, relative to the current directory.
CACHE_DIR = "./cache/"
DOWNLOAD_DIR = "./download/"

# Bugzilla endpoint that returns one issue (with attachments) as XML.
BUG_URL = "https://issues.apache.org/ooo/show_bug.cgi?ctype=xml&id="


def _cachePath(issue):
    """Return the path of the cache file for the given issue ID."""
    return CACHE_DIR + str(issue) + ".xml"


def getXMLFromCache(issue):
    """Return the cached XML for an issue, or "" if it cannot be read.

    The file is read in binary mode so the bytes received from the server
    are handed to the XML parser unchanged.  Prints "R" on a cache hit.
    """
    try:
        with open(_cachePath(issue), "rb") as cacheFile:
            data = cacheFile.read()
    except IOError:
        # A missing or unreadable cache file simply means "not cached".
        return ""

    print("R")
    return data


def writeDataToCache(issue, data):
    """Store the XML for an issue in the cache and print "W".

    data may be a byte string or a text string; text is encoded as UTF-8
    so that the cache always holds raw bytes.  Raises IOError if the cache
    file cannot be written.
    """
    if not isinstance(data, bytes):
        data = data.encode("utf-8")

    with open(_cachePath(issue), "wb") as cacheFile:
        cacheFile.write(data)

    print("W")


def getXML(issueID, maxAttempts=3):
    """Download the XML for one issue; return "" if every attempt fails.

    issueID     -- the Bugzilla ID, appended to the query URL
    maxAttempts -- how many times to try before giving up
    """
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    url = BUG_URL + str(issueID)

    # We get occasional time out errors, so retry up to maxAttempts times
    for attempt in range(1, maxAttempts + 1):
        try:
            conn = urlopen(url)
            try:
                return conn.read()
            finally:
                conn.close()
        except Exception:
            # Not a bare except: Ctrl-C and sys.exit must still get through.
            print(url)
            print("error " + str(attempt))

    return ""


def _saveAttachments(issue, root):
    """Decode every attachment below root into DOWNLOAD_DIR.

    Each file is named "<issue>_<filename>".  Attachments lacking a
    filename or a data element are skipped.  Returns the number of files
    written.
    """
    saved = 0

    for attachment in root.iter('attachment'):
        filename = attachment.findtext('filename')
        base64data = attachment.findtext('data')

        if not filename or base64data is None:
            continue

        # The name comes from the server; keep it inside DOWNLOAD_DIR.
        safeName = filename.replace("/", "_").replace("\\", "_")

        try:
            decoded = base64.b64decode(base64data)
        except (TypeError, ValueError) as e:
            # TypeError on Python 2, binascii.Error (a ValueError) on 3.
            print("decode error for " + safeName + ": " + str(e))
            continue

        try:
            with open(DOWNLOAD_DIR + str(issue) + "_" + safeName, "wb") as out:
                out.write(decoded)
            saved += 1
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))

    return saved


def main(argv=None):
    """Extract the attachments of every issue listed in the file argv[1].

    argv defaults to sys.argv.  Exits with status -1 after printing the
    usage text when the argument count is wrong.
    """
    import os

    if argv is None:
        argv = sys.argv

    if len(argv) != 2:
        print("syntax: python bz-attachment-extract.py <issues>")
        print("where <issues> is a text file containing a list of BZ ID's to extract their attachments, one per line")
        sys.exit(-1)

    with open(argv[1]) as issueFile:
        issues = [line.strip() for line in issueFile]

    for directory in (CACHE_DIR, DOWNLOAD_DIR):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    for issue in issues:
        # Blank lines would otherwise trigger a pointless request.
        if not issue:
            continue

        print(issue)

        data = getXMLFromCache(issue)
        cached = bool(data)

        if not cached:
            data = getXML(issue)
            # Never cache a failed download.
            if data:
                writeDataToCache(issue, data)

        if data:
            try:
                root = ET.fromstring(data)
            except ET.ParseError as e:
                # One bad document must not abort the remaining issues.
                print("XML error for issue " + issue + ": " + str(e))
            else:
                _saveAttachments(issue, root)
        else:
            print("no data for issue " + issue)

        # We don't want to overload the server or get our IP banned
        if not cached:
            time.sleep(15)


if __name__ == "__main__":
    main()