Author: robweir
Date: Thu Mar 13 13:42:09 2014
New Revision: 1577158

URL: http://svn.apache.org/r1577158
Log:
Script to download attachments from BZ

Added:
    openoffice/devtools/bz-tools/
    openoffice/devtools/bz-tools/bz-attachment-extract.py

Added: openoffice/devtools/bz-tools/bz-attachment-extract.py
URL: http://svn.apache.org/viewvc/openoffice/devtools/bz-tools/bz-attachment-extract.py?rev=1577158&view=auto
==============================================================================
--- openoffice/devtools/bz-tools/bz-attachment-extract.py (added)
+++ openoffice/devtools/bz-tools/bz-attachment-extract.py Thu Mar 13 13:42:09 2014
@@ -0,0 +1,138 @@
+################################################################
+# 
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#  
+#    http://www.apache.org/licenses/LICENSE-2.0
+#  
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#  
+################################################################
+
+
+# This script reads a text file containing AOO Bugzilla ID's and uses
+# the Bugzilla REST API to download the XML data for each BZ issue,
+# extract and decode and save any attachments.  The XML for each issue
+# is cached, so repeated runs can avoid redundant hits to the server.
+
+import urllib
+import sys
+import datetime
+import time
+
+import base64
+
+import xml.etree.ElementTree as ET
+
+from urllib import urlencode
+
+#Smarter caching would look at the modify date of the issue and invalidate the cache if the issue had changed...
+def getXMLFromCache(issue):
+
+    fileName = "./cache/" + str(issue) + ".xml"
+
+    try:
+    
+        file = open(fileName) 
+
+        data = file.read()
+
+        print "R"
+
+
+    except IOError as e:
+        data = ""
+
+
+    return data
+
+
+def writeDataToCache(issue,data):
+
+    fileName = "./cache/" + str(issue) + ".xml"
+
+    file = open(fileName, "w")
+    file.write(data)
+    file.close()
+
+    print "W"
+
+
+def getXML(issueID):
+
+    url = "https://issues.apache.org/ooo/show_bug.cgi?ctype=xml&id=" + str(issueID)
+
+    attempts = 0
+
+# We get occasional time out errors, so retry up to 3 times
+    while attempts < 3:
+        try:    
+            conn = urllib.urlopen(url)
+            data = conn.read()
+
+            return data
+
+        except:
+            attempts += 1
+            print url
+            print "error " + str(attempts)
+
+    return ""
+    
+    
+if len(sys.argv) != 2:
+    print "syntax:  python bz-attach-extract.py <issues>"
+    print "where <issues> is a text file containing a list of BZ ID's to extract their attachments, one per line"
+    exit(-1)
+
+issues = [line.strip() for line in open(sys.argv[1])]
+
+for issue in issues:
+    print issue
+
+    cached = False
+
+    data = getXMLFromCache(issue)
+
+    if data=="":
+        data = getXML(issue)
+        writeDataToCache(issue,data)
+    else:
+        cached = True
+
+    root = ET.fromstring(data)
+
+    for attachment in root.iter('attachment'):
+        mimetype =  attachment.find('type').text
+
+        filename =  attachment.find('filename').text
+        
+        base64data = attachment.find('data').text
+
+        decoded = base64.b64decode(base64data)
+        
+        try:
+
+            file = open("./download/" + issue + "_" + filename, "wb")
+            file.write(decoded)
+            file.close()
+
+        except IOError as e:
+            print "I/O error({0}): {1}".format(e.errno, e.strerror)
+
+# We don't want to overload the server or get our IP banned 
+    if cached == False:
+        time.sleep(15)
+
+
+


Reply via email to