commit:     dde4a3a9c8fbe76897219886f21d046392d65730
Author:     Mart Raudsepp <leio <AT> gentoo <DOT> org>
AuthorDate: Wed Dec  7 01:56:00 2016 +0000
Commit:     Mart Raudsepp <leio <AT> gentoo <DOT> org>
CommitDate: Wed Dec  7 01:56:00 2016 +0000
URL:        https://gitweb.gentoo.org/proj/grumpy.git/commit/?id=dde4a3a9

sync: Add package description and maintainers sync

Maintains a per-package sync timestamp so that recently synced packages are
skipped; if a previous run got stuck, a restart will not redo them too soon.
Commits the DB transaction after every 100 packages, because packages.g.o
seems to rate-limit us; this way progress is saved to the DB periodically,
so a stuck run can be cancelled and restarted without losing everything.

 backend/lib/sync.py | 49 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/backend/lib/sync.py b/backend/lib/sync.py
index e53fa9b..567da2d 100644
--- a/backend/lib/sync.py
+++ b/backend/lib/sync.py
@@ -1,8 +1,11 @@
 import xml.etree.ElementTree as ET
 import requests
+import time
+from datetime import datetime
 from .. import app, db
 from .models import Category, Maintainer, Package, PackageVersion
 
+SYNC_BUFFER_SECS = 30*60
 proj_url = "https://api.gentoo.org/metastructure/projects.xml";
 pkg_url_base = "https://packages.gentoo.org/";
 http_session = requests.session()
@@ -144,11 +147,49 @@ def sync_packages():
     db.session.commit()
 
 def sync_versions():
-    for package in Package.query.all():
+    cnt = 0
+    ts = datetime.utcfromtimestamp(time.time() - SYNC_BUFFER_SECS)
+    now = datetime.utcnow()
+    existing_maintainers = {}
+    for maintainer in Maintainer.query.all():
+        existing_maintainers[maintainer.email] = maintainer
+
+    for package in Package.query.filter(Package.last_sync_ts < ts).all():
+        cnt += 1
         data = http_session.get(pkg_url_base + "packages/" + package.full_name 
+ ".json")
         if not data:
             print("No JSON data for package %s" % package.full_name) # FIXME: 
Handle better; e.g mark the package as removed if no pkgmove update
             continue
-        from pprint import pprint
-        pprint(data.json())
-        break
+
+        pkg = data.json()
+
+        print ("Updating package: %s" % package.full_name)
+        if 'description' in pkg:
+            package.description = pkg['description']
+
+        maintainers = []
+        if 'maintainers' in pkg:
+            for maint in pkg['maintainers']:
+                if 'email' not in maint:
+                    print("WARNING: Package %s was told to have a maintainer 
without an e-mail identifier" % package.full_name)
+                    continue
+                if maint['email'] in existing_maintainers: # FIXME: Some 
proxy-maintainers are using mixed case e-mail address, right now we'd be 
creating duplicates right now if the case is different across different packages
+                    maintainers.append(existing_maintainers[maint['email']])
+                else:
+                    is_project = False
+                    if 'type' in maint and maint['type'] == 'project':
+                        is_project = True
+                    print("Adding %s maintainer %s" % ("project" if is_project 
else "individual", maint['email']))
+                    new_maintainer = Maintainer(email=maint['email'], 
is_project=is_project, name=maint['name'] if 'name' in maint else None)
+                    db.session.add(new_maintainer)
+                    existing_maintainers[maint['email']] = new_maintainer
+                    maintainers.append(new_maintainer)
+
+        # Intentionally outside if 'maintainers' in pkg, because if there are 
no maintainers in JSON, it's falled to maintainer-needed and we need to clean 
out old maintainer entries
+        package.maintainers = maintainers # TODO: Retain order to know who is 
primary; retain description associated with the maintainership
+        package.last_sync_ts = now
+
+        if not cnt % 100:
+            print("%d packages updated, committing DB transaction" % cnt)
+            db.session.commit()
+            now = datetime.utcnow()

Reply via email to