Nilesh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/72457


Change subject: Fixed errors in wikiparser scripts and tested on Hadoop.
......................................................................

Fixed errors in wikiparser scripts and tested on Hadoop.

Change-Id: Ie22d06f74097a547f9f6f4d5a2ff8f6acf6ffd1a
---
M wikiparser/wikiparser.py
M wikiparser/wikiparser_db.py
2 files changed, 25 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikidataEntitySuggester refs/changes/57/72457/1

diff --git a/wikiparser/wikiparser.py b/wikiparser/wikiparser.py
index a123e40..02653d6 100644
--- a/wikiparser/wikiparser.py
+++ b/wikiparser/wikiparser.py
@@ -30,8 +30,8 @@
     tree = etree.parse(StringIO(page))
     page = {child.tag:child.text for child in tree.iter()}
     try:
-        title = page['title'][1:]
         if page['ns'] == '0':
+            title = page['title'][1:]
             text = json.loads(page['text'])
             statement = None
             if 'claims' in text:
@@ -42,12 +42,14 @@
                             toyield1 = str(statement['value'])
                             value = 
str(statement['wikibase-entityid']['numeric-id']) if 'wikibase-entityid' in 
statement else statement['string']
                             toyield2 = str(statement['value']) + "----" + value
-                            sys.stdout.write(toyield1.encode("utf-8", 
'ignore'))
-                            sys.stdout.write(toyield2.encode("utf-8", 
'ignore'))
+                            sys.stdout.write(toyield1.encode("utf-8", 
'ignore') + "\n")
+                            sys.stdout.write(toyield2.encode("utf-8", 
'ignore') + "\n")
                         except KeyError:
-                            toyield1 = toyield2 = None
-    except KeyError:
-        pass
+                            pass
+    except (KeyError, ValueError, TypeError) as e:
+        sys.stderr.write("Error occurred for page : " + str(title) + ", ns = " 
+ str(page['ns']) + "\n")
+        sys.stderr.write(traceback.format_exc() + "\n")
+
 
 if __name__ == '__main__':
     main()
diff --git a/wikiparser/wikiparser_db.py b/wikiparser/wikiparser_db.py
index 6a0413d..be0c6b0 100644
--- a/wikiparser/wikiparser_db.py
+++ b/wikiparser/wikiparser_db.py
@@ -5,6 +5,7 @@
 import json
 import sys
 import MySQLdb as mdb
+import traceback
 
 count = 0
 page = ''
@@ -13,7 +14,7 @@
     con = None
     cur = None
     try:
-        con = mdb.connect('localhost', 'root', 'password', 'wikidatawiki');
+        con = mdb.connect('localhost', 'root', 'orangetail', 'wikidatawiki');
         cur = con.cursor()
         cur.execute("SET FOREIGN_KEY_CHECKS = 0")
         cur.execute("SET UNIQUE_CHECKS = 0")
@@ -26,12 +27,12 @@
             if '<page>' in i:
                 i = sys.stdin.readline()
                 while '</page>' not in i:
-                    page += i#.strip()
+                    page += i
                     i = sys.stdin.readline()
                 page = '<page>' + page + '</page>'
                 parsePage(con, cur, page)
                 count += 1
-                if(count % 1000 == 0)
+                if(count % 10000 == 0):
                     con.commit()
     finally:
         if cur:
@@ -50,10 +51,12 @@
     tree = etree.parse(StringIO(page))
     page = {child.tag:child.text for child in tree.iter()}
     try:
-        title = page['title'][1:]
         if page['ns'] == '0':
+            title = page['title'][1:]
             text = json.loads(page['text'])
-            cur.execute("""INSERT INTO label VALUES (%s, %s, %s)""", 
(int(title), 'en', text['label']['en'].encode("utf-8", 'ignore')))
+            if 'en' not in text['label']: return
+            label = text['label']['en'].encode("utf-8", 'ignore')
+            cur.execute("""INSERT INTO label VALUES (%s, %s, %s)""", 
(int(title), 'en', label))
             statement = None
             if 'claims' in text:
                 for a in text['claims']:
@@ -63,15 +66,18 @@
                             toyield1 = str(statement['value'])
                             value = 
str(statement['wikibase-entityid']['numeric-id']) if 'wikibase-entityid' in 
statement else statement['string']
                             toyield2 = str(statement['value']) + "----" + value
-                            sys.stdout.write(toyield1.encode("utf-8", 
'ignore'))
-                            sys.stdout.write(toyield2.encode("utf-8", 
'ignore'))
+                            sys.stdout.write(toyield1.encode("utf-8", 
'ignore') + "\n")
+                            sys.stdout.write(toyield2.encode("utf-8", 
'ignore') + "\n")
                         except KeyError:
-                            toyield1 = toyield2 = None
+                            pass
         elif page['ns'] == '120':
+            title = page['title'][10:]
             text = json.loads(page['text'])
-            cur.execute("""INSERT INTO plabel VALUES (%s, %s, %s)""", 
(int(title), 'en', text['label']['en'].encode("utf-8", 'ignore')))
-    except KeyError:
-        pass
+            label = text['label']['en'].encode("utf-8", 'ignore')
+            cur.execute("""INSERT INTO plabel VALUES (%s, %s, %s)""", 
(int(title), 'en', label))
+    except (KeyError, ValueError, TypeError) as e:
+        sys.stderr.write("Error occurred for page : " + str(title) + ", ns = " 
+ str(page['ns']) + "\n")
+        sys.stderr.write(traceback.format_exc() + "\n")
     except mdb.Error, e:
         print "Error %d: %s" % (e.args[0],e.args[1])
         sys.exit(1)

-- 
To view, visit https://gerrit.wikimedia.org/r/72457
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie22d06f74097a547f9f6f4d5a2ff8f6acf6ffd1a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikidataEntitySuggester
Gerrit-Branch: master
Gerrit-Owner: Nilesh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to