Nilesh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/72457
Change subject: Fixed errors in wikiparser scripts and tested on Hadoop.
......................................................................
Fixed errors in wikiparser scripts and tested on Hadoop.
Change-Id: Ie22d06f74097a547f9f6f4d5a2ff8f6acf6ffd1a
---
M wikiparser/wikiparser.py
M wikiparser/wikiparser_db.py
2 files changed, 25 insertions(+), 17 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikidataEntitySuggester refs/changes/57/72457/1
diff --git a/wikiparser/wikiparser.py b/wikiparser/wikiparser.py
index a123e40..02653d6 100644
--- a/wikiparser/wikiparser.py
+++ b/wikiparser/wikiparser.py
@@ -30,8 +30,8 @@
tree = etree.parse(StringIO(page))
page = {child.tag:child.text for child in tree.iter()}
try:
- title = page['title'][1:]
if page['ns'] == '0':
+ title = page['title'][1:]
text = json.loads(page['text'])
statement = None
if 'claims' in text:
@@ -42,12 +42,14 @@
toyield1 = str(statement['value'])
value =
str(statement['wikibase-entityid']['numeric-id']) if 'wikibase-entityid' in
statement else statement['string']
toyield2 = str(statement['value']) + "----" + value
- sys.stdout.write(toyield1.encode("utf-8",
'ignore'))
- sys.stdout.write(toyield2.encode("utf-8",
'ignore'))
+ sys.stdout.write(toyield1.encode("utf-8",
'ignore') + "\n")
+ sys.stdout.write(toyield2.encode("utf-8",
'ignore') + "\n")
except KeyError:
- toyield1 = toyield2 = None
- except KeyError:
- pass
+ pass
+ except (KeyError, ValueError, TypeError) as e:
+ sys.stderr.write("Error occurred for page : " + str(title) + ", ns = "
+ str(page['ns']) + "\n")
+ sys.stderr.write(traceback.format_exc() + "\n")
+
if __name__ == '__main__':
main()
diff --git a/wikiparser/wikiparser_db.py b/wikiparser/wikiparser_db.py
index 6a0413d..be0c6b0 100644
--- a/wikiparser/wikiparser_db.py
+++ b/wikiparser/wikiparser_db.py
@@ -5,6 +5,7 @@
import json
import sys
import MySQLdb as mdb
+import traceback
count = 0
page = ''
@@ -13,7 +14,7 @@
con = None
cur = None
try:
- con = mdb.connect('localhost', 'root', 'password', 'wikidatawiki');
+ con = mdb.connect('localhost', 'root', 'orangetail', 'wikidatawiki');
cur = con.cursor()
cur.execute("SET FOREIGN_KEY_CHECKS = 0")
cur.execute("SET UNIQUE_CHECKS = 0")
@@ -26,12 +27,12 @@
if '<page>' in i:
i = sys.stdin.readline()
while '</page>' not in i:
- page += i#.strip()
+ page += i
i = sys.stdin.readline()
page = '<page>' + page + '</page>'
parsePage(con, cur, page)
count += 1
- if(count % 1000 == 0)
+ if(count % 10000 == 0):
con.commit()
finally:
if cur:
@@ -50,10 +51,12 @@
tree = etree.parse(StringIO(page))
page = {child.tag:child.text for child in tree.iter()}
try:
- title = page['title'][1:]
if page['ns'] == '0':
+ title = page['title'][1:]
text = json.loads(page['text'])
- cur.execute("""INSERT INTO label VALUES (%s, %s, %s)""",
(int(title), 'en', text['label']['en'].encode("utf-8", 'ignore')))
+ if 'en' not in text['label']: return
+ label = text['label']['en'].encode("utf-8", 'ignore')
+ cur.execute("""INSERT INTO label VALUES (%s, %s, %s)""",
(int(title), 'en', label))
statement = None
if 'claims' in text:
for a in text['claims']:
@@ -63,15 +66,18 @@
toyield1 = str(statement['value'])
value =
str(statement['wikibase-entityid']['numeric-id']) if 'wikibase-entityid' in
statement else statement['string']
toyield2 = str(statement['value']) + "----" + value
- sys.stdout.write(toyield1.encode("utf-8",
'ignore'))
- sys.stdout.write(toyield2.encode("utf-8",
'ignore'))
+ sys.stdout.write(toyield1.encode("utf-8",
'ignore') + "\n")
+ sys.stdout.write(toyield2.encode("utf-8",
'ignore') + "\n")
except KeyError:
- toyield1 = toyield2 = None
+ pass
elif page['ns'] == '120':
+ title = page['title'][10:]
text = json.loads(page['text'])
- cur.execute("""INSERT INTO plabel VALUES (%s, %s, %s)""",
(int(title), 'en', text['label']['en'].encode("utf-8", 'ignore')))
- except KeyError:
- pass
+ label = text['label']['en'].encode("utf-8", 'ignore')
+ cur.execute("""INSERT INTO plabel VALUES (%s, %s, %s)""",
(int(title), 'en', label))
+ except (KeyError, ValueError, TypeError) as e:
+ sys.stderr.write("Error occurred for page : " + str(title) + ", ns = "
+ str(page['ns']) + "\n")
+ sys.stderr.write(traceback.format_exc() + "\n")
except mdb.Error, e:
print "Error %d: %s" % (e.args[0],e.args[1])
sys.exit(1)
--
To view, visit https://gerrit.wikimedia.org/r/72457
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie22d06f74097a547f9f6f4d5a2ff8f6acf6ffd1a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikidataEntitySuggester
Gerrit-Branch: master
Gerrit-Owner: Nilesh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits