Milimetric has uploaded a new change for review. https://gerrit.wikimedia.org/r/75865
Change subject: trying to handle unicode names
......................................................................

trying to handle unicode names

Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
---
M tests/test_controllers/test_cohorts.py
M wikimetrics/controllers/cohorts.py
2 files changed, 20 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wikimetrics refs/changes/65/75865/1

diff --git a/tests/test_controllers/test_cohorts.py b/tests/test_controllers/test_cohorts.py
index c3fdc93..4d4d60f 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -1,7 +1,9 @@
+# -*- coding:utf-8 -*-
 import pprint
 import json
-from nose.tools import assert_equal
+from nose.tools import assert_equal, assert_not_equal
 from tests.fixtures import WebTest
+from wikimetrics.controllers.cohorts import *
 
 
 class TestCohortsController(WebTest):
@@ -58,3 +60,14 @@
             response.status_code,
             404,
         )
+
+    def test_validate_username(self):
+        # this username has a few problems that the normalize call should handle
+        # 1. normal ascii space in front
+        # 2. lowercase
+        # 3. nasty trailing unicode space (the reason this file has an encoding definition)
+        problem_username = ' danĀ '
+
+        parsed_user = parse_username(problem_username)
+        valid_user = normalize_user(parsed_user, 'enwiki')
+        assert_not_equal(valid_user, None)
diff --git a/wikimetrics/controllers/cohorts.py b/wikimetrics/controllers/cohorts.py
index c280ed3..8b0966e 100644
--- a/wikimetrics/controllers/cohorts.py
+++ b/wikimetrics/controllers/cohorts.py
@@ -283,26 +283,25 @@
                 project = default_project
 
             parsed.append({
-                'raw_username': parse_username(username, decode=False),
                 'username': parse_username(username),
                 'project': project,
             })
     return parsed
 
 
-def parse_username(raw_username, decode=True):
+def parse_username(username):
     """
     parses uncapitalized, whitespace-padded, and weird-charactered mediawiki
     user names into ones that have a chance of being found in the database
     """
-    username = str(raw_username)
-    if decode:
-        username = username.decode('utf8')
+    username = str(username)
+    username = username.decode('utf8')
     stripped = username.strip()
     # Capitalize the username according to the Mediawiki standard
     # NOTE: unfortunately .title() or .capitalize() don't work
     # because 'miliMetric'.capitalize() == 'Milimetric'
-    return stripped[0].upper() + stripped[1:]
+    capitalized = stripped[0].upper() + stripped[1:]
+    return capitalized.encode('utf8')
 
 
 def normalize_project(project):
@@ -319,7 +318,6 @@
 
 
 def get_wikiuser_by_name(username, project):
-    # NOTE: Not needed right? username = username.encode('utf-8')
     db_session = db.get_mw_session(project)
     try:
         wikiuser = db_session.query(MediawikiUser)\
@@ -393,7 +391,7 @@
             record['reason_invalid'] = 'invalid project: %s' % record['project']
             invalid.append(record)
             continue
-        normalized_user = normalize_user(record['raw_username'], normalized_project)
+        normalized_user = normalize_user(record['username'], normalized_project)
         # make a link to the potential user page even if user doesn't exist
         # this gives a chance to see any misspelling etc.
         if normalized_user is None:

-- 
To view, visit https://gerrit.wikimedia.org/r/75865
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits