Milimetric has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/75865


Change subject: trying to handle unicode names
......................................................................

trying to handle unicode names

Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
---
M tests/test_controllers/test_cohorts.py
M wikimetrics/controllers/cohorts.py
2 files changed, 20 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wikimetrics refs/changes/65/75865/1

diff --git a/tests/test_controllers/test_cohorts.py b/tests/test_controllers/test_cohorts.py
index c3fdc93..4d4d60f 100644
--- a/tests/test_controllers/test_cohorts.py
+++ b/tests/test_controllers/test_cohorts.py
@@ -1,7 +1,9 @@
+# -*- coding:utf-8 -*-
 import pprint
 import json
-from nose.tools import assert_equal
+from nose.tools import assert_equal, assert_not_equal
 from tests.fixtures import WebTest
+from wikimetrics.controllers.cohorts import *
 
 
 class TestCohortsController(WebTest):
@@ -58,3 +60,14 @@
             response.status_code,
             404,
         )
+    
+    def test_validate_username(self):
+        # this username has a few problems that the normalize call should handle
+        # 1. normal ascii space in front
+        # 2. lowercase
+        # 3. nasty trailing unicode space (the reason this file has an encoding definition)
+        problem_username = ' danĀ '
+        
+        parsed_user = parse_username(problem_username)
+        valid_user = normalize_user(parsed_user, 'enwiki')
+        assert_not_equal(valid_user, None)
diff --git a/wikimetrics/controllers/cohorts.py b/wikimetrics/controllers/cohorts.py
index c280ed3..8b0966e 100644
--- a/wikimetrics/controllers/cohorts.py
+++ b/wikimetrics/controllers/cohorts.py
@@ -283,26 +283,25 @@
                 project = default_project
             
             parsed.append({
-                'raw_username': parse_username(username, decode=False),
                 'username': parse_username(username),
                 'project': project,
             })
     return parsed
 
 
-def parse_username(raw_username, decode=True):
+def parse_username(username):
     """
     parses uncapitalized, whitespace-padded, and weird-charactered mediawiki
     user names into ones that have a chance of being found in the database
     """
-    username = str(raw_username)
-    if decode:
-        username = username.decode('utf8')
+    username = str(username)
+    username = username.decode('utf8')
     stripped = username.strip()
     # Capitalize the username according to the Mediawiki standard
     # NOTE: unfortunately .title() or .capitalize() don't work
     # because 'miliMetric'.capitalize() == 'Milimetric'
-    return stripped[0].upper() + stripped[1:]
+    capitalized = stripped[0].upper() + stripped[1:]
+    return capitalized.encode('utf8')
 
 
 def normalize_project(project):
@@ -319,7 +318,6 @@
 
 
 def get_wikiuser_by_name(username, project):
-    # NOTE: Not needed right? username = username.encode('utf-8')
     db_session = db.get_mw_session(project)
     try:
         wikiuser = db_session.query(MediawikiUser)\
@@ -393,7 +391,7 @@
             record['reason_invalid'] = 'invalid project: %s' % record['project']
             invalid.append(record)
             continue
-        normalized_user = normalize_user(record['raw_username'], normalized_project)
+        normalized_user = normalize_user(record['username'], normalized_project)
         # make a link to the potential user page even if user doesn't exist
         # this gives a chance to see any misspelling etc.
         if normalized_user is None:

-- 
To view, visit https://gerrit.wikimedia.org/r/75865
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icf936da3ebcf37c88cb937c1175ca37f7ab0cc65
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to