Title: [100700] trunk/Tools
Revision
100700
Author
rn...@webkit.org
Date
2011-11-17 16:46:03 -0800 (Thu, 17 Nov 2011)

Log Message

fuzzy_match doesn't recognize "Dan B" or "hyatt"
https://bugs.webkit.org/show_bug.cgi?id=72636

Reviewed by Eric Seidel.

Make contributors_by_fuzzy_match also search for the string among shorthand names of contributors such as "Ryosuke N"
and among the user names of email addresses such as rniwa in rn...@webkit.org.

Also import the legacy contributor names and misspelling lists from Eric's script on bug 26533 as a test.

* Scripts/webkitpy/common/config/committers.py:
* Scripts/webkitpy/common/config/committers_unittest.py:

Modified Paths

Diff

Modified: trunk/Tools/ChangeLog (100699 => 100700)


--- trunk/Tools/ChangeLog	2011-11-18 00:39:33 UTC (rev 100699)
+++ trunk/Tools/ChangeLog	2011-11-18 00:46:03 UTC (rev 100700)
@@ -1,3 +1,18 @@
+2011-11-17  Ryosuke Niwa  <rn...@webkit.org>
+
+        fuzzy_match doesn't recognize "Dan B" or "hyatt"
+        https://bugs.webkit.org/show_bug.cgi?id=72636
+
+        Reviewed by Eric Seidel.
+
+        Make contributors_by_fuzz_match search string in shorthand names of contributors such as "Ryosuke N"
+        and user names of email addreses such as rniwa in rn...@webkit.org.
+
+        Also import legacy contributor names and mispell lists from Eric's script on the bug 26533 as a test.
+
+        * Scripts/webkitpy/common/config/committers.py:
+        * Scripts/webkitpy/common/config/committers_unittest.py:
+
 2011-11-17  Eric Seidel  <e...@webkit.org>
 
         Remove support for running check-webkit-style w/o a webkit checkout

Modified: trunk/Tools/Scripts/webkitpy/common/config/committers.py (100699 => 100700)


--- trunk/Tools/Scripts/webkitpy/common/config/committers.py	2011-11-18 00:39:33 UTC (rev 100699)
+++ trunk/Tools/Scripts/webkitpy/common/config/committers.py	2011-11-18 00:46:03 UTC (rev 100700)
@@ -531,6 +531,24 @@
     def contributors_by_search_string(self, string):
         return filter(lambda contributor: contributor.contains_string(string), self.contributors())
 
+    def contributors_by_email_username(self, string):
+        string = string + '@'
+        result = []
+        for contributor in self.contributors():
+            for email in contributor.emails:
+                if email.startswith(string):
+                    result.append(contributor)
+                    break
+        return result
+
+    def _contributor_name_shorthands(self, contributor):
+        if ' ' not in contributor.full_name:
+            return []
+        split_fullname = contributor.full_name.split()
+        first_name = split_fullname[0]
+        last_name = split_fullname[-1]
+        return first_name, last_name, first_name + last_name[0], first_name + ' ' + last_name[0]
+
     def _tokenize_contributor_name(self, contributor):
         full_name_in_lowercase = contributor.full_name.lower()
         tokens = [full_name_in_lowercase] + full_name_in_lowercase.split()
@@ -539,14 +557,25 @@
         return tokens
 
     def contributors_by_fuzzy_match(self, string):
-        string = string.lower()
+        string_in_lowercase = string.lower()
 
-        # First path, optimitically match for fullname, email and irc_nicknames
-        account = self.contributor_by_name(string) or self.account_by_email(string) or self.contributor_by_irc_nickname(string)
+        # 1. Exact match for fullname, email and irc_nicknames
+        account = self.contributor_by_name(string_in_lowercase) or self.account_by_email(string_in_lowercase) or self.contributor_by_irc_nickname(string_in_lowercase)
         if account:
             return [account], 0
 
-        # Second path, much slower search using edit-distance
+        # 2. Exact match for email username (before @)
+        accounts = self.contributors_by_email_username(string_in_lowercase)
+        if accounts and len(accounts) == 1:
+            return accounts, 0
+
+        # 3. Exact match for first name, last name, and first name + initial combinations such as "Dan B" and "Tim H"
+        accounts = [contributor for contributor in self.contributors() if string in self._contributor_name_shorthands(contributor)]
+        if accounts and len(accounts) == 1:
+            return accounts, 0
+
+        # 4. Finally, fuzzy-match using edit-distance
+        string = string_in_lowercase
         contributorWithMinDistance = []
         minDistance = len(string) / 2 - 1
         for contributor in self.contributors():

Modified: trunk/Tools/Scripts/webkitpy/common/config/committers_unittest.py (100699 => 100700)


--- trunk/Tools/Scripts/webkitpy/common/config/committers_unittest.py	2011-11-18 00:39:33 UTC (rev 100699)
+++ trunk/Tools/Scripts/webkitpy/common/config/committers_unittest.py	2011-11-18 00:46:03 UTC (rev 100700)
@@ -35,8 +35,10 @@
         committer = Committer('Test One', 'o...@test.com', 'one')
         reviewer = Reviewer('Test Two', ['t...@test.com', 't...@rad.com', 'so_...@gmail.com'])
         contributor = Contributor('Test Three', ['th...@test.com'], 'three')
-        contributor_with_two_nicknames = Contributor('Other Four', ['otherf...@webkit.org'], ['four', 'otherfour'])
-        committer_list = CommitterList(watchers=[account], committers=[committer], reviewers=[reviewer], contributors=[contributor, contributor_with_two_nicknames])
+        contributor_with_two_nicknames = Contributor('Other Four', ['otherf...@webkit.org', 'otherf...@webkit2.org'], ['four', 'otherfour'])
+        contributor_with_same_email_username = Contributor('Yet Another Four', ['otherf...@webkit.com'], ['yetanotherfour'])
+        committer_list = CommitterList(watchers=[account], committers=[committer], reviewers=[reviewer],
+            contributors=[contributor, contributor_with_two_nicknames, contributor_with_same_email_username])
 
         # Test valid committer, reviewer and contributor lookup
         self.assertEqual(committer_list.account_by_email('z...@test.com'), account)
@@ -87,7 +89,7 @@
         self.assertEqual(committer_list.contributor_by_irc_nickname('otherfour'), contributor_with_two_nicknames)
 
         # Test that the lists returned are are we expect them.
-        self.assertEqual(committer_list.contributors(), [contributor, contributor_with_two_nicknames, committer, reviewer])
+        self.assertEqual(committer_list.contributors(), [contributor, contributor_with_two_nicknames, contributor_with_same_email_username, committer, reviewer])
         self.assertEqual(committer_list.committers(), [committer, reviewer])
         self.assertEqual(committer_list.reviewers(), [reviewer])
 
@@ -95,10 +97,15 @@
         self.assertEqual(committer_list.contributors_by_search_string('rad'), [reviewer])
         self.assertEqual(committer_list.contributors_by_search_string('Two'), [reviewer])
 
+        self.assertEqual(committer_list.contributors_by_email_username("one"), [committer])
+        self.assertEqual(committer_list.contributors_by_email_username("four"), [])
+        self.assertEqual(committer_list.contributors_by_email_username("otherfour"), [contributor_with_two_nicknames, contributor_with_same_email_username])
+
     def _assert_fuzz_match(self, text, name_of_expected_contributor, expected_distance):
         committers = CommitterList()
-        expected_contributors = [committers.contributor_by_name(name_of_expected_contributor)] if name_of_expected_contributor else []
-        self.assertEqual(committers.contributors_by_fuzzy_match(text), (expected_contributors, expected_distance))
+        contributors, distance = committers.contributors_by_fuzzy_match(text)
+        expected_names = [name_of_expected_contributor] if name_of_expected_contributor else []
+        self.assertEqual(([contributor.full_name for contributor in contributors], distance), (expected_names, expected_distance))
 
     def test_contributors_by_fuzzy_match(self):
         self._assert_fuzz_match('Geoff Garen', 'Geoffrey Garen', 3)
@@ -118,6 +125,8 @@
         self._assert_fuzz_match('Sam', 'Sam Weinig', 0)
         self._assert_fuzz_match('darin', 'Darin Adler', 0)
         self._assert_fuzz_match('harrison', 'David Harrison', 0)
+        self._assert_fuzz_match('me', None, 2)
+        self._assert_fuzz_match('myself', None, 6)
         self._assert_fuzz_match('others', None, 6)
         self._assert_fuzz_match('BUILD FIX', None, 9)
         self._assert_fuzz_match('but Dan Bernstein also reviewed', None, 31)
@@ -126,3 +135,150 @@
         self._assert_fuzz_match('a spell checker', None, 15)
         self._assert_fuzz_match('nobody, build fix', None, 17)
         self._assert_fuzz_match('NOBODY (chromium build fix)', None, 27)
+
+    def test_contributors_by_fuzzy_match_with_legacy_names(self):
+        # Commented out lines are test cases imported from the bug 26533 yet to pass.
+
+        self._assert_fuzz_match('Ada', 'Ada Chan', 0)
+        self._assert_fuzz_match('adele', 'Adele Peterson', 0)
+#        self._assert_fuzz_match('Adam', 'Adam Roben', 0)
+        self._assert_fuzz_match('aroben', 'Adam Roben', 0)
+#        self._assert_fuzz_match('Alexey', 'Alexey Proskuryakov', 0)
+        self._assert_fuzz_match('ap', 'Alexey Proskuryakov', 0)
+        self._assert_fuzz_match('Alexey P', 'Alexey Proskuryakov', 0)
+#        self._assert_fuzz_match('Alice', 'Alice Liu', 0)
+        self._assert_fuzz_match('aliu', 'Alice Liu', 0)
+        self._assert_fuzz_match('Liu', 'Alice Liu', 0)
+        self._assert_fuzz_match('Alp', 'Alp Toker', 0)
+        self._assert_fuzz_match('Anders', 'Anders Carlsson', 0)
+        self._assert_fuzz_match('andersca', 'Anders Carlsson', 0)
+        self._assert_fuzz_match('anders', 'Anders Carlsson', 0)
+        self._assert_fuzz_match('Andersca', 'Anders Carlsson', 0)
+        self._assert_fuzz_match('Antti', 'Antti Koivisto', 0)
+
+        self._assert_fuzz_match('Beth', 'Beth Dakin', 0)
+        self._assert_fuzz_match('beth', 'Beth Dakin', 0)
+        self._assert_fuzz_match('bdakin', 'Beth Dakin', 0)
+        self._assert_fuzz_match('Brady', 'Brady Eidson', 0)
+        self._assert_fuzz_match('bradee-oh', 'Brady Eidson', 0)
+
+#        self._assert_fuzz_match('Cameron', 'Cameron Zwarich', 0)
+#        self._assert_fuzz_match('cpst', 'Cameron Zwarich', 1)
+#        self._assert_fuzz_match('Chris', 'Chris Blumenberg', 0)
+        self._assert_fuzz_match('cblu', 'Chris Blumenberg', 0)
+
+        self._assert_fuzz_match('Dan', 'Dan Bernstein', 0)
+        self._assert_fuzz_match('Dan B', 'Dan Bernstein', 0)
+#        self._assert_fuzz_match('mitz', 'Dan Bernstein', 0)
+        self._assert_fuzz_match('Mitz Pettel', 'Dan Bernstein', 1)
+        self._assert_fuzz_match('Mitzpettel', 'Dan Bernstein', 0)
+        self._assert_fuzz_match('Mitz Pettel RTL', 'Dan Bernstein', 5)
+        self._assert_fuzz_match('Teh Mitzpettel', 'Dan Bernstein', 4)
+#        self._assert_fuzz_match('The Mitz', 'Dan Bernstein', 0)
+
+        self._assert_fuzz_match('Darin', 'Darin Adler', 0)  # Thankfully "Fisher" is longer than "Adler"
+
+        self._assert_fuzz_match('Dave Harrison', 'David Harrison', 2)
+        self._assert_fuzz_match('harrison', 'David Harrison', 0)
+        self._assert_fuzz_match('Dr. Harrison', 'David Harrison', 4)
+        self._assert_fuzz_match('Dave Harrson', 'David Harrison', 3)
+        self._assert_fuzz_match('Dave Harrsion', 'David Harrison', 4)  # Damerau-Levenshtein distance is 3
+
+        self._assert_fuzz_match('Daddy Hyatt', 'David Hyatt', 3)
+#        self._assert_fuzz_match('Dave', 'David Hyatt', 0)  # 'Dave' could mean harrison.
+        self._assert_fuzz_match('hyatt', 'David Hyatt', 0)
+#        self._assert_fuzz_match('Haytt', 'David Hyatt', 0)  # Works if we had implemented Damerau-Levenshtein distance!
+        self._assert_fuzz_match('Dave Kilzer', 'David Kilzer', 2)
+        self._assert_fuzz_match('David D. Kilzer', 'David Kilzer', 3)
+        self._assert_fuzz_match('ddkilzer', 'David Kilzer', 0)
+        self._assert_fuzz_match('Don', 'Don Melton', 0)
+        self._assert_fuzz_match('Gramps', 'Don Melton', 0)
+
+#        self._assert_fuzz_match('eric', 'Eric Seidel', 0)
+        self._assert_fuzz_match('Eric S', 'Eric Seidel', 0)
+#        self._assert_fuzz_match('MacDome', 'Eric Seidel', 0)
+        self._assert_fuzz_match('eseidel', 'Eric Seidel', 0)
+
+#        self._assert_fuzz_match('Geof', 'Geoffrey Garen', 4)
+#        self._assert_fuzz_match('Geoff', 'Geoffrey Garen', 3)
+        self._assert_fuzz_match('Geoff Garen', 'Geoffrey Garen', 3)
+        self._assert_fuzz_match('ggaren', 'Geoffrey Garen', 0)
+#        self._assert_fuzz_match('geoff', 'Geoffrey Garen', 0)
+        self._assert_fuzz_match('Geoffrey', 'Geoffrey Garen', 0)
+        self._assert_fuzz_match('GGaren', 'Geoffrey Garen', 0)
+#        self._assert_fuzz_match('Greg', 'Greg Bolsinga', 0)
+
+        self._assert_fuzz_match('Holger', 'Holger Freyther', 0)
+        self._assert_fuzz_match('Holger Hans Peter Freyther', 'Holger Freyther', 11)
+
+#        self._assert_fuzz_match('john', 'John Sullivan', 0)
+        self._assert_fuzz_match('sullivan', 'John Sullivan', 0)
+        self._assert_fuzz_match('John Honeycutt', 'Jon Honeycutt', 1)
+#        self._assert_fuzz_match('Jon', 'Jon Honeycutt', 0)
+#        self._assert_fuzz_match('justin', 'Justin Garcia', 0)
+        self._assert_fuzz_match('justing', 'Justin Garcia', 0)
+
+        self._assert_fuzz_match('ken', 'Ken Kocienda', 0)
+        self._assert_fuzz_match('kocienda', 'Ken Kocienda', 0)
+        self._assert_fuzz_match('kdecker', 'Kevin Decker', 0)
+        self._assert_fuzz_match('Kevin M', 'Kevin McCullough', 0)
+        self._assert_fuzz_match('Kevin McCulough', 'Kevin McCullough', 1)
+        self._assert_fuzz_match('mccullough', 'Kevin McCullough', 0)
+
+        self._assert_fuzz_match('lars', 'Lars Knoll', 0)
+        self._assert_fuzz_match('levi', 'Levi Weintraub', 0)
+
+        self._assert_fuzz_match('Maciej', 'Maciej Stachowiak', 0)
+#        self._assert_fuzz_match('mjs', 'Maciej Stachowiak', 0)
+        self._assert_fuzz_match('Maciej S', 'Maciej Stachowiak', 0)
+
+#        self._assert_fuzz_match('Mark', 'Mark Rowe', 0)
+        self._assert_fuzz_match('bdash', 'Mark Rowe', 0)
+        self._assert_fuzz_match('mrowe', 'Mark Rowe', 0)
+#        self._assert_fuzz_match('Brian Dash', 'Mark Rowe', 0)
+
+#        self._assert_fuzz_match('Niko', 'Nikolas Zimmermann', 1)
+        self._assert_fuzz_match('Niko Zimmermann', 'Nikolas Zimmermann', 3)
+        self._assert_fuzz_match('Nikolas', 'Nikolas Zimmermann', 0)
+
+#        self._assert_fuzz_match('Oliver', 'Oliver Hunt', 0)
+        self._assert_fuzz_match('Ollie', 'Oliver Hunt', 1)
+        self._assert_fuzz_match('Olliej', 'Oliver Hunt', 0)
+        self._assert_fuzz_match('Olliej Hunt', 'Oliver Hunt', 3)
+        self._assert_fuzz_match('olliej', 'Oliver Hunt', 0)
+        self._assert_fuzz_match('ollie', 'Oliver Hunt', 1)
+        self._assert_fuzz_match('ollliej', 'Oliver Hunt', 1)
+
+        self._assert_fuzz_match('Richard', 'Richard Williamson', 0)
+        self._assert_fuzz_match('rjw', 'Richard Williamson', 0)
+        self._assert_fuzz_match('Rob', 'Rob Buis', 0)
+        self._assert_fuzz_match('rwlbuis', 'Rob Buis', 0)
+
+        self._assert_fuzz_match('Sam', 'Sam Weinig', 0)
+#        self._assert_fuzz_match('Weinig Sam', 'weinig', 0)
+        self._assert_fuzz_match('Weinig', 'Sam Weinig', 0)
+        self._assert_fuzz_match('Sam W', 'Sam Weinig', 0)
+        self._assert_fuzz_match('Sammy Weinig', 'Sam Weinig', 2)
+#        self._assert_fuzz_match('Simon', 'Simon Fraser', 0)
+        self._assert_fuzz_match('Sfalken', 'Steve Falkenburg', 0)
+#        self._assert_fuzz_match('Steve', 'Steve Falkenburg', 0)
+
+#        self._assert_fuzz_match('timo', 'Tim Omernick', 0)
+        self._assert_fuzz_match('TimO', 'Tim Omernick', 0)
+#        self._assert_fuzz_match('Timo O', 'Tim Omernick', 0)
+#        self._assert_fuzz_match('Tim O.', 'Tim Omernick', 0)
+        self._assert_fuzz_match('Tim O', 'Tim Omernick', 0)
+
+#        self._assert_fuzz_match('Tim', 'Timothy Hatcher', 0)
+#        self._assert_fuzz_match('Tim H', 'Timothy Hatcher', 0)
+        self._assert_fuzz_match('Tim Hatcher', 'Timothy Hatcher', 4)
+        self._assert_fuzz_match('Tim Hatcheri', 'Timothy Hatcher', 5)
+        self._assert_fuzz_match('timothy', 'Timothy Hatcher', 0)
+        self._assert_fuzz_match('thatcher', 'Timothy Hatcher', 1)
+        self._assert_fuzz_match('xenon', 'Timothy Hatcher', 0)
+        self._assert_fuzz_match('Hatcher', 'Timothy Hatcher', 0)
+#        self._assert_fuzz_match('TimH', 'Timothy Hatcher', 0)
+
+        self._assert_fuzz_match('Tor Arne', u"Tor Arne Vestb\u00f8", 1)  # Matches IRC nickname
+        self._assert_fuzz_match('Vicki', u"Vicki Murley", 0)
+        self._assert_fuzz_match('Zack', 'Zack Rusin', 0)
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes

Reply via email to