# HG changeset patch # User Josef 'Jeff' Sipek <jef...@josefsipek.net> # Date 1519251311 18000 # Wed Feb 21 17:15:11 2018 -0500 # Node ID b99df94fdd4813e0ce538a8caa682802da4a6cb2 # Parent 106872aa15af9919220705ed72c78459774e1575 mdiff: split on unicode character boundaries when shortening function name
Splitting the raw bytes may truncate the string in the middle of a multi-byte UTF-8 character, which would leave the generated diff containing an invalid byte sequence even though the original data had none. For example, the Unicode codepoint U+308B (る) gets represented as \xe3\x82\x8b in UTF-8. Before this change, a diff on i18n/ja.po would yield: @@ -28953,7 +28953,7 @@ msgstr "Mercurial と SSH を併用す<E3><82> After this change, the output is cleaner: @@ -28953,7 +28953,7 @@ msgstr "Mercurial と SSH を併用する場合の注意点:" diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py --- a/mercurial/mdiff.py +++ b/mercurial/mdiff.py @@ -348,7 +348,12 @@ def _unidiff(t1, t2, opts=defaultopts): # alphanumeric char. for i in xrange(astart - 1, lastpos - 1, -1): if l1[i][0:1].isalnum(): - func = ' ' + l1[i].rstrip()[:40] + func = l1[i].rstrip() + try: + func = func.decode("utf-8")[:40].encode("utf-8") + except: + func = func[:40] + func = ' ' + func lastfunc[1] = func break # by recording this hunk's starting point as the next place to _______________________________________________ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel