Re: [PATCH] diff: do not split function name if character encoding is unknown
On Fri, Feb 23, 2018 at 11:53:18PM +0900, Yuya Nishihara wrote: > # HG changeset patch > # User Yuya Nishihara > # Date 1519394998 -32400 > # Fri Feb 23 23:09:58 2018 +0900 > # Node ID 98cfd7926442dc0a649e0359455ad6962815bd13 > # Parent b8d0761a85c7421071750de23228415306852d69 > diff: do not split function name if character encoding is unknown queued, thanks ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Re: [PATCH] diff: do not split function name if character encoding is unknown
On Fri, Feb 23, 2018 at 23:53:18 +0900, Yuya Nishihara wrote: > # HG changeset patch > # User Yuya Nishihara> # Date 1519394998 -32400 > # Fri Feb 23 23:09:58 2018 +0900 > # Node ID 98cfd7926442dc0a649e0359455ad6962815bd13 > # Parent b8d0761a85c7421071750de23228415306852d69 > diff: do not split function name if character encoding is unknown > > Only ASCII characters can be split reliably at any byte positions, so let's > just leave long multi-byte sequence long. It's probably less bad than putting > an invalid byte sequence into a diff. > > This doesn't try to split the first ASCII slice from multi-byte sequence > because a combining character may follow. I like it! Thanks, Jeff. > > diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py > --- a/mercurial/mdiff.py > +++ b/mercurial/mdiff.py > @@ -13,6 +13,7 @@ import zlib > > from .i18n import _ > from . import ( > +encoding, > error, > policy, > pycompat, > @@ -348,7 +349,11 @@ def _unidiff(t1, t2, opts=defaultopts): > # alphanumeric char. > for i in xrange(astart - 1, lastpos - 1, -1): > if l1[i][0:1].isalnum(): > -func = ' ' + l1[i].rstrip()[:40] > +func = b' ' + l1[i].rstrip() > +# split long function name if ASCII. otherwise we have no > +# idea where the multi-byte boundary is, so just leave > it. > +if encoding.isasciistr(func): > +func = func[:41] > lastfunc[1] = func > break > # by recording this hunk's starting point as the next place to > diff --git a/tests/test-diff-unified.t b/tests/test-diff-unified.t > --- a/tests/test-diff-unified.t > +++ b/tests/test-diff-unified.t > @@ -386,3 +386,73 @@ If [diff] git is set to true, but the us > } > >$ cd .. > + > +Long function names should be abbreviated, but multi-byte character shouldn't > +be broken up > + > + $ hg init longfunc > + $ cd longfunc > + > + >>> with open('a', 'wb') as f: > + ... f.write(b'a' * 39 + b'bb' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 0 b\n') > + ... f.write(b' .\n' * 3) > + ... 
f.write(b'a' * 39 + b'\xc3\xa0' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 0 a with grave (single code point)\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b'a' * 39 + b'a\xcc\x80' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 0 a with grave (composition)\n') > + ... f.write(b' .\n' * 3) > + $ hg ci -qAm0 > + > + >>> with open('a', 'wb') as f: > + ... f.write(b'a' * 39 + b'bb' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 1 b\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b'a' * 39 + b'\xc3\xa0' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 1 a with grave (single code point)\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b'a' * 39 + b'a\xcc\x80' + b'\n') > + ... f.write(b' .\n' * 3) > + ... f.write(b' 1 a with grave (composition)\n') > + ... f.write(b' .\n' * 3) > + $ hg ci -m1 > + > + $ hg diff -c1 --nodates --show-function > + diff -r 3e92dd6fa812 -r a256341606cb a > + --- a/a > + +++ b/a > + @@ -2,7 +2,7 @@ aaab > +. > +. > +. > + - 0 b > + + 1 b > +. > +. > +. > + @@ -10,7 +10,7 @@ aaa\xc3\xa0 (esc) > +. > +. > +. > + - 0 a with grave (single code point) > + + 1 a with grave (single code point) > +. > +. > +. > + @@ -18,7 +18,7 @@ \xcc\x80 (esc) > +. > +. > +. > + - 0 a with grave (composition) > + + 1 a with grave (composition) > +. > +. > +. > + > + $ cd .. -- In personal conversations with technical people, I call myself a hacker. But when I'm talking to journalists I just say "programmer" or something like that. - Linus Torvalds ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
[PATCH] diff: do not split function name if character encoding is unknown
# HG changeset patch # User Yuya Nishihara # Date 1519394998 -32400 # Fri Feb 23 23:09:58 2018 +0900 # Node ID 98cfd7926442dc0a649e0359455ad6962815bd13 # Parent b8d0761a85c7421071750de23228415306852d69 diff: do not split function name if character encoding is unknown Only ASCII characters can be split reliably at any byte positions, so let's just leave long multi-byte sequence long. It's probably less bad than putting an invalid byte sequence into a diff. This doesn't try to split the first ASCII slice from multi-byte sequence because a combining character may follow. diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py --- a/mercurial/mdiff.py +++ b/mercurial/mdiff.py @@ -13,6 +13,7 @@ import zlib from .i18n import _ from . import ( +encoding, error, policy, pycompat, @@ -348,7 +349,11 @@ def _unidiff(t1, t2, opts=defaultopts): # alphanumeric char. for i in xrange(astart - 1, lastpos - 1, -1): if l1[i][0:1].isalnum(): -func = ' ' + l1[i].rstrip()[:40] +func = b' ' + l1[i].rstrip() +# split long function name if ASCII. otherwise we have no +# idea where the multi-byte boundary is, so just leave it. +if encoding.isasciistr(func): +func = func[:41] lastfunc[1] = func break # by recording this hunk's starting point as the next place to diff --git a/tests/test-diff-unified.t b/tests/test-diff-unified.t --- a/tests/test-diff-unified.t +++ b/tests/test-diff-unified.t @@ -386,3 +386,73 @@ If [diff] git is set to true, but the us } $ cd .. + +Long function names should be abbreviated, but multi-byte character shouldn't +be broken up + + $ hg init longfunc + $ cd longfunc + + >>> with open('a', 'wb') as f: + ... f.write(b'a' * 39 + b'bb' + b'\n') + ... f.write(b' .\n' * 3) + ... f.write(b' 0 b\n') + ... f.write(b' .\n' * 3) + ... f.write(b'a' * 39 + b'\xc3\xa0' + b'\n') + ... f.write(b' .\n' * 3) + ... f.write(b' 0 a with grave (single code point)\n') + ... f.write(b' .\n' * 3) + ... f.write(b'a' * 39 + b'a\xcc\x80' + b'\n') + ... f.write(b' .\n' * 3) + ... 
f.write(b' 0 a with grave (composition)\n') + ... f.write(b' .\n' * 3) + $ hg ci -qAm0 + + >>> with open('a', 'wb') as f: + ... f.write(b'a' * 39 + b'bb' + b'\n') + ... f.write(b' .\n' * 3) + ... f.write(b' 1 b\n') + ... f.write(b' .\n' * 3) + ... f.write(b'a' * 39 + b'\xc3\xa0' + b'\n') + ... f.write(b' .\n' * 3) + ... f.write(b' 1 a with grave (single code point)\n') + ... f.write(b' .\n' * 3) + ... f.write(b'a' * 39 + b'a\xcc\x80' + b'\n') + ... f.write(b' .\n' * 3) + ... f.write(b' 1 a with grave (composition)\n') + ... f.write(b' .\n' * 3) + $ hg ci -m1 + + $ hg diff -c1 --nodates --show-function + diff -r 3e92dd6fa812 -r a256341606cb a + --- a/a + +++ b/a + @@ -2,7 +2,7 @@ aaab +. +. +. + - 0 b + + 1 b +. +. +. + @@ -10,7 +10,7 @@ aaa\xc3\xa0 (esc) +. +. +. + - 0 a with grave (single code point) + + 1 a with grave (single code point) +. +. +. + @@ -18,7 +18,7 @@ \xcc\x80 (esc) +. +. +. + - 0 a with grave (composition) + + 1 a with grave (composition) +. +. +. + + $ cd .. ___ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel