Thanks for reminding me about that problem. I've pushed the following patches. The first two are merely tuning and reorganization; the third one does the real work.
>From 37bffc430560df85029b2cacda65893542f0d455 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Wed, 3 Apr 2013 07:48:22 -0700 Subject: [PATCH 1/3] diff: tune compare_names_for_qsort * src/dir.c (compare_collated): New function. (compare_names): Use it. (compare_names_for_qsort): Use it. This is a bit more efficient as it can avoid a double invocation of file_name_cmp when file_name_cmp returns zero. --- src/dir.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/dir.c b/src/dir.c index 7f647b0..fc42f62 100644 --- a/src/dir.c +++ b/src/dir.c @@ -140,28 +140,34 @@ dir_read (struct file_data const *dir, struct dirdata *dirdata) return true; } -/* Compare file names, returning a value compatible with strcmp. */ +/* Compare strings in a locale-specific way, returning a value + compatible with strcmp. */ static int -compare_names (char const *name1, char const *name2) +compare_collated (char const *name1, char const *name2) { - if (locale_specific_sorting) + int r; + errno = 0; + if (ignore_file_name_case) + r = strcasecoll (name1, name2); + else + r = strcoll (name1, name2); + if (errno) { - int r; - errno = 0; - if (ignore_file_name_case) - r = strcasecoll (name1, name2); - else - r = strcoll (name1, name2); - if (errno) - { - error (0, errno, _("cannot compare file names '%s' and '%s'"), - name1, name2); - longjmp (failed_locale_specific_sorting, 1); - } - return r; + error (0, errno, _("cannot compare file names '%s' and '%s'"), + name1, name2); + longjmp (failed_locale_specific_sorting, 1); } + return r; +} + +/* Compare file names, returning a value compatible with strcmp. */ +static int +compare_names (char const *name1, char const *name2) +{ + if (locale_specific_sorting) + return compare_collated (name1, name2); return file_name_cmp (name1, name2); } @@ -173,8 +179,15 @@ compare_names_for_qsort (void const *file1, void const *file2) { char const *const *f1 = file1; char const *const *f2 = file2; - int diff = compare_names (*f1, *f2); - return diff ? diff : file_name_cmp (*f1, *f2); + char const *name1 = *f1; + char const *name2 = *f2; + if (locale_specific_sorting) + { + int diff = compare_collated (name1, name2); + if (diff) + return diff; + } + return file_name_cmp (name1, name2); } /* Compare the contents of two directories named in CMP. -- 1.7.11.7 >From 73482f40100760b276d383ed0a588ce13a3d52b4 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Wed, 3 Apr 2013 07:51:33 -0700 Subject: [PATCH 2/3] diff: remove unnecessary decl * src/dir.c (compare_names_for_qsort): Remove declaration. Not needed now that we assume C89. --- src/dir.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dir.c b/src/dir.c index fc42f62..21b1935 100644 --- a/src/dir.c +++ b/src/dir.c @@ -45,7 +45,6 @@ static bool locale_specific_sorting; static jmp_buf failed_locale_specific_sorting; static bool dir_loop (struct comparison const *, int); -static int compare_names_for_qsort (void const *, void const *); /* Read a directory and get its vector of names. */ -- 1.7.11.7 >From e82f540d1134ba3d30434024e6fc9aea8ec71cf1 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Wed, 3 Apr 2013 08:20:31 -0700 Subject: [PATCH 3/3] diff: fix bug with Asian file names Problem reported by Errembault Philippe in: http://lists.gnu.org/archive/html/bug-diffutils/2013-03/msg00012.html * NEWS: Document this. * src/dir.c (compare_names): Fall back on file_name_cmp if compare_collated returns 0, unless ignoring file name case. (diff_dirs): Don't bother with the O(N**2) stuff unless ignoring file name case. * tests/Makefile.am (TESTS): Add strcoll-0-names. * tests/strcoll-0-names: New file. --- NEWS | 7 +++++++ src/dir.c | 8 ++++++-- tests/Makefile.am | 1 + tests/strcoll-0-names | 25 +++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) create mode 100755 tests/strcoll-0-names diff --git a/NEWS b/NEWS index ac7a75e..79517f2 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,13 @@ GNU diffutils NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + Unless the --ignore-file-name-case option is used, diff now + considers file names to be equal only if they are byte-for-byte + equivalent. This fixes a bug where diff in an English locale might + consider two Asian file names to be the same merely because they + contain no English characters. * Noteworthy changes in release 3.3 (2013-03-24) [stable] diff --git a/src/dir.c b/src/dir.c index 21b1935..d3b0a2d 100644 --- a/src/dir.c +++ b/src/dir.c @@ -166,7 +166,11 @@ static int compare_names (char const *name1, char const *name2) { if (locale_specific_sorting) - return compare_collated (name1, name2); + { + int diff = compare_collated (name1, name2); + if (diff || ignore_file_name_case) + return diff; + } return file_name_cmp (name1, name2); } @@ -271,7 +275,7 @@ diff_dirs (struct comparison const *cmp, O(N**2), where N is the number of names in a directory that compare_names says are all equal, but in practice N is so small it's not worth tuning. */ - if (nameorder == 0) + if (nameorder == 0 && ignore_file_name_case) { int raw_order = file_name_cmp (*names[0], *names[1]); if (raw_order != 0) diff --git a/tests/Makefile.am b/tests/Makefile.am index 5cbcfb4..dd2d514 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -12,6 +12,7 @@ TESTS = \ no-dereference \ no-newline-at-eof \ stdin \ + strcoll-0-names \ filename-quoting EXTRA_DIST = \ diff --git a/tests/strcoll-0-names b/tests/strcoll-0-names new file mode 100755 index 0000000..33c4a3c --- /dev/null +++ b/tests/strcoll-0-names @@ -0,0 +1,25 @@ +#!/bin/sh +# Check that diff responds well with two different file names +# that compare equal with strcoll. See: +# http://lists.gnu.org/archive/html/bug-diffutils/2013-03/msg00012.html + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +# These two names compare equal in the en_US.UTF-8 locale +# in current (2013) versions of glibc. +# On systems where the names do not compare equal, +# this diff test should still do the right thing. +LC_ALL=en_US.UTF-8 +export LC_ALL +name1='エンドカード1' +name2='ブックレット1' + +mkdir d1 d2 || fail=1 +echo x >d1/"$name1" || fail=1 +echo x >d2/"$name2" || fail=1 + +# This should report a difference, but on the affected systems +# diffutils 3.3 does not. +diff d1 d2 && fail=1 + +Exit $fail -- 1.7.11.7
