ping

On Oct 11 15:37:03, schwa...@usta.de wrote:
> Jan Stary wrote on Tue, Oct 11, 2016 at 11:26:50AM +0200:
> 
> > Feeling encouraged by Ingo's ok to remove locale from cp/rm,
> > here's a diff that removes the locale stuff we don't actually do
> > from the code and documentation of sort(1). Leave just LC_CTYPE
> 
> You removed that too, but that's OK because the code doesn't
> currently use it.  Right now, even though some wide character
> code is there, it always operates in narrow mode.
> 
> > which determines isblank() and case conversions.
> 
> LC_CTYPE does NOT affect isblank(), all the wide-character-related
> functions are in dead code as far as i can see, and OpenBSD does
> not support wide character case conversions.
> 
> 
> Your diff is related to more than one thing:
> 
>  1. Kill LC_NUMERIC support.
>     I very strongly agree with that.
>     I think we should never support that anywhere.
>     It is an absolutely terrible idea no matter how you look at it.
>     Your removal was incomplete, though.
>     In the patch below, i delete some additional bits
>     that you left behind.
> 
>  2. Remove LC_COLLATE support for now.
>     I also agree with that, even if not quite as strongly.
>     In the distant future, we might or might not want LC_COLLATE support.
>     In Belgrade, i talked to Baptiste Daroussin who just implemented
>     LC_COLLATE in FreeBSD libc and who was utterly scared by the complexity.
>     Knowing ourselves, we will be scared even more once we get there.
>     So it will definitely not happen quickly.
> 
>     If it ever does, the trivial bits that are present right now
>     are so tiny that they won't help at all.  It might even be
>     easier to have clean earth to till when starting.
>     Who knows at this point.
> 
>     In any case, deleting the incomplete pieces scattered around
>     the current code makes maintenance easier until that time,
>     which seems worthwhile.
> 
>  3. With step 2 above, some flags become explicitly const
>     that appear to be variable now (but actually aren't).
>     That would allow deleting large amounts of dead code,
>     but i didn't add that to the patch below.
> 
>     The grep multibyte code is absolutely horrible, with insanity
>     up to and including unions of char and wchar_t coming with
>     hosts of trivial non-standard wrapper functions and tons
>     of duplicate, in some case triplicate code.  Another
>     example:  Even though the code half-heartedly attempts to
>     isolate multibyte stuff in bwstring.{c,h} - superficially
>     reminding of our utf8.c technique, but half-hearted because
>     the respective functions are numerous and called all over the
>     place, such that isolation is effectively a failure -
>     the main program (!) at one place abuses btowc(3) (!)
>     on an ASCII (!) command line argument and stores the result
>     as wchar_t in the top level global state structure (!) -
>     that is obviously functionally completely futile, but very
>     effective for polluting *all* of the code with complicated
>     data types and headers.
> 
>     But cleaning up all the parts of item 3 would be way too intrusive
>     for this patch, so it leaves behind hundreds of lines of code
>     that is already both dead and duplicate right now.
> 
> > Annotate a missed -z flag while there,
> 
> Committed.
> 
> > and change /var/tmp to /tmp.
> 
> No, according to file.c, the program still writes to /var/tmp,
> not to /tmp.  Before changing that in the manual, we would have
> to change it in the code.
> 
> 
> The following version of the patch survives our test suite,
> but it no doubt needs more testing.
> 
> Yours,
>   Ingo
> 
> 
> Index: bwstring.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/bwstring.c,v
> retrieving revision 1.7
> diff -u -p -r1.7 bwstring.c
> --- bwstring.c        1 Apr 2015 22:38:08 -0000       1.7
> +++ bwstring.c        11 Oct 2016 13:20:59 -0000
> @@ -40,8 +40,8 @@
>  #include "bwstring.h"
>  #include "sort.h"
>  
> -bool byte_sort;
> -size_t sort_mb_cur_max = 1;
> +static const bool byte_sort = true;
> +const size_t sort_mb_cur_max = 1;
>  
>  static wchar_t **wmonths;
>  static char **cmonths;
> Index: bwstring.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/bwstring.h,v
> retrieving revision 1.2
> diff -u -p -r1.2 bwstring.h
> --- bwstring.h        31 Dec 2015 16:09:31 -0000      1.2
> +++ bwstring.h        11 Oct 2016 13:20:59 -0000
> @@ -37,8 +37,7 @@
>  
>  #include "mem.h"
>  
> -extern bool byte_sort;
> -extern size_t sort_mb_cur_max;
> +extern const size_t sort_mb_cur_max;
>  
>  /* wchar_t is of 4 bytes: */
>  #define      SIZEOF_WCHAR_STRING(LEN) ((LEN)*sizeof(wchar_t))
> Index: coll.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/coll.c,v
> retrieving revision 1.11
> diff -u -p -r1.11 coll.c
> --- coll.c    11 Dec 2015 21:41:51 -0000      1.11
> +++ coll.c    11 Oct 2016 13:20:59 -0000
> @@ -46,12 +46,6 @@
>  struct key_specs *keys;
>  size_t keys_num = 0;
>  
> -wint_t symbol_decimal_point = L'.';
> -/* there is no default thousands separator in collate rules: */
> -wint_t symbol_thousands_sep = 0;
> -wint_t symbol_negative_sign = L'-';
> -wint_t symbol_positive_sign = L'+';
> -
>  static int wstrcoll(struct key_value *kv1, struct key_value *kv2, size_t 
> offset);
>  static int gnumcoll(struct key_value*, struct key_value *, size_t offset);
>  static int monthcoll(struct key_value*, struct key_value *, size_t offset);
> @@ -701,7 +695,7 @@ read_number(struct bwstring *s0, int *si
>       while (iswblank(bws_get_iter_value(s)))
>               s = bws_iterator_inc(s, 1);
>  
> -     if (bws_get_iter_value(s) == (wchar_t)symbol_negative_sign) {
> +     if (bws_get_iter_value(s) == L'-') {
>               *sign = -1;
>               s = bws_iterator_inc(s, 1);
>       }
> @@ -716,16 +710,13 @@ read_number(struct bwstring *s0, int *si
>                       smain[*main_len] = bws_get_iter_value(s);
>                       s = bws_iterator_inc(s, 1);
>                       *main_len += 1;
> -             } else if (symbol_thousands_sep &&
> -                 (bws_get_iter_value(s) == (wchar_t)symbol_thousands_sep))
> -                     s = bws_iterator_inc(s, 1);
> -             else
> +             } else
>                       break;
>       }
>  
>       smain[*main_len] = 0;
>  
> -     if (bws_get_iter_value(s) == (wchar_t)symbol_decimal_point) {
> +     if (bws_get_iter_value(s) == L'.') {
>               s = bws_iterator_inc(s, 1);
>               while (iswdigit(bws_get_iter_value(s)) &&
>                   *frac_len < MAX_NUM_SIZE) {
> Index: coll.h
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/coll.h,v
> retrieving revision 1.1
> diff -u -p -r1.1 coll.h
> --- coll.h    17 Mar 2015 17:45:13 -0000      1.1
> +++ coll.h    11 Oct 2016 13:20:59 -0000
> @@ -123,14 +123,6 @@ typedef int (*listcoll_t)(struct sort_li
>  extern struct key_specs *keys;
>  extern size_t keys_num;
>  
> -/*
> - * Main localised symbols. These must be wint_t as they may hold WEOF.
> - */
> -extern wint_t symbol_decimal_point;
> -extern wint_t symbol_thousands_sep;
> -extern wint_t symbol_negative_sign;
> -extern wint_t symbol_positive_sign;
> -
>  /* funcs */
>  
>  cmpcoll_t get_sort_func(struct sort_mods *sm);
> Index: file.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/file.c,v
> retrieving revision 1.20
> diff -u -p -r1.20 file.c
> --- file.c    3 Apr 2015 10:37:24 -0000       1.20
> +++ file.c    11 Oct 2016 13:20:59 -0000
> @@ -1078,7 +1078,7 @@ sort_list_to_file(struct sort_list *list
>  
>       if (!sm->Mflag && !sm->Rflag && !sm->Vflag &&
>           !sm->gflag && !sm->hflag && !sm->nflag) {
> -             if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort)
> +             if (sort_opts_vals.sort_method == SORT_DEFAULT)
>                       sort_opts_vals.sort_method = SORT_RADIXSORT;
>  
>       } else if (sort_opts_vals.sort_method == SORT_RADIXSORT)
> Index: sort.1
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.1,v
> retrieving revision 1.55
> diff -u -p -r1.55 sort.1
> --- sort.1    11 Oct 2016 13:08:20 -0000      1.55
> +++ sort.1    11 Oct 2016 13:20:59 -0000
> @@ -60,9 +60,8 @@ option
>  A record can contain any printable or unprintable characters.
>  Comparisons are based on one or more sort keys extracted from
>  each line of input, and are performed lexicographically,
> -according to the current locale's collating rules and the
> -specified command-line options that can tune the actual
> -sorting behavior.
> +according to the specified command-line options
> +that can tune the actual sorting behavior.
>  By default, if keys are not given,
>  .Nm
>  uses entire lines for comparison.
> @@ -177,10 +176,6 @@ Unknown strings are considered smaller t
>  .It Fl n , Fl Fl numeric-sort, Fl Fl sort=numeric
>  An initial numeric string, consisting of optional blank space, optional
>  minus sign, and zero or more digits (including decimal point)
> -.\" with
> -.\" optional radix character and thousands
> -.\" separator
> -.\" (as defined in the current locale),
>  is sorted by arithmetic value.
>  Leading blank characters are ignored.
>  .It Fl R, Fl Fl random-sort, Fl Fl sort=random
> @@ -205,7 +200,6 @@ The files are compared by their prefixes
>  zeros are ignored in version numbers, see example below).
>  If an input string does not match the pattern, then it is compared
>  using the byte compare function.
> -All string comparisons are performed in the C locale.
>  .Pp
>  For example:
>  .Bd -literal -offset indent
> @@ -498,43 +492,6 @@ which has no
>  equivalent.
>  .Sh ENVIRONMENT
>  .Bl -tag -width Fl
> -.It Ev GNUSORT_NUMERIC_COMPATIBILITY
> -If defined
> -.Fl t
> -will not override the locale numeric symbols, that is, thousand
> -separators and decimal separators.
> -By default, if we specify
> -.Fl t
> -with the same symbol as the thousand separator or decimal point,
> -the symbol will be treated as the field separator.
> -Older behavior was less definite: the symbol was treated as both field
> -separator and numeric separator, simultaneously.
> -This environment variable enables the old behavior.
> -.It Ev LANG
> -Used as a last resort to determine different kinds of locale-specific
> -behavior if neither the respective environment variable nor
> -.Ev LC_ALL
> -are set.
> -.It Ev LC_ALL
> -Locale settings that override all of the other locale settings.
> -This environment variable can be used to set all these settings
> -to the same value at once.
> -.It Ev LC_COLLATE
> -Locale settings to be used to determine the collation for
> -sorting records.
> -.It Ev LC_CTYPE
> -Locale settings to be used to case conversion and classification
> -of characters, that is, which characters are considered
> -whitespaces, etc.
> -.It Ev LC_MESSAGES
> -Locale settings that determine the language of output messages
> -that
> -.Nm
> -prints out.
> -.It Ev LC_NUMERIC
> -Locale settings that determine the number format used in numeric sort.
> -.It Ev LC_TIME
> -Locale settings that determine the month format used in month sort.
>  .It Ev TMPDIR
>  Path to the directory in which temporary files will be stored.
>  Note that
> @@ -578,7 +535,10 @@ The
>  .Nm
>  utility is compliant with the
>  .St -p1003.1-2008
> -specification.
> +specification, except that it ignores the user's
> +.Xr locale 1
> +and always assumes
> +.Ev LC_ALL Ns =C.
>  .Pp
>  The flags
>  .Op Fl gHhiMRSsTVz
> @@ -628,13 +588,10 @@ This implementation of
>  has no limits on input line length (other than imposed by available
>  memory) or any restrictions on bytes allowed within lines.
>  .Pp
> -The performance depends highly on locale settings,
> +The performance depends highly on
>  efficient choice of sort keys and key complexity.
> -The fastest sort is with the C locale, on whole lines, with option
> +The fastest sort is on whole lines, with option
>  .Fl s .
> -In general, the C locale is the fastest, followed by single-byte
> -locales with multi-byte locales being the slowest.
> -The correct collation order respected in all cases.
>  For the key specification, the simpler to process the
>  lines the faster the search will be.
>  .Pp
> Index: sort.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sort/sort.c,v
> retrieving revision 1.86
> diff -u -p -r1.86 sort.c
> --- sort.c    14 Jul 2016 08:31:18 -0000      1.86
> +++ sort.c    11 Oct 2016 13:20:59 -0000
> @@ -36,7 +36,6 @@
>  #include <errno.h>
>  #include <getopt.h>
>  #include <limits.h>
> -#include <locale.h>
>  #include <md5.h>
>  #include <regex.h>
>  #include <signal.h>
> @@ -70,8 +69,6 @@ struct sort_opts sort_opts_vals;
>  bool debug_sort;
>  bool need_hint;
>  
> -static bool gnusort_numeric_compatibility;
> -
>  static struct sort_mods default_sort_mods_object;
>  struct sort_mods * const default_sort_mods = &default_sort_mods_object;
>  
> @@ -239,68 +236,6 @@ set_hw_params(void)
>  }
>  
>  /*
> - * Convert "plain" symbol to wide symbol, with default value.
> - */
> -static void
> -conv_mbtowc(wchar_t *wc, const char *c, const wchar_t def)
> -{
> -     int res;
> -
> -     res = mbtowc(wc, c, MB_CUR_MAX);
> -     if (res < 1)
> -             *wc = def;
> -}
> -
> -/*
> - * Set current locale symbols.
> - */
> -static void
> -set_locale(void)
> -{
> -     struct lconv *lc;
> -     const char *locale;
> -
> -     setlocale(LC_ALL, "");
> -
> -     /* Obtain LC_NUMERIC info */
> -     lc = localeconv();
> -
> -     /* Convert to wide char form */
> -     conv_mbtowc(&symbol_decimal_point, lc->decimal_point,
> -         symbol_decimal_point);
> -     conv_mbtowc(&symbol_thousands_sep, lc->thousands_sep,
> -         symbol_thousands_sep);
> -     conv_mbtowc(&symbol_positive_sign, lc->positive_sign,
> -         symbol_positive_sign);
> -     conv_mbtowc(&symbol_negative_sign, lc->negative_sign,
> -         symbol_negative_sign);
> -
> -     if (getenv("GNUSORT_NUMERIC_COMPATIBILITY"))
> -             gnusort_numeric_compatibility = true;
> -
> -     locale = setlocale(LC_COLLATE, NULL);
> -     if (locale != NULL) {
> -             char *tmpl;
> -             const char *byteclocale;
> -
> -             tmpl = sort_strdup(locale);
> -             byteclocale = setlocale(LC_COLLATE, "C");
> -             if (byteclocale && strcmp(byteclocale, tmpl) == 0) {
> -                     byte_sort = true;
> -             } else {
> -                     byteclocale = setlocale(LC_COLLATE, "POSIX");
> -                     if (byteclocale && strcmp(byteclocale, tmpl) == 0)
> -                             byte_sort = true;
> -                     else
> -                             setlocale(LC_COLLATE, tmpl);
> -             }
> -             sort_free(tmpl);
> -     }
> -     if (!byte_sort)
> -             sort_mb_cur_max = MB_CUR_MAX;
> -}
> -
> -/*
>   * Set directory temporary files.
>   */
>  static void
> @@ -883,7 +818,6 @@ main(int argc, char *argv[])
>  
>       atexit(clear_tmp_files);
>  
> -     set_locale();
>       set_tmpdir();
>       set_sort_opts();
>  
> @@ -963,16 +897,6 @@ main(int argc, char *argv[])
>                                       errno = EINVAL;
>                                       err(2, NULL);
>                               }
> -                             if (!gnusort_numeric_compatibility) {
> -                                     if (symbol_decimal_point == 
> sort_opts_vals.field_sep)
> -                                             symbol_decimal_point = WEOF;
> -                                     if (symbol_thousands_sep == 
> sort_opts_vals.field_sep)
> -                                             symbol_thousands_sep = WEOF;
> -                                     if (symbol_negative_sign == 
> sort_opts_vals.field_sep)
> -                                             symbol_negative_sign = WEOF;
> -                                     if (symbol_positive_sign == 
> sort_opts_vals.field_sep)
> -                                             symbol_positive_sign = WEOF;
> -                             }
>                               break;
>                       case 'u':
>                               sort_opts_vals.uflag = true;
> @@ -1163,18 +1087,6 @@ main(int argc, char *argv[])
>       if (debug_sort) {
>               printf("Memory to be used for sorting: %llu\n",
>                   available_free_memory);
> -             printf("Using collate rules of %s locale\n",
> -                 setlocale(LC_COLLATE, NULL));
> -             if (byte_sort)
> -                     printf("Byte sort is used\n");
> -             if (print_symbols_on_debug) {
> -                     printf("Decimal Point: <%lc>\n", symbol_decimal_point);
> -                     if (symbol_thousands_sep)
> -                             printf("Thousands separator: <%lc>\n",
> -                                 symbol_thousands_sep);
> -                     printf("Positive sign: <%lc>\n", symbol_positive_sign);
> -                     printf("Negative sign: <%lc>\n", symbol_negative_sign);
> -             }
>       }
>  
>       if (sort_opts_vals.cflag)
> 

Reply via email to