On Thu, Aug 13, 2015 at 05:54:57PM +0200, Tim Ruehsen wrote:

> I just made up a test case, but can't apply your patch.
> 
> Please rebase to latest git master and generate your patch with
> git format-patch and send it as attachment. Thanks.

OK, see attached.

Andries
>From 5980a3665d8924c7d2374f0740bb82ff0cdc9043 Mon Sep 17 00:00:00 2001
From: "Andries E. Brouwer" <[email protected]>
Date: Thu, 13 Aug 2015 19:06:03 +0200
Subject: [PATCH] Do not escape high control bytes on a UTF-8 system.

---
 src/init.c    | 26 +++++++++++++++++++++++++-
 src/options.h |  1 +
 src/url.c     | 12 +++++++++---
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/init.c b/src/init.c
index ea074cc..6f71de1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -348,6 +348,27 @@ command_by_name (const char *cmdname)
   return -1;
 }
 
+/* Used to determine whether bytes 128-159 are OK in a filename: returns true
+   when the first non-empty of LC_ALL, LC_CTYPE, LANG names a UTF-8 locale.  */
+static int
+have_utf8_locale (void)
+{
+#if defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__)
+  /* insert some test for Windows */
+#else
+  const char *p = getenv ("LC_ALL");
+  if (p == NULL || *p == '\0')
+    p = getenv ("LC_CTYPE");
+  if (p == NULL || *p == '\0')
+    p = getenv ("LANG");
+  /* p is NULL when none of them is set; strstr (NULL, ...) is undefined.  */
+  if (p != NULL && (strstr (p, "UTF-8") || strstr (p, "UTF8")
+                    || strstr (p, "utf-8") || strstr (p, "utf8")))
+    return true;
+#endif
+  return false;
+}
+
 /* Reset the variables to default values.  */
 void
 defaults (void)
@@ -419,6 +440,7 @@ defaults (void)
   opt.restrict_files_os = restrict_unix;
 #endif
   opt.restrict_files_ctrl = true;
+  opt.restrict_files_highctrl = (have_utf8_locale() ? false : true);
   opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;
 
@@ -1487,6 +1509,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
 {
   int restrict_os = opt.restrict_files_os;
   int restrict_ctrl = opt.restrict_files_ctrl;
+  int restrict_highctrl = opt.restrict_files_highctrl;
   int restrict_case = opt.restrict_files_case;
   int restrict_nonascii = opt.restrict_files_nonascii;
 
@@ -1511,7 +1534,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
       else if (VAL_IS ("uppercase"))
         restrict_case = restrict_uppercase;
       else if (VAL_IS ("nocontrol"))
-        restrict_ctrl = false;
+        restrict_ctrl = restrict_highctrl = false;
       else if (VAL_IS ("ascii"))
         restrict_nonascii = true;
       else
@@ -1532,6 +1555,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
 
   opt.restrict_files_os = restrict_os;
   opt.restrict_files_ctrl = restrict_ctrl;
+  opt.restrict_files_highctrl = restrict_highctrl;
   opt.restrict_files_case = restrict_case;
   opt.restrict_files_nonascii = restrict_nonascii;
 
diff --git a/src/options.h b/src/options.h
index 24ddbb5..083d16b 100644
--- a/src/options.h
+++ b/src/options.h
@@ -251,6 +251,7 @@ struct options
   bool restrict_files_ctrl;     /* non-zero if control chars in URLs
                                    are restricted from appearing in
                                    generated file names. */
+  bool restrict_files_highctrl; /* idem for bytes 128-159 */
   bool restrict_files_nonascii; /* non-zero if bytes with values greater
                                    than 127 are restricted. */
   enum {
diff --git a/src/url.c b/src/url.c
index 73c8dd0..e98bfaa 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1348,7 +1348,8 @@ enum {
   filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
   filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
   filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
-  filechr_control     = 8       /* a control character, e.g. 0-31 */
+  filechr_control     = 8,      /* a control character, e.g. 0-31 */
+  filechr_highcontrol = 16      /* a high control character, in 128-159 */
 };
 
 #define FILE_CHAR_TEST(c, mask) \
@@ -1360,6 +1361,7 @@ enum {
 #define V filechr_not_vms
 #define W filechr_not_windows
 #define C filechr_control
+#define Z filechr_highcontrol
 
 #define UVWC U|V|W|C
 #define UW U|W
@@ -1392,8 +1394,8 @@ UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
    0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
    0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
 
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 128-143 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 144-159 */
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
 
@@ -1406,6 +1408,7 @@ UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 #undef V
 #undef W
 #undef C
+#undef Z
 #undef UW
 #undef UVWC
 #undef VC
@@ -1448,8 +1451,11 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
     mask = filechr_not_vms;
   else
     mask = filechr_not_windows;
+
   if (opt.restrict_files_ctrl)
     mask |= filechr_control;
+  if (opt.restrict_files_highctrl)
+    mask |= filechr_highcontrol;
 
   /* Copy [b, e) to PATHEL and URL-unescape it. */
   if (escaped)
-- 
1.9.1

Reply via email to