On Thursday 13 August 2015 19:10:41 Andries E. Brouwer wrote:
> On Thu, Aug 13, 2015 at 05:54:57PM +0200, Tim Ruehsen wrote:
> > I just made up a test case, but can't apply your patch.
> >
> > Please rebase to latest git master and generate your patch with
> > git format-patch and send it as attachment. Thanks.
>
> OK, see attached.
>
> Andries

Based on that, and your proposal about the progress bar, I made up a bunch of
patches. The new test case is not yet ready.
@Andries: Maybe you can put a few more test cases into that (or send me a few
examples that should work). I also would like to see broken UTF-8 sequences in
this test.

@Darshit: Could you have a closer look at the patches, please? Neither Python
nor the progress code is my playground... you are the expert here.

Tim
From 1ae1aeda78d83e570fe7ee5881c7e9caf182e991 Mon Sep 17 00:00:00 2001
From: "Andries E. Brouwer" <[email protected]>
Date: Thu, 13 Aug 2015 19:06:03 +0200
Subject: [PATCH 1/4] Do not escape high control bytes on a UTF-8 system.

---
 src/init.c    | 26 +++++++++++++++++++++++++-
 src/options.h |  1 +
 src/url.c     | 12 +++++++++---
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/init.c b/src/init.c
index ea074cc..6f71de1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -348,6 +348,27 @@ command_by_name (const char *cmdname)
   return -1;
 }

+
+/* Used to determine whether bytes 128-159 are OK in a filename.
+   Returns true iff the first of LC_ALL, LC_CTYPE, LANG that is set
+   names a UTF-8 locale; false if none of them is set.  */
+static int
+have_utf8_locale() {
+#if defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__)
+  /* insert some test for Windows */
+#else
+  const char *p = getenv("LC_ALL");
+  if (p == NULL)
+    p = getenv("LC_CTYPE");
+  if (p == NULL)
+    p = getenv("LANG");
+  if (p != NULL && (strstr(p, "UTF-8") != NULL || strstr(p, "UTF8") != NULL ||
+                    strstr(p, "utf-8") != NULL || strstr(p, "utf8") != NULL))
+    return true;
+#endif
+  return false;
+}
+
 /* Reset the variables to default values.  */
 void
 defaults (void)
@@ -419,6 +440,7 @@ defaults (void)
   opt.restrict_files_os = restrict_unix;
 #endif
   opt.restrict_files_ctrl = true;
+  opt.restrict_files_highctrl = (have_utf8_locale() ? false : true);
   opt.restrict_files_nonascii = false;
   opt.restrict_files_case = restrict_no_case_restriction;

@@ -1487,6 +1509,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
 {
   int restrict_os = opt.restrict_files_os;
   int restrict_ctrl = opt.restrict_files_ctrl;
+  int restrict_highctrl = opt.restrict_files_highctrl;
   int restrict_case = opt.restrict_files_case;
   int restrict_nonascii = opt.restrict_files_nonascii;

@@ -1511,7 +1534,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno
       else if (VAL_IS ("uppercase"))
         restrict_case = restrict_uppercase;
       else if (VAL_IS ("nocontrol"))
-        restrict_ctrl = false;
+        restrict_ctrl = restrict_highctrl = false;
       else if (VAL_IS ("ascii"))
         restrict_nonascii = true;
       else
@@ -1532,6 +1555,7 @@ cmd_spec_restrict_file_names (const char *com, const char *val, void *place_igno

   opt.restrict_files_os = restrict_os;
   opt.restrict_files_ctrl = restrict_ctrl;
+  opt.restrict_files_highctrl = restrict_highctrl;
   opt.restrict_files_case = restrict_case;
   opt.restrict_files_nonascii = restrict_nonascii;

diff --git a/src/options.h b/src/options.h
index 24ddbb5..083d16b 100644
--- a/src/options.h
+++ b/src/options.h
@@ -251,6 +251,7 @@ struct options
   bool restrict_files_ctrl;     /* non-zero if control chars in URLs
                                    are restricted from appearing in
                                    generated file names. */
+  bool restrict_files_highctrl; /* idem for bytes 128-159 */
   bool restrict_files_nonascii; /* non-zero if bytes with values greater
                                    than 127 are restricted. */
   enum {
diff --git a/src/url.c b/src/url.c
index 73c8dd0..e98bfaa 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1348,7 +1348,8 @@ enum {
   filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
   filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
   filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
-  filechr_control     = 8       /* a control character, e.g. 0-31 */
+  filechr_control     = 8,      /* a control character, e.g. 0-31 */
+  filechr_highcontrol = 16      /* a high control character, in 128-159 */
 };

 #define FILE_CHAR_TEST(c, mask) \
@@ -1360,6 +1361,7 @@ enum {
 #define V filechr_not_vms
 #define W filechr_not_windows
 #define C filechr_control
+#define Z filechr_highcontrol

 #define UVWC U|V|W|C
 #define UW U|W
@@ -1392,8 +1394,8 @@ UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
    0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
    0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
-  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 128-143 */
+  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z,  Z, Z, Z, Z, /* 144-159 */
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
   0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

@@ -1406,6 +1408,7 @@ UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
 #undef V
 #undef W
 #undef C
+#undef Z
 #undef UW
 #undef UVWC
 #undef VC
@@ -1448,8 +1451,11 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
     mask = filechr_not_vms;
   else
     mask = filechr_not_windows;
+
   if (opt.restrict_files_ctrl)
     mask |= filechr_control;
+  if (opt.restrict_files_highctrl)
+    mask |= filechr_highcontrol;

   /* Copy [b, e) to PATHEL and URL-unescape it. */
   if (escaped)
--
2.5.0

From 5f58576fb78be7af205ca86ee51362e653ed8772 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Mon, 17 Aug 2015 13:03:25 +0200
Subject: [PATCH 2/4] Fix progress bar for multibyte filenames

* progress.c (create_image): Fix filename padding

Reported-by: "Andries E. Brouwer" <[email protected]>
---
 src/progress.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/progress.c b/src/progress.c
index 61b635d..d97e329 100644
--- a/src/progress.c
+++ b/src/progress.c
@@ -950,10 +950,8 @@ create_image (struct bar_progress *bp, double dl_total_time, bool done)

   if (orig_filename_cols <= MAX_FILENAME_COLS)
     {
-      int padding = MAX_FILENAME_COLS - orig_filename_cols;
-      sprintf (p, "%s ", bp->f_download);
-      p += orig_filename_cols + 1;
-      for (;padding;padding--)
+      p += sprintf (p, "%s ", bp->f_download);
+      while (p < bp->buffer + MAX_FILENAME_COLS)
         *p++ = ' ';
     }
   else
--
2.5.0

From 0f5fa65964bac7a80da6dfd7c32b5503494da76f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Mon, 17 Aug 2015 13:05:53 +0200
Subject: [PATCH 3/4] Test unescaped URL names in python test suite

* testenv/server/http/http_server.py (send_head):
  Check unescaped URLs

This patch allows us to use UTF-8 file names in our python tests.
---
 testenv/server/http/http_server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/testenv/server/http/http_server.py b/testenv/server/http/http_server.py
index 85769c4..40caddb 100644
--- a/testenv/server/http/http_server.py
+++ b/testenv/server/http/http_server.py
@@ -8,6 +8,7 @@ from hashlib import md5
 import threading
 import socket
 import os
+import urllib


 class StoppableHTTPServer(HTTPServer):
@@ -387,7 +388,7 @@ class _Handler(BaseHTTPRequestHandler):
         if self.path == "/":
             path = "index.html"
         else:
-            path = self.path[1:]
+            path = urllib.parse.unquote(self.path[1:])

         self.__log_request(method)

--
2.5.0

From 420b9f3923b4015f233bc22881f6e6e9edde8980 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim Rühsen?= <[email protected]>
Date: Mon, 17 Aug 2015 13:10:42 +0200
Subject: [PATCH 4/4] Add new test testenv/Test-not-escaping.py

---
 testenv/Test-not-escaping.py | 45 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100755 testenv/Test-not-escaping.py

diff --git a/testenv/Test-not-escaping.py b/testenv/Test-not-escaping.py
new file mode 100755
index 0000000..7c48f7f
--- /dev/null
+++ b/testenv/Test-not-escaping.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+from sys import exit
+from test.http_test import HTTPTest
+from misc.wget_file import WgetFile
+
+"""
+    This test ensures that Wget does not escape a valid UTF-8 file name:
+    the server file "Сердце" must be saved locally under that same name.
+"""
+TEST_NAME = "Not Escaping UTF-8 Filenames"
+############# File Definitions ###############################################
+File1 = "Test Contents."
+
+A_File = WgetFile ("Сердце", File1)
+
+WGET_OPTIONS = "-d"
+#WGET_URLS = [["%D0%A1%D0%B5%D1%80%D0%B4%D1%86%D0%B5"]]
+WGET_URLS = [["Сердце"]]
+
+Files = [[A_File]]
+
+ExpectedReturnCode = 0
+ExpectedDownloadedFiles = [WgetFile ("Сердце", File1)]
+
+################ Pre and Post Test Hooks #####################################
+pre_test = {
+    "ServerFiles"       : Files
+}
+test_options = {
+    "WgetCommands"      : WGET_OPTIONS,
+    "Urls"              : WGET_URLS
+}
+post_test = {
+    "ExpectedFiles"     : ExpectedDownloadedFiles,
+    "ExpectedRetcode"   : ExpectedReturnCode
+}
+
+err = HTTPTest (
+                name=TEST_NAME,
+                pre_hook=pre_test,
+                test_params=test_options,
+                post_hook=post_test
+).begin ()
+
+exit (err)
--
2.5.0

Attachment: signature.asc
Description: This is a digitally signed message part.

Reply via email to