Re: [Bug-wget] Gzip Content-Encoding Patches

2017-08-04 Thread Tim Schlueter
On 08/04/2017 07:46 AM, Tim Rühsen wrote:
> Hi Tim,
> 
> sorry for the delay.
> 
> 
> I just pushed your patches. Thanks again for your work!
> 
> 
> With Best Regards, Tim
> 

Hi Tim,

I'm glad to hear that.

I've started working on adding some automated tests for gzip
compression, but it will be a little while before they are ready.

Thanks,
Tim



signature.asc
Description: OpenPGP digital signature


[Bug-wget] Gzip Content-Encoding Patches

2017-07-31 Thread Tim Schlueter
Hi,

Please see the attached patches which add automatic gzip decompression
for HTTP files with the Content-Encoding response header set correctly.

It also adjusts a downloaded file's extension for br, compress, and
deflate Content-Encodings.

Since the first patch set:
* doc/wget.texi has been updated to reflect the changes in the patches.
* Commit messages have been changed to be in the GNU change log style.
* The patches are attached to this email instead of being in the body.

I have not yet had a chance to look at what would be involved to add
automated tests for this patch set.

Regards,
Tim
From cbdd976dea6289a1f167c2b50cc1d4b1ff878686 Mon Sep 17 00:00:00 2001
From: Tim Schlueter 
Date: Mon, 24 Jul 2017 20:39:24 -0700
Subject: [PATCH 1/3] Adjust Extension based on Content-Encoding

* doc/wget.texi (--adjust-extension, adjust_extension): Updated documentation.
* src/http.c (encoding_t): New enum.
(struct http_stat): Add local_encoding field.
(gethttp): --adjust-extension based on Content-Encoding.
---
 doc/wget.texi | 10 +--
 src/http.c| 90 +++
 2 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/doc/wget.texi b/doc/wget.texi
index 6453c35..e582d4f 100644
--- a/doc/wget.texi
+++ b/doc/wget.texi
@@ -1346,6 +1346,11 @@ renamed from @samp{--html-extension}, to better reflect its new
 behavior. The old option name is still acceptable, but should now be
 considered deprecated.
 
+As of version 1.20, Wget will also ensure that any downloaded files with
+a @code{Content-Encoding} of @samp{br}, @samp{compress}, @samp{deflate}
+or @samp{gzip} end in the suffix @samp{.br}, @samp{.Z}, @samp{.zlib}
+and @samp{.gz} respectively.
+
 At some point in the future, this option may well be expanded to
 include suffixes for other types of content, including content types
 that are not parsed by Wget.
@@ -3365,8 +3370,9 @@ Define a header for HTTP downloads, like using
 
 @item adjust_extension = on/off
 Add a @samp{.html} extension to @samp{text/html} or
-@samp{application/xhtml+xml} files that lack one, or a @samp{.css}
-extension to @samp{text/css} files that lack one, like
+@samp{application/xhtml+xml} files that lack one, a @samp{.css}
+extension to @samp{text/css} files that lack one, and a @samp{.br},
+@samp{.Z}, @samp{.zlib} or @samp{.gz} to compressed files like
 @samp{-E}. Previously named @samp{html_extension} (still acceptable,
 but deprecated).
 
diff --git a/src/http.c b/src/http.c
index f5d9caf..a8c6e18 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1539,6 +1539,16 @@ persistent_available_p (const char *host, int port, bool ssl,
   fd = -1;  \
 } while (0)
 
+typedef enum
+{
+  ENC_INVALID = -1, /* invalid encoding */
+  ENC_NONE = 0, /* no special encoding */
+  ENC_GZIP, /* gzip compression */
+  ENC_DEFLATE,  /* deflate compression */
+  ENC_COMPRESS, /* compress compression */
+  ENC_BROTLI/* brotli compression */
+} encoding_t;
+
 struct http_stat
 {
   wgint len;/* received length */
@@ -1569,6 +1579,9 @@ struct http_stat
 #ifdef HAVE_METALINK
   metalink_t *metalink;
 #endif
+
+  encoding_t local_encoding;/* the encoding of the local file */
+
   bool temporary;   /* downloading a temporary file */
 };
 
@@ -3189,6 +3202,7 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
   xfree (hs->remote_time);
   hs->error = NULL;
   hs->message = NULL;
+  hs->local_encoding = ENC_NONE;
 
   conn = u;
 
@@ -3639,6 +3653,49 @@ gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
 }
 }
 
+  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof (hdrval)))
+{
+  hs->local_encoding = ENC_INVALID;
+
+  switch (hdrval[0])
+{
+case 'b': case 'B':
+  if (0 == c_strcasecmp(hdrval, "br"))
+hs->local_encoding = ENC_BROTLI;
+  break;
+case 'c': case 'C':
+  if (0 == c_strcasecmp(hdrval, "compress"))
+hs->local_encoding = ENC_COMPRESS;
+  break;
+case 'd': case 'D':
+  if (0 == c_strcasecmp(hdrval, "deflate"))
+hs->local_encoding = ENC_DEFLATE;
+  break;
+case 'g': case 'G':
+  if (0 == c_strcasecmp(hdrval, "gzip"))
+hs->local_encoding = ENC_GZIP;
+  break;
+case 'i': case 'I':
+  if (0 == c_strcasecmp(hdrval, "identity"))
+hs->local_encoding = ENC_NONE;
+  break;
+case 'x': case 'X':
+  if (0 == c_strcasecmp(hdrval, "x-compress"))
+hs->local_enco

[Bug-wget] [PATCH 3/3] Add on the fly gzip decompression support

2017-07-28 Thread Tim Schlueter
Add on the fly gzip decompression support to wget1.

If the --compression=none argument is given to wget, gzip decompression
is disabled, and wget will function as it has before this patch. The
"Accept-Encoding: identity" header will be sent (unless overridden by
--header) and any compressed response from the remote server will be
written out as-is to the appropriate file.  E.g. index.html without the
-E argument, index.html.gz with -E.

If --compression=auto is given (or no --compression argument is given)
and --continue or --start-pos are given, compression will be disabled
for backwards compatibility.

Otherwise, --compression=auto (or no --compression argument) functions
the same as --compression=gzip.

If the --compression=gzip argument is given, the --continue and
--start-pos arguments will be ignored and a warning will be printed if
they are present, see patch 2. The "Accept-Encoding: gzip" header will
be sent with the request (unless overridden). If the Content-Encoding
response header is set to "gzip" or "x-gzip", wget will decompress the
data on the fly and store them in the appropriate file (and -E will NOT
append .gz).

Incorrect server responses with the Content-Type set to */gzip will
NOT be decompressed and will instead be written out as-is (-E will still
ensure it has a .gz extension however).

---
 src/http.c |  39 -
 src/retr.c | 143 --
 src/retr.h |   4 +-
 3 files changed, 180 insertions(+), 6 deletions(-)

diff --git a/src/http.c b/src/http.c
index a8c6e18..08b2ed6 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1581,6 +1581,7 @@ struct http_stat
 #endif

   encoding_t local_encoding;/* the encoding of the local file */
+  encoding_t remote_encoding;   /* the encoding of the remote file */

   bool temporary;   /* downloading a temporary file */
 };
@@ -1693,6 +1694,9 @@ read_response_body (struct http_stat *hs, int
sock, FILE *fp, wgint contlen,
   if (chunked_transfer_encoding)
 flags |= rb_chunked_transfer_encoding;

+  if (hs->remote_encoding == ENC_GZIP)
+flags |= rb_compressed_gzip;
+
   hs->len = hs->restval;
   hs->rd_size = 0;
   /* Download the response body and write it to fp.
@@ -1886,7 +1890,12 @@ initialize_request (const struct url *u, struct
http_stat *hs, int *dt, struct u
 rel_value);
   SET_USER_AGENT (req);
   request_set_header (req, "Accept", "*/*", rel_none);
-  request_set_header (req, "Accept-Encoding", "identity", rel_none);
+#ifdef HAVE_LIBZ
+  if (opt.compression != compression_none)
+request_set_header (req, "Accept-Encoding", "gzip", rel_none);
+  else
+#endif
+request_set_header (req, "Accept-Encoding", "identity", rel_none);

   /* Find the username with priority */
   if (u->user)
@@ -3203,6 +3212,7 @@ gethttp (const struct url *u, struct url
*original_url, struct http_stat *hs,
   hs->error = NULL;
   hs->message = NULL;
   hs->local_encoding = ENC_NONE;
+  hs->remote_encoding = ENC_NONE;

   conn = u;

@@ -3694,6 +3704,30 @@ gethttp (const struct url *u, struct url
*original_url, struct http_stat *hs,
   DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval));
   hs->local_encoding = ENC_NONE;
 }
+#ifdef HAVE_LIBZ
+  else if (hs->local_encoding == ENC_GZIP
+   && opt.compression != compression_none)
+{
+  /* Make sure the Content-Type is not gzip before decompressing */
+  const char * p = strchr (type, '/');
+  if (p == NULL)
+{
+  hs->remote_encoding = ENC_GZIP;
+  hs->local_encoding = ENC_NONE;
+}
+  else
+{
+  p++;
+  if (c_tolower(p[0]) == 'x' && p[1] == '-')
+p += 2;
+  if (0 != c_strcasecmp (p, "gzip"))
+{
+  hs->remote_encoding = ENC_GZIP;
+  hs->local_encoding = ENC_NONE;
+}
+}
+}
+#endif
 }

   /* 20x responses are counted among successful by default.  */
@@ -3930,6 +3964,9 @@ gethttp (const struct url *u, struct url
*original_url, struct http_stat *hs,
 }
   if (contlen == -1)
 hs->contlen = -1;
+  /* If the response is gzipped, the uncompressed size is unknown. */
+  else if (hs->remote_encoding == ENC_GZIP)
+hs->contlen = -1;
   else
 hs->contlen = contlen + contrange;

diff --git a/src/retr.c b/src/retr.c
index 0cf438e..a27d58a 100644
--- a/src/retr.c
+++ b/src/retr.c
@@ -41,6 +41,10 @@ as that of the covered work.  */
 # include /* For delete(). */
 #endif

+#ifdef HAVE_LIBZ
+# include 
+#endif
+
 #include "exits.h"
 #include "utils.h"
 #include "retr.h"
@@ -84,6 +88,22 @@ limit_bandwidth_reset (void)
   xzero (limit_data);
 }

+#ifdef HAVE_LIBZ
+static voidpf
+zalloc (voidpf opaque, unsigned int items, unsigned int size)
+{
+  (void) opaque;
+  return (voidpf) xcalloc (items, size);
+}
+
+static void
+zfree (vo

[Bug-wget] [PATCH 2/3] Add --compression argument

2017-07-28 Thread Tim Schlueter
Adds a --compression argument for later use. It currently has 3 possible
values: auto, gzip, or none.

Since for the time being, I'm only planning on adding gzip support via
zlib, the compression option is disabled when wget is built without zlib.

It is worth mentioning that --continue and --start-pos will not work
with on the fly decompression of stream compression algorithms like gzip
that do not have a way to translate a compressed size to an uncompressed
size and vice versa.

If a web server only serves gzipped content, on the fly gzip
decompression is enabled, and the download is interrupted, there is no
way to continue the download from where it left off.

Conversely, in such a scenario if gzip decompression is disabled so that
the compressed data is being stored locally, the gzipped download can be
continued in some cases (depends on the server).

---
 src/init.c| 29 +
 src/main.c| 27 +++
 src/options.h |  8 
 3 files changed, 64 insertions(+)

diff --git a/src/init.c b/src/init.c
index 5f4eefa..1064883 100644
--- a/src/init.c
+++ b/src/init.c
@@ -99,6 +99,9 @@ CMD_DECLARE (cmd_vector);

 CMD_DECLARE (cmd_use_askpass);

+#ifdef HAVE_LIBZ
+CMD_DECLARE (cmd_spec_compression);
+#endif
 CMD_DECLARE (cmd_spec_dirstruct);
 CMD_DECLARE (cmd_spec_header);
 CMD_DECLARE (cmd_spec_warc_header);
@@ -161,6 +164,9 @@ static const struct {
   { "checkcertificate", &opt.check_cert,cmd_check_cert },
 #endif
   { "chooseconfig", &opt.choose_config, cmd_file },
+#ifdef HAVE_LIBZ
+  { "compression",  &opt.compression,   cmd_spec_compression },
+#endif
   { "connecttimeout",   &opt.connect_timeout,   cmd_time },
   { "contentdisposition", &opt.content_disposition, cmd_boolean },
   { "contentonerror",   &opt.content_on_error,  cmd_boolean },
@@ -445,6 +451,10 @@ defaults (void)
   opt.ftps_clear_data_connection = false;
 #endif

+#ifdef HAVE_LIBZ
+  opt.compression = compression_auto;
+#endif
+
   /* The default for file name restriction defaults to the OS type. */
 #if defined(WINDOWS) || defined(MSDOS) || defined(__CYGWIN__)
   opt.restrict_files_os = restrict_windows;
@@ -1445,6 +1455,25 @@ cmd_cert_type (const char *com, const char *val,
void *place)

 static bool check_user_specified_header (const char *);

+#ifdef HAVE_LIBZ
+static bool
+cmd_spec_compression (const char *com, const char *val, void *place)
+{
+  static const struct decode_item choices[] = {
+{ "auto", compression_auto },
+{ "gzip", compression_gzip },
+{ "none", compression_none },
+  };
+  int ok = decode_string (val, choices, countof (choices), place);
+  if (!ok)
+{
+  fprintf (stderr, _("%s: %s: Invalid value %s.\n"), exec_name, com,
+   quote (val));
+}
+  return ok;
+}
+#endif
+
 static bool
 cmd_spec_dirstruct (const char *com, const char *val, void
*place_ignored _GL_UNUSED)
 {
diff --git a/src/main.c b/src/main.c
index 297499e..f9759c3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -275,6 +275,9 @@ static struct cmdline_option option_data[] =
 { IF_SSL ("certificate-type"), 0, OPT_VALUE, "certificatetype", -1 },
 { IF_SSL ("check-certificate"), 0, OPT_BOOLEAN, "checkcertificate",
-1 },
 { "clobber", 0, OPT__CLOBBER, NULL, optional_argument },
+#ifdef HAVE_LIBZ
+{ "compression", 0, OPT_VALUE, "compression", -1 },
+#endif
 { "config", 0, OPT_VALUE, "chooseconfig", -1 },
 { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
 { "continue", 'c', OPT_BOOLEAN, "continue", -1 },
@@ -763,6 +766,10 @@ HTTP options:\n"),
--ignore-length ignore 'Content-Length' header
field\n"),
 N_("\
--header=STRING insert STRING among the headers\n"),
+#ifdef HAVE_LIBZ
+N_("\
+   --compression=TYPE  choose compression, one of auto,
gzip and none\n"),
+#endif
 N_("\
--max-redirect  maximum redirections allowed per
page\n"),
 N_("\
@@ -1675,6 +1682,26 @@ for details.\n\n"));
 }
 }

+#ifdef HAVE_LIBZ
+  if (opt.always_rest || opt.start_pos >= 0)
+{
+  if (opt.compression == compression_auto)
+{
+  /* Compression does not work with --continue or --start-pos.
+ Since compression was not explicitly set, it will be
disabled. */
+  opt.compression = compression_none;
+}
+  else if (opt.compression != compression_none)
+{
+  fprintf (stderr,
+   _("Compression does not work with --continue or"
+ " --start-pos, they will be disabled.\n"));
+  opt.always_rest = false;
+  opt.start_pos = -1;
+}
+}
+#endif
+
   if (opt.ask_passwd && opt.passwd)
 {
   fprintf (stderr,
diff --git a/src/options.h b/src/options.h
index 3972945..cf945c1 100644
--- a/src/options.h
+++ b/src/options.h
@@ -326,6 +326,14 @@ struct options
name. */
   bool report_bps; 

[Bug-wget] [PATCH 1/3] Add Content-Encoding support to --adjust-extension

2017-07-28 Thread Tim Schlueter
When -E or --adjust-extension are specified, and the remote web
server responds with the content-encoding header set to gzip, deflate,
compress, or br, wget will now add the .gz, .zlib, .Z, and .br extensions
respectively.

This was inspired by Yuriy M. Kaminskiy's patch set:
https://lists.gnu.org/archive/html/bug-wget/2014-12/msg00087.html
---
 src/http.c | 90 +++
 1 file changed, 90 insertions(+)

diff --git a/src/http.c b/src/http.c
index f5d9caf..a8c6e18 100644
--- a/src/http.c
+++ b/src/http.c
@@ -1539,6 +1539,16 @@ persistent_available_p (const char *host, int port,
   fd = -1;  \
 } while (0)

+typedef enum
+{
+  ENC_INVALID = -1, /* invalid encoding */
+  ENC_NONE = 0, /* no special encoding */
+  ENC_GZIP, /* gzip compression */
+  ENC_DEFLATE,  /* deflate compression */
+  ENC_COMPRESS, /* compress compression */
+  ENC_BROTLI/* brotli compression */
+} encoding_t;
+
 struct http_stat
 {
   wgint len;/* received length */
@@ -1569,6 +1579,9 @@ struct http_stat
 #ifdef HAVE_METALINK
   metalink_t *metalink;
 #endif
+
+  encoding_t local_encoding;/* the encoding of the local file */
+
   bool temporary;   /* downloading a temporary file */
 };

@@ -3189,6 +3202,7 @@ gethttp (const struct url *u, struct url *
   xfree (hs->remote_time);
   hs->error = NULL;
   hs->message = NULL;
+  hs->local_encoding = ENC_NONE;

   conn = u;

@@ -3639,6 +3653,49 @@ gethttp (const struct url *u, struct url *
 }
 }

+  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof (hdrval)))
+{
+  hs->local_encoding = ENC_INVALID;
+
+  switch (hdrval[0])
+{
+case 'b': case 'B':
+  if (0 == c_strcasecmp(hdrval, "br"))
+hs->local_encoding = ENC_BROTLI;
+  break;
+case 'c': case 'C':
+  if (0 == c_strcasecmp(hdrval, "compress"))
+hs->local_encoding = ENC_COMPRESS;
+  break;
+case 'd': case 'D':
+  if (0 == c_strcasecmp(hdrval, "deflate"))
+hs->local_encoding = ENC_DEFLATE;
+  break;
+case 'g': case 'G':
+  if (0 == c_strcasecmp(hdrval, "gzip"))
+hs->local_encoding = ENC_GZIP;
+  break;
+case 'i': case 'I':
+  if (0 == c_strcasecmp(hdrval, "identity"))
+hs->local_encoding = ENC_NONE;
+  break;
+case 'x': case 'X':
+  if (0 == c_strcasecmp(hdrval, "x-compress"))
+hs->local_encoding = ENC_COMPRESS;
+  else if (0 == c_strcasecmp(hdrval, "x-gzip"))
+hs->local_encoding = ENC_GZIP;
+  break;
+case '\0':
+  hs->local_encoding = ENC_NONE;
+}
+
+  if (hs->local_encoding == ENC_INVALID)
+{
+  DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval));
+  hs->local_encoding = ENC_NONE;
+}
+}
+
   /* 20x responses are counted among successful by default.  */
   if (H_20X (statcode))
 *dt |= RETROKF;
@@ -3767,6 +3824,35 @@ gethttp (const struct url *u, struct url *

   if (opt.adjust_extension)
 {
+  const char *encoding_ext = NULL;
+  switch (hs->local_encoding)
+{
+case ENC_INVALID:
+case ENC_NONE:
+  break;
+case ENC_BROTLI:
+  encoding_ext = ".br";
+  break;
+case ENC_COMPRESS:
+  encoding_ext = ".Z";
+  break;
+case ENC_DEFLATE:
+  encoding_ext = ".zlib";
+  break;
+case ENC_GZIP:
+  encoding_ext = ".gz";
+  break;
+default:
+  DEBUGP (("No extension found for encoding %d\n",
+   hs->local_encoding));
+  }
+  if (encoding_ext != NULL)
+{
+  char *file_ext = strrchr (hs->local_file, '.');
+  /* strip Content-Encoding extension (it will be re-added
later) */
+  if (file_ext != NULL && 0 == strcasecmp (file_ext, encoding_ext))
+*file_ext = '\0';
+}
   if (*dt & TEXTHTML)
 /* -E / --adjust-extension / adjust_extension = on was specified,
and this is a text/html file.  If some case-insensitive
@@ -3779,6 +3865,10 @@ gethttp (const struct url *u, struct url *
 {
   ensure_extension (hs, ".css", dt);
 }
+  if (encoding_ext != NULL)
+{
+  ensure_extension (hs, encoding_ext, dt);
+}
 }

   if (cond_get)
-- 



signature.asc
Description: OpenPGP digital signature


[Bug-wget] Wget1 Gzip Compression

2017-07-25 Thread Tim Schlueter
Hi,

I was wondering if there is any interest here in adding gzip compression
support to wget1.

I recently came across a misconfigured web server which would gzip all
responses regardless of the accept-encoding HTTP request header.

This motivated me to spend some time working on adding on the fly gzip
decompression to wget1 and making some other compression-related
improvements to the codebase (before I discovered that wget2 is a work
in progress).

So before I spend more time working on it, I wanted to see if gzip
support is something that you would consider adding to wget1 if I
submitted a patch.

Thanks,
Tim