Ángel González wrote:
> No problem for suggesting things. I agree it would be cool. Doesn't even
> look too hard, just detect the Content-Encoding: gzip and filter through
> gzip -d when saving.
> But someone needs to code it :)
Something like the attached patches.
>From ccb95548926a0ab8ad4ed3787470d4803efad4ca Mon Sep 17 00:00:00 2001 From: "Yuriy M. Kaminskiy" <yum...@gmail.com> Date: Tue, 16 Dec 2014 17:19:22 +0300 Subject: [PATCH 1/3] Recognize Content-Encoding header --- src/http.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ src/wget.h | 8 +++++++- 2 files changed, 56 insertions(+), 1 deletions(-) diff --git a/src/http.c b/src/http.c index 1a6cd39..c769e95 100644 --- a/src/http.c +++ b/src/http.c @@ -2682,6 +2682,39 @@ read_header: contlen = last_byte_pos - first_byte_pos + 1; } } + + if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval))) + { + const char *p = hdrval; + if (p[0] == 'x' && p[1] == '-') + p += 2; + switch (p[0]) + { + case 'g': case 'G': + if (0 == c_strcasecmp(p, "gzip")) + *dt |= CE_GZIP; + break; + case 'c': case 'C': + if (0 == c_strcasecmp(p, "compress")) + *dt |= CE_COMPRESS; + break; + case 'd': case 'D': + if (0 == c_strcasecmp(p, "deflate")) + *dt |= CE_DEFLATE; + break; + case 'b': case 'B': + if (0 == c_strcasecmp(p, "bzip2")) + *dt |= CE_BZIP2; + break; + default: + break; + } + if (!(*dt & CE_ANY)) + { + DEBUGP (("Unrecognized Content-Encoding: %s\n", p)); + } + } + resp_free (resp); /* 20x responses are counted among successful by default. */ @@ -2810,6 +2843,18 @@ read_header: if (opt.adjust_extension) { + const char *ce_ext = ((*dt & CE_GZIP) ? ".gz" : + (*dt & CE_COMPRESS) ? ".Z" : + (*dt & CE_BZIP2) ? ".bz2" : + (*dt & CE_DEFLATE) ? ".zlib" : NULL); + char *last_period; + if (ce_ext != NULL && + (last_period = strrchr(hs->local_file, '.')) != NULL && + strcasecmp(last_period, ce_ext) == 0) + /* strip Content-Encoding extension (it will be re-added later) */ + { + *last_period = '\0'; + } if (*dt & TEXTHTML) /* -E / --adjust-extension / adjust_extension = on was specified, and this is a text/html file. 
If some case-insensitive @@ -2822,6 +2867,10 @@ read_header: { ensure_extension (hs, ".css", dt); } + if (ce_ext != NULL) + { + ensure_extension (hs, ce_ext, dt); + } } if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE diff --git a/src/wget.h b/src/wget.h index 6edbfb8..be5fe51 100644 --- a/src/wget.h +++ b/src/wget.h @@ -331,7 +331,13 @@ enum SEND_NOCACHE = 0x0008, /* send Pragma: no-cache directive */ ACCEPTRANGES = 0x0010, /* Accept-ranges header was found */ ADDED_HTML_EXTENSION = 0x0020, /* added ".html" extension due to -E */ - TEXTCSS = 0x0040 /* document is of type text/css */ + TEXTCSS = 0x0040, /* document is of type text/css */ + CE_COMPRESS = 0x0100, /* Content-Encoding: compress */ + CE_DEFLATE = 0x0200, /* Content-Encoding: deflate */ + CE_GZIP = 0x0400, /* Content-Encoding: gzip */ + CE_BZIP2 = 0x0800, /* Content-Encoding: bzip2 */ +#define CE_ANY (CE_COMPRESS|CE_DEFLATE|CE_GZIP|CE_BZIP2) + /* Any known Content-Encoding */ }; /* Universal error type -- used almost everywhere. Error reporting of --
>From b41339ba8d0e28f9033e83883d33889177ebbdc5 Mon Sep 17 00:00:00 2001 From: "Yuriy M. Kaminskiy" <yum...@gmail.com> Date: Tue, 16 Dec 2014 17:29:34 +0300 Subject: [PATCH 2/3] Added option to support HTTP compression Send Accept-Encoding header, decompress gzip-compressed document (recognize by extension, should work well with --adjust-extension option) --- src/http.c | 6 +++++- src/init.c | 1 + src/main.c | 3 +++ src/options.h | 1 + src/utils.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 53 insertions(+), 1 deletions(-) diff --git a/src/http.c b/src/http.c index c769e95..5ce6c93 100644 --- a/src/http.c +++ b/src/http.c @@ -1798,6 +1798,9 @@ gethttp (struct url *u, struct http_stat *hs, int *dt, struct url *proxy, rel_value); SET_USER_AGENT (req); request_set_header (req, "Accept", "*/*", rel_none); + if (opt.compressed) + request_set_header (req, "Accept-Encoding", "gzip, deflate", rel_none); + else request_set_header (req, "Accept-Encoding", "identity", rel_none); /* Find the username and password for authentication. 
*/ @@ -2683,7 +2686,8 @@ read_header: } } - if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval))) + if (opt.compressed && + resp_header_copy (resp, "Content-Encoding", hdrval, sizeof(hdrval))) { const char *p = hdrval; if (p[0] == 'x' && p[1] == '-') diff --git a/src/init.c b/src/init.c index 088a6e9..ccf742a 100644 --- a/src/init.c +++ b/src/init.c @@ -154,6 +154,7 @@ static const struct { { "checkcertificate", &opt.check_cert, cmd_boolean }, #endif { "chooseconfig", &opt.choose_config, cmd_file }, + { "compressed", &opt.compressed, cmd_boolean }, { "connecttimeout", &opt.connect_timeout, cmd_time }, { "contentdisposition", &opt.content_disposition, cmd_boolean }, { "contentonerror", &opt.content_on_error, cmd_boolean }, diff --git a/src/main.c b/src/main.c index 99c2819..6f0d7b2 100644 --- a/src/main.c +++ b/src/main.c @@ -186,6 +186,7 @@ static struct cmdline_option option_data[] = { IF_SSL ("certificate-type"), 0, OPT_VALUE, "certificatetype", -1 }, { IF_SSL ("check-certificate"), 0, OPT_BOOLEAN, "checkcertificate", -1 }, { "clobber", 0, OPT__CLOBBER, NULL, optional_argument }, + { "compressed", 'C', OPT_BOOLEAN, "compressed", -1 }, { "config", 0, OPT_VALUE, "chooseconfig", -1 }, { "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 }, { "continue", 'c', OPT_BOOLEAN, "continue", -1 }, @@ -582,6 +583,8 @@ Directories:\n"), N_("\ -nH, --no-host-directories don't create host directories.\n"), N_("\ + --compressed support HTTP compression (Content-Encoding).\n"), + N_("\ --protocol-directories use protocol name in directories.\n"), N_("\ -P, --directory-prefix=PREFIX save files to PREFIX/...\n"), diff --git a/src/options.h b/src/options.h index b995126..d1a7faf 100644 --- a/src/options.h +++ b/src/options.h @@ -288,6 +288,7 @@ struct options bool show_all_dns_entries; /* Show all the DNS entries when resolving a name. 
*/ bool report_bps; /*Output bandwidth in bits format*/ + bool compressed; /* Use Accept-Encoding/Content-Encoding */ }; extern struct options opt; diff --git a/src/utils.c b/src/utils.c index 42a7c4c..46c23a7 100644 --- a/src/utils.c +++ b/src/utils.c @@ -63,6 +63,10 @@ as that of the covered work. */ #include <sys/stat.h> +#ifdef HAVE_LIBZ +# include <zlib.h> +#endif + /* For TIOCGWINSZ and friends: */ #include <sys/ioctl.h> #include <termios.h> @@ -1154,6 +1158,9 @@ wget_read_file (const char *file) struct file_memory *fm; long size; bool inhibit_close = false; +#ifdef HAVE_LIBZ + gzFile *zfile = NULL; +#endif /* Some magic in the finest tradition of Perl and its kin: if FILE is "-", just use stdin. */ @@ -1170,6 +1177,29 @@ wget_read_file (const char *file) return NULL; fm = xnew (struct file_memory); +#ifdef HAVE_LIBZ + do { + const char *ext = strrchr(file, '.'); + int zfd = -1; + + if (!opt.compressed) break; + if (ext == NULL) break; + if (0 != strcasecmp(ext + 1, "gz") && + 0 != strcasecmp(ext + 1, "zlib")) + break; + if ((zfd = dup(fd)) == -1) + break; + if ((zfile = gzdopen(zfd, "r")) == NULL) + { + close(zfd); + break; + } +#ifdef HAVE_MMAP + goto mmap_lose; +#endif + } while(0); +#endif + #ifdef HAVE_MMAP { struct_fstat buf; @@ -1224,6 +1254,11 @@ wget_read_file (const char *file) size <<= 1; fm->content = xrealloc (fm->content, size); } +#ifdef HAVE_LIBZ + if (zfile != NULL) + nread = gzread (zfile, fm->content + fm->length, size - fm->length); + else +#endif nread = read (fd, fm->content + fm->length, size - fm->length); if (nread > 0) /* Successful read. */ @@ -1237,6 +1272,10 @@ wget_read_file (const char *file) } if (!inhibit_close) close (fd); +#ifdef HAVE_LIBZ + if (zfile != NULL) + gzclose(zfile); +#endif if (size > fm->length && fm->length != 0) /* Due to exponential growth of fm->content, the allocated region might be much larger than what is actually needed. 
*/ @@ -1245,6 +1284,10 @@ wget_read_file (const char *file) return fm; lose: +#ifdef HAVE_LIBZ + if (zfile != NULL) + gzclose(zfile); +#endif if (!inhibit_close) close (fd); xfree (fm->content); --
>From d48d72d891773858aa70806787466224c2e8a31f Mon Sep 17 00:00:00 2001 From: "Yuriy M. Kaminskiy" <yum...@gmail.com> Date: Tue, 16 Dec 2014 17:35:27 +0300 Subject: [PATCH 3/3] Attempt to recognize compressed files by signature Should improve --compressed without --adjust-extension --- src/utils.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 47 insertions(+), 1 deletions(-) diff --git a/src/utils.c b/src/utils.c index 46c23a7..04f56b6 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1160,6 +1160,7 @@ wget_read_file (const char *file) bool inhibit_close = false; #ifdef HAVE_LIBZ gzFile *zfile = NULL; + bool try_compression_once = opt.compressed; #endif /* Some magic in the finest tradition of Perl and its kin: if FILE @@ -1182,15 +1183,19 @@ wget_read_file (const char *file) const char *ext = strrchr(file, '.'); int zfd = -1; - if (!opt.compressed) break; + if (!try_compression_once) break; if (ext == NULL) break; if (0 != strcasecmp(ext + 1, "gz") && 0 != strcasecmp(ext + 1, "zlib")) break; + DEBUGP (("Decompress: extension matched: %s\n", ext)); + force_compressed: + try_compression_once = false; if ((zfd = dup(fd)) == -1) break; if ((zfile = gzdopen(zfd, "r")) == NULL) { + DEBUGP (("gzdopen() failed\n")); close(zfd); break; } @@ -1214,6 +1219,19 @@ wget_read_file (const char *file) MAP_PRIVATE, fd, 0); if (fm->content == (char *)MAP_FAILED) goto mmap_lose; +#ifdef HAVE_LIBZ + if (try_compression_once && + fm->length >= 10 /* minimal gzip size */ && + fm->content[0] == '\x1f' && /* gzip signature */ + fm->content[1] == '\x8b' && + fm->content[2] == '\x08') + { + DEBUGP (("Automatic decompress check: ok, munmap\n")); + munmap (fm->content, fm->length); + fm->content = NULL; + goto force_compressed; + } +#endif if (!inhibit_close) close (fd); @@ -1269,6 +1287,34 @@ wget_read_file (const char *file) else /* EOF */ break; +#ifdef HAVE_LIBZ + if (try_compression_once /* && zfile == NULL */ && + fm->length - nread < 10 /* old length was less than 
minimum */ && + fm->length >= 10 /* new length is longer than minimum */) + { + try_compression_once = false; + DEBUGP (("Automatic decompress check: ")); + if (!(fm->content[0] == '\x1f' && /* gzip signature */ + fm->content[1] == '\x8b' && + fm->content[2] == '\x08')) + { + DEBUGP (("gzip signature mismatch\n")); + continue; + } + /* try to seek back */ + if (lseek (fd, SEEK_CUR, -(fm->length)) == (off_t)-1) + { + DEBUGP (("lseek(%ld) failed: %s\n", -(long)fm->length, + strerror(errno))); + continue; + } + /* drop content and retry with compression */ + xfree(fm->content); + fm->content = NULL; + DEBUGP (("ok, freed\n")); + goto force_compressed; + } +#endif } if (!inhibit_close) close (fd); --