This is the change that I'm interested in.  I don't expect this to be
put into the distribution without a lot of discussion.

This version changes the behavior of --recursive:  If a file is
downloaded, it is scanned for links to follow.  This differs from the
current behavior, in which the URL the contents were actually obtained
from (after any redirections) is first checked against the recursion
limits, and the file is scanned for links only if that URL passes.
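
To make this concrete, here is a toy model of the two policies.  This
is my own sketch, not wget code:  accepted() is a hypothetical
stand-in for the recursion checks that download_child() performs in
src/recur.c, and the URLs are made up.

    def accepted(path, include_dirs):
        """Stand-in for download_child()'s --include-directories check."""
        return any(path.startswith("/%s/" % d) for d in include_dirs)

    def links_to_follow_old(requested, final, links, include_dirs):
        # Current behavior: after a redirect, the post-redirect URL
        # itself must pass the recursion limits before the page is
        # scanned at all.
        if final != requested and not accepted(final, include_dirs):
            return []
        return [l for l in links if accepted(l, include_dirs)]

    def links_to_follow_new(requested, final, links, include_dirs):
        # Patched behavior: a downloaded page is always scanned, and
        # each link is judged on its own.
        return [l for l in links if accepted(l, include_dirs)]

    links = ["/a/File2.html", "/b/File3.html"]
    print(links_to_follow_old("/a/File1.html", "/b/File1.html", links, ["a"]))
    # -> [] : the redirect target /b/File1.html fails the check, so the
    #         page is never scanned
    print(links_to_follow_new("/a/File1.html", "/b/File1.html", links, ["a"]))
    # -> ['/a/File2.html']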

This patch also includes a test to verify the new behavior.
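
For anyone who wants to see the difference outside the test harness,
here is a quick standalone repro.  Again my own sketch, not part of
the patch; it assumes the wget under test is first on PATH, and the
port and paths are arbitrary.

    #!/usr/bin/env python3
    import http.server, os, subprocess, tempfile, threading

    PAGE = b'<html><body><a href="/a/File2.html">x</a></body></html>'

    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            if self.path == "/a/File1.html":
                # The start URL redirects out of the accepted directory.
                self.send_response(301)
                self.send_header("Location", "/b/File1.html")
                self.end_headers()
            elif self.path in ("/b/File1.html", "/a/File2.html"):
                body = PAGE if self.path.endswith("File1.html") else b"ok"
                self.send_response(200)
                self.send_header("Content-Length", str(len(body)))
                self.end_headers()
                self.wfile.write(body)
            else:
                self.send_error(404)   # e.g. /robots.txt
        def log_message(self, *args):
            pass                       # keep the output quiet

    srv = http.server.HTTPServer(("127.0.0.1", 0), Handler)
    threading.Thread(target=srv.serve_forever, daemon=True).start()
    url = "http://127.0.0.1:%d/a/File1.html" % srv.server_address[1]
    with tempfile.TemporaryDirectory() as tmp:
        subprocess.run(["wget", "--recursive", "--no-host-directories",
                        "--include-directories=a", url], cwd=tmp)
        # A patched wget fetches a/File2.html; an unpatched one stops
        # after the redirect, because /b/File1.html fails the
        # --include-directories test.
        print("File2 fetched:",
              os.path.exists(os.path.join(tmp, "a/File2.html")))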

I worry that this is a substantial change of behavior.  OTOH, the
current behavior seems very unintuitive.  And the fact that there is
no test for it suggests that people have not been depending on it.

Comments?

Dale
From fe409b1447da28f2c5677ee2d8114e83a19c75f1 Mon Sep 17 00:00:00 2001
From: "Dale R. Worley" <[email protected]>
Date: Tue, 23 Aug 2016 23:07:23 -0400
Subject: [PATCH] Patch to change behavior with redirects under --recurse.  Add
 test for this change.

---
 src/recur.c                        | 55 ++------------------------------
 testenv/Makefile.am                |  1 +
 testenv/Test-recursive-redirect.py | 64 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 52 deletions(-)
 create mode 100644 testenv/Test-recursive-redirect.py

diff --git a/src/recur.c b/src/recur.c
index 2b17e72..72059b4 100644
--- a/src/recur.c
+++ b/src/recur.c
@@ -191,8 +191,6 @@ typedef enum
 
 static reject_reason download_child (const struct urlpos *, struct url *, int,
                               struct url *, struct hash_table *, struct iri *);
-static reject_reason descend_redirect (const char *, struct url *, int,
-                              struct url *, struct hash_table *, struct iri *);
 static void write_reject_log_header (FILE *);
 static void write_reject_log_reason (FILE *, reject_reason,
                               const struct url *, const struct url *);
@@ -358,19 +356,9 @@ retrieve_tree (struct url *start_url_parsed, struct iri *pi)
                      want to follow it.  */
                   if (descend)
                     {
-                      reject_reason r = descend_redirect (redirected, url_parsed,
-                                        depth, start_url_parsed, blacklist, i);
-                      if (r == WG_RR_SUCCESS)
-                        {
-                          /* Make sure that the old pre-redirect form gets
-                             blacklisted. */
-                          blacklist_add (blacklist, url);
-                        }
-                      else
-                        {
-                          write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
-                          descend = false;
-                        }
+                      /* Make sure that the old pre-redirect form gets
+                         blacklisted. */
+                      blacklist_add (blacklist, url);
                     }
 
                   xfree (url);
@@ -774,43 +762,6 @@ download_child (const struct urlpos *upos, struct url *parent, int depth,
   return reason;
 }
 
-/* This function determines whether we will consider downloading the
-   children of a URL whose download resulted in a redirection,
-   possibly to another host, etc.  It is needed very rarely, and thus
-   it is merely a simple-minded wrapper around download_child.  */
-
-static reject_reason
-descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
-                    struct url *start_url_parsed, struct hash_table *blacklist,
-                    struct iri *iri)
-{
-  struct url *new_parsed;
-  struct urlpos *upos;
-  reject_reason reason;
-
-  assert (orig_parsed != NULL);
-
-  new_parsed = url_parse (redirected, NULL, NULL, false);
-  assert (new_parsed != NULL);
-
-  upos = xnew0 (struct urlpos);
-  upos->url = new_parsed;
-
-  reason = download_child (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist, iri);
-
-  if (reason == WG_RR_SUCCESS)
-    blacklist_add (blacklist, upos->url->url);
-  else
-    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
-
-  url_free (new_parsed);
-  xfree (upos);
-
-  return reason;
-}
-
-
 /* This function writes the rejected log header. */
 static void
 write_reject_log_header (FILE *f)
diff --git a/testenv/Makefile.am b/testenv/Makefile.am
index deef18e..036b91c 100644
--- a/testenv/Makefile.am
+++ b/testenv/Makefile.am
@@ -75,6 +75,7 @@ if HAVE_PYTHON3
     Test-Post.py                                    \
     Test-recursive-basic.py                         \
     Test-recursive-include.py                       \
+    Test-recursive-redirect.py                      \
     Test-redirect.py                                \
     Test-redirect-crash.py                          \
     Test--rejected-log.py                           \
diff --git a/testenv/Test-recursive-redirect.py b/testenv/Test-recursive-redirect.py
new file mode 100644
index 0000000..8a114a5
--- /dev/null
+++ b/testenv/Test-recursive-redirect.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+from sys import exit
+from test.http_test import HTTPTest
+from test.base_test import HTTP, HTTPS
+from misc.wget_file import WgetFile
+
+"""
+    Basic test of --recursive.
+"""
+############# File Definitions ###############################################
+File1 = """<html><body>
+<a href=\"/a/File2.html\">text</a>
+<a href=\"/b/File3.html\">text</a>
+</body></html>"""
+File2 = "With lemon or cream?"
+File3 = "Surely you're joking Mr. Feynman"
+
+File1_rules = {
+    "Response"          : 301,
+    "SendHeader"        : {"Location" : "/b/File1.html"}
+}
+
+File1_File = WgetFile ("a/File1.html", "", rules=File1_rules)
+File1_Redirected = WgetFile ("b/File1.html", File1)
+File1_Retrieved = WgetFile ("a/File1.html", File1)
+File2_File = WgetFile ("a/File2.html", File2)
+File3_File = WgetFile ("b/File3.html", File3)
+
+WGET_OPTIONS = "--recursive --no-host-directories --include-directories=a"
+WGET_URLS = [["a/File1.html"]]
+
+Servers = [HTTP]
+
+Files = [[File1_Redirected, File1_File, File2_File, File3_File]]
+Existing_Files = []
+
+ExpectedReturnCode = 0
+ExpectedDownloadedFiles = [File1_Retrieved, File2_File]
+Request_List = [["GET /a/File1.html",
+                 "GET /a/File2.html",
+                 "GET /b/File3.html"]]
+
+################ Pre and Post Test Hooks #####################################
+pre_test = {
+    "ServerFiles"       : Files,
+    "LocalFiles"        : Existing_Files
+}
+test_options = {
+    "WgetCommands"      : WGET_OPTIONS,
+    "Urls"              : WGET_URLS
+}
+post_test = {
+    "ExpectedFiles"     : ExpectedDownloadedFiles,
+    "ExpectedRetcode"   : ExpectedReturnCode
+}
+
+err = HTTPTest (
+                pre_hook=pre_test,
+                test_params=test_options,
+                post_hook=post_test,
+                protocols=Servers
+).begin ()
+
+exit (err)
-- 
2.10.0.rc0.17.gd63263a
