Stephen Hewitt <[EMAIL PROTECTED]> writes: > Attempting to mirror a particular web site, with wget 1.8.1, I got > many nested directories like .../images/images/images/images etc For > example the log file ended like this: [...]
Thanks for the detailed report and for taking the time to find the problem. I've seen similar problems, but have never had the inclination to find the cause; `--mirror' is especially susceptible to this because it implies `-l0'. > The fundamental problem in this case is that if you ask for a page > that does not exist from www.can-online.org.uk, the server does not > respond correctly. Instead of presenting the 404 not found, it > serves up some default web page. Right. And if that same web page contains the link to a non-existent image, an infinite loop ensues. > Now at this point is the opportunity for wget to be more robust. [...] > So my suggestion is this. If wget is following an <img src=""> > address from a page, and instead of the expected [image/gif] (or > jpeg or whatever) file type the server gives a [text/html], then > wget should not follow any links that the text/html file contains. I agree. I've attached a patch against the current CVS code that should fix this problem. Could you please try it out and let me know if it works for you? > Perhaps you could even argue that it should report an error and not > even save the html file, That would be going too far. If the <img src=...> returns an HTML page, so be it. Let the browsers cope with it as best they can; Wget will simply not harvest the links from such a page. 2003-10-10 Hrvoje Niksic <[EMAIL PROTECTED]> * recur.c (retrieve_tree): Don't descend into documents that are not expected to contain HTML, regardless of their content-type. * html-url.c (tag_url_attributes): Record which attributes are supposed to yield HTML links that can be followed. (tag_find_urls): Propagate that information to the caller through struct urlpos. 
Index: src/convert.h =================================================================== RCS file: /pack/anoncvs/wget/src/convert.h,v retrieving revision 1.1 diff -u -r1.1 convert.h --- src/convert.h 2003/09/21 22:47:13 1.1 +++ src/convert.h 2003/10/10 14:07:41 @@ -56,11 +56,11 @@ /* Information about the original link: */ - unsigned int link_relative_p :1; /* was the link relative? */ - unsigned int link_complete_p :1; /* was the link complete (with the - host name, etc.) */ - unsigned int link_base_p :1; /* was the link <base href=...> */ - unsigned int link_inline_p :1; /* needed to render the page. */ + unsigned int link_relative_p :1; /* the link was relative */ + unsigned int link_complete_p :1; /* the link was complete (had host name) */ + unsigned int link_base_p :1; /* the url came from <base href=...> */ + unsigned int link_inline_p :1; /* needed to render the page */ + unsigned int link_expect_html :1; /* expected to contain HTML */ unsigned int link_refresh_p :1; /* link was received from <meta http-equiv=refresh content=...> */ Index: src/html-url.c =================================================================== RCS file: /pack/anoncvs/wget/src/html-url.c,v retrieving revision 1.33 diff -u -r1.33 html-url.c --- src/html-url.c 2003/10/10 02:46:09 1.33 +++ src/html-url.c 2003/10/10 14:07:42 @@ -121,11 +121,19 @@ /* tag_url_attributes documents which attributes of which tags contain URLs to harvest. It is used by tag_find_urls. */ -/* Defines for the FLAGS field; currently only one flag is defined. */ +/* Defines for the FLAGS. */ -/* This tag points to an external document not necessary for rendering this - document (i.e. it's not an inlined image, stylesheet, etc.). */ -#define TUA_EXTERNAL 1 +/* The link is "inline", i.e. needs to be retrieved for this document + to be correctly rendered. Inline links include inlined images, + stylesheets, children frames, etc. */ +#define ATTR_INLINE 1 + +/* The link is expected to yield HTML contents. 
It's important not to + try to follow HTML obtained by following e.g. <img src="..."> + regardless of content-type. Doing this causes infinite loops for + "images" that return non-404 error pages with links to the same + image. */ +#define ATTR_HTML 2 /* For tags handled by tag_find_urls: attributes that contain URLs to download. */ @@ -134,26 +142,26 @@ const char *attr_name; int flags; } tag_url_attributes[] = { - { TAG_A, "href", TUA_EXTERNAL }, - { TAG_APPLET, "code", 0 }, - { TAG_AREA, "href", TUA_EXTERNAL }, - { TAG_BGSOUND, "src", 0 }, - { TAG_BODY, "background", 0 }, - { TAG_EMBED, "href", TUA_EXTERNAL }, - { TAG_EMBED, "src", 0 }, - { TAG_FIG, "src", 0 }, - { TAG_FRAME, "src", 0 }, - { TAG_IFRAME, "src", 0 }, - { TAG_IMG, "href", 0 }, - { TAG_IMG, "lowsrc", 0 }, - { TAG_IMG, "src", 0 }, - { TAG_INPUT, "src", 0 }, - { TAG_LAYER, "src", 0 }, - { TAG_OVERLAY, "src", 0 }, - { TAG_SCRIPT, "src", 0 }, - { TAG_TABLE, "background", 0 }, - { TAG_TD, "background", 0 }, - { TAG_TH, "background", 0 } + { TAG_A, "href", ATTR_HTML }, + { TAG_APPLET, "code", ATTR_INLINE }, + { TAG_AREA, "href", ATTR_HTML }, + { TAG_BGSOUND, "src", ATTR_INLINE }, + { TAG_BODY, "background", ATTR_INLINE }, + { TAG_EMBED, "href", ATTR_HTML }, + { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_FIG, "src", ATTR_INLINE }, + { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_IMG, "href", ATTR_INLINE }, + { TAG_IMG, "lowsrc", ATTR_INLINE }, + { TAG_IMG, "src", ATTR_INLINE }, + { TAG_INPUT, "src", ATTR_INLINE }, + { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, + { TAG_SCRIPT, "src", ATTR_INLINE }, + { TAG_TABLE, "background", ATTR_INLINE }, + { TAG_TD, "background", ATTR_INLINE }, + { TAG_TH, "background", ATTR_INLINE } }; /* The lists of interesting tags and attributes are built dynamically, @@ -262,7 +270,7 @@ size. 
*/ static struct urlpos * -append_one_url (const char *link_uri, int inlinep, +append_one_url (const char *link_uri, struct taginfo *tag, int attrind, struct map_context *ctx) { int link_has_scheme = url_has_scheme (link_uri); @@ -326,7 +334,6 @@ newel->url = url; newel->pos = tag->attrs[attrind].value_raw_beginning - ctx->text; newel->size = tag->attrs[attrind].value_raw_size; - newel->link_inline_p = inlinep; /* A URL is relative if the host is not named, and the name does not start with `/'. */ @@ -393,8 +400,15 @@ if (0 == strcasecmp (tag->attrs[attrind].name, tag_url_attributes[i].attr_name)) { - int flags = tag_url_attributes[i].flags; - append_one_url (link, !(flags & TUA_EXTERNAL), tag, attrind, ctx); + struct urlpos *up = append_one_url (link, tag, attrind, ctx); + if (up) + { + int flags = tag_url_attributes[i].flags; + if (flags & ATTR_INLINE) + up->link_inline_p = 1; + if (flags & ATTR_HTML) + up->link_expect_html = 1; + } } } } @@ -411,7 +425,7 @@ if (!newbase) return; - base_urlpos = append_one_url (newbase, 0, tag, attrind, ctx); + base_urlpos = append_one_url (newbase, tag, attrind, ctx); if (!base_urlpos) return; base_urlpos->ignore_when_downloading = 1; @@ -434,10 +448,9 @@ char *action = find_attr (tag, "action", &attrind); if (action) { - struct urlpos *action_urlpos = append_one_url (action, 0, tag, - attrind, ctx); - if (action_urlpos) - action_urlpos->ignore_when_downloading = 1; + struct urlpos *up = append_one_url (action, tag, attrind, ctx); + if (up) + up->ignore_when_downloading = 1; } } @@ -458,11 +471,15 @@ */ if (href) { - char *rel = find_attr (tag, "rel", NULL); - int inlinep = (rel - && (0 == strcasecmp (rel, "stylesheet") - || 0 == strcasecmp (rel, "shortcut icon"))); - append_one_url (href, inlinep, tag, attrind, ctx); + struct urlpos *up = append_one_url (href, tag, attrind, ctx); + if (up) + { + char *rel = find_attr (tag, "rel", NULL); + if (rel + && (0 == strcasecmp (rel, "stylesheet") + || 0 == strcasecmp (rel, "shortcut 
icon"))) + up->link_inline_p = 1; + } } } @@ -511,7 +528,7 @@ while (ISSPACE (*p)) ++p; - entry = append_one_url (p, 0, tag, attrind, ctx); + entry = append_one_url (p, tag, attrind, ctx); if (entry) { entry->link_refresh_p = 1; Index: src/recur.c =================================================================== RCS file: /pack/anoncvs/wget/src/recur.c,v retrieving revision 1.54 diff -u -r1.54 recur.c --- src/recur.c 2003/10/07 23:53:31 1.54 +++ src/recur.c 2003/10/10 14:07:42 @@ -6,7 +6,7 @@ GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. + (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -66,10 +66,13 @@ /* Functions for maintaining the URL queue. */ struct queue_element { - const char *url; - const char *referer; - int depth; - struct queue_element *next; + const char *url; /* the URL to download */ + const char *referer; /* the referring document */ + int depth; /* the depth */ + unsigned int html_allowed :1; /* whether the document is allowed to + be treated as HTML. 
*/ + + struct queue_element *next; /* next element in queue */ }; struct url_queue { @@ -102,12 +105,13 @@ static void url_enqueue (struct url_queue *queue, - const char *url, const char *referer, int depth) + const char *url, const char *referer, int depth, int html_allowed) { struct queue_element *qel = xmalloc (sizeof (*qel)); qel->url = url; qel->referer = referer; qel->depth = depth; + qel->html_allowed = html_allowed; qel->next = NULL; ++queue->count; @@ -130,7 +134,8 @@ static int url_dequeue (struct url_queue *queue, - const char **url, const char **referer, int *depth) + const char **url, const char **referer, int *depth, + int *html_allowed) { struct queue_element *qel = queue->head; @@ -144,6 +149,7 @@ *url = qel->url; *referer = qel->referer; *depth = qel->depth; + *html_allowed = qel->html_allowed; --queue->count; @@ -208,14 +214,14 @@ /* Enqueue the starting URL. Use start_url_parsed->url rather than just URL so we enqueue the canonical form of the URL. */ - url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0); + url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1); string_set_add (blacklist, start_url_parsed->url); while (1) { int descend = 0; char *url, *referer, *file = NULL; - int depth; + int depth, html_allowed; boolean dash_p_leaf_HTML = FALSE; if (downloaded_exceeds_quota ()) @@ -227,7 +233,7 @@ if (!url_dequeue (queue, (const char **)&url, (const char **)&referer, - &depth)) + &depth, &html_allowed)) break; /* ...and download it. 
Note that this download is in most cases @@ -245,7 +251,8 @@ DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", url, file)); - if (downloaded_html_set + if (html_allowed + && downloaded_html_set && string_set_contains (downloaded_html_set, file)) descend = 1; } @@ -259,7 +266,7 @@ status = retrieve_url (url, &file, &redirected, referer, &dt); opt.recursive = oldrec; - if (file && status == RETROK + if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) descend = 1; @@ -341,7 +348,8 @@ blacklist)) { url_enqueue (queue, xstrdup (child->url->url), - xstrdup (url), depth + 1); + xstrdup (url), depth + 1, + child->link_expect_html); /* We blacklist the URL we have enqueued, because we don't want to enqueue (and hence download) the same URL twice. */ @@ -382,8 +390,9 @@ now. */ { char *d1, *d2; - int d3; - while (url_dequeue (queue, (const char **)&d1, (const char **)&d2, &d3)) + int d3, d4; + while (url_dequeue (queue, + (const char **)&d1, (const char **)&d2, &d3, &d4)) { xfree (d1); FREE_MAYBE (d2);