Hi to all! Today I wanted to download the following web page for archiving purposes: https://www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur
The following command line did not do what I wanted:

  wget -p -N -H -D esquire.de --tries=10 https://www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur

The following seemed to:

  wget -p -r -N -H -D esquire.de --exclude-domains www.esquire.de --tries=10 https://www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur

Files downloaded:

  now/static.esquire.de/1200x630/smart/images/2023-08/gettyimages-1391653079.jpg
  now/www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur

dld.log:

  ...
  BEENDET --2023-09-12 23:18:01--
  Verstrichene Zeit: 1,2s
  Geholt: 2 Dateien, 246K in 0,07s (3,62 MB/s)

i.e. "two files fetched, no error" (the log is in a German locale).

Without -r and --exclude-domains it downloaded 52 files (most of them .js), all from www.esquire.de and none from static.esquire.de.

Finally I managed to download the desired images with the following (starting from the second file here, as I had already downloaded the first one manually):

  grep -o "https://static.esquire.de/[^ ]*\.jpg" schoenste-wasserfaelle-welt-natur.html | sed -n '2,500p' | while read line; do wget -p "$line"; done

It might (theoretically) be a bug in wget 1.21.4 (1.mga9, i.e. Mageia 9 i686) that the second attempt did not download more than two files, though whoever wants to may just as well put that down to a public-avail-silicon fallacy on my part.

BTW: 'wpdld' is my scriptlet for archiving the web pages I read (attached below). For the pages it works on (it uses wget) I prefer it over a Firefox save-page, since it keeps the web page more or less in its pristine state, so that it could be mirrored, e.g. at the Wayback Machine, if necessary. Not saving to disk what I read is something I have learned can be nasty, because not every news article is kept online forever, or it simply gets deleted from the indexes of search engines (and from on-page searches). I also have 'wpv' for viewing, but alas it is not yet ready for multi-domain pages or non-relative links.

So here is my wish: what about a make-relative feature for already downloaded web pages on disk in wget2? I prefer to download with non-relative links, and converting them on disk afterwards would still allow a 'dircmp' (another self-written program to compare (and sync) directories, which I have been using more or less since 2008). A rough sketch of what I mean follows after my signature.

Regards,
Elmar Stellnberger
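P.S.: To make clearer what I mean by "make relative on disk", here is a rough sketch of how I picture such a pass, purely as an illustration: the script name make-relative.sh is made up, it assumes the usual one-directory-per-host layout of wget -r/-p and a hard-coded host list, and it uses sed on HTML, which is of course fragile. (wget's -k/--convert-links does something similar, but only within the same download run, not for files that are already on disk.)

  #!/bin/bash
  # make-relative.sh -- hypothetical sketch, not an existing wget/wget2 feature
  # Rewrites absolute links like https://static.esquire.de/... inside an already
  # downloaded page into relative paths, assuming every host got its own
  # top-level directory (www.esquire.de/, static.esquire.de/, ...).
  page="$1"     # e.g. www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur
  # how many directories deep the page lies below its host directory
  depth="$(tr -cd '/' <<<"${page#*/}" | wc -c)"
  # the ../ chain leading from the page's directory back to the mirror root
  up="$(printf '../%.0s' $(seq 1 $((depth+1))))"
  for host in www.esquire.de static.esquire.de; do
    sed -i "s#https\?://$host/#$up$host/#g" "$page"
  done

A real wget2 feature would of course use its own HTML/CSS parser instead of sed and discover the hosts from the files on disk; the sketch is only meant to show the kind of transformation I have in mind.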
#!/bin/bash
# may be used under GPLv3, (c) copyright by Elmar Stellnberger <[email protected]>, 2022, 2023-09

let verbosity=1 usetitle=0 cont=0 use_agent=1 vwhelp=0 stripRefererUrlPrm=1;

# parse command line options
while [[ "${1:0:1}" = "-" ]]; do
  if [[ "${1:1:1}" = "-" ]]; then
    if [[ "$1" = "--verbose" ]]; then let verbosity+=1;
    elif [[ "$1" = "--title" ]]; then let usetitle=1;
    elif [[ "$1" = "--no-title" ]]; then let usetitle=0;
    elif [[ "$1" = "--continue" ]]; then let cont=1;
    elif [[ "$1" = "--use-agent" ]]; then let use_agent=1;
    elif [[ "$1" = "--no-use-agent" ]]; then let use_agent=0;
    elif [[ "$1" = "--no-strip-referer-url-prm" ]]; then let stripRefererUrlPrm=0;
    elif [[ "$1" = "--help" ]]; then let vwhelp=1;
    elif [[ "$1" = "--" ]]; then shift; break;
    else echo "unknown option $1"; exit 1;
    fi
  else
    for((i=1;i<${#1};i+=1)); do
      case "${1:i:1}" in
        v) let verbosity+=1;;
        c) let cont=1;;
        h) let vwhelp=1;;
        t) let usetitle=1;;
        T) let usetitle=0;;
        a) let use_agent=1;;
        A) let use_agent=0;;
        *) echo "unknown option -${1:i:1}"; exit 1;;
      esac
    done
  fi
  shift;
done

if [[ $# != 2 || vwhelp -ne 0 || "${2%://*}" = "$2" || "${1%/*}" = "$1" ]]; then
  echo "wpdld [-v|--verbose] [-t|--title] [-T|--no-title] /folder/ https://my.web.page/xy.html"
  echo "  --title: use title tag in .html file rather than URL for visible access link, if possible (--no-title is default)"
  echo "  -c or --continue: don't tell wget to use timestamps to issue re-downloads;"
  echo "     always continue downloads of existing files if their size is smaller than online"
  echo "  -A or --no-use-agent: make request with wget user agent instead of Firefox user agent"
  echo "  --no-strip-referer-url-prm ... don't strip ?utm_source=... from url"
  echo
  exit 1;
fi

if [[ "$(id -u)" -eq 0 ]]; then echo "please do not run wpdld as root!" >&2; echo >&2; exit 2; fi

createLinkName() { local i ext basename
  name="$1"; [[ -z "$1" ]] && name="index.html";
  if [[ "${1%.*}" != "$1" ]]; then ext=".${1##*.}"; else ext=".html"; name="$name$ext"; fi
  if [[ usetitle -ne 0 || "$name" = "index.html" || "$name" =~ ^[^G-Zg-z]*\.[^.]*$ ]]; then   # last condition: only hex-digits and special chars
    #title="$(sed -n '/<[Hh][Ee][Aa][Dd]>/,/<\/[Hh][Ee][Aa][Dd]>/s#^.*<[Tt][Ii][Tt][Ll][Ee]>\([^<]*\)</[Tt][Ii][Tt][Ll][Ee]>.*$#\1#;T nope; p;q; :nope' "$2")"
    #title="$(sed -n '/<[Hh][Ee][Aa][Dd]>/,/<\/[Hh][Ee][Aa][Dd]>/{ /<[Tt][Ii][Tt][Ll][Ee]>/,/<\/[Tt][Ii][Tt][Ll][Ee]>/{ s#^.*<title>##; s#<\/title>.*$##; s#^[ \t]*\([^ ].*\)[\t ]*$#\1#p; T nope; q; }; }; :nope; /<\/[Tt][Ii][Tt][Ll][Ee]>/q' "$2")"
    title="$(sed -n '/<[Hh][Ee][Aa][Dd]>/,/<\/[Hh][Ee][Aa][Dd]>/{ /<[Tt][Ii][Tt][Ll][Ee][ \t]*[^>]*>/,/<\/[Tt][Ii][Tt][Ll][Ee]>/{ s#^.*<title[ \t]*[^>]*>##; s#<\/title>.*$##; s#^[ \t]*##; s#[\t ]*$##; /^$/b nope; p; q; }; }; :nope;' "$2")"
    if [[ -n "$title" ]]; then
      #echo "'${title//\"/\\\"}'"
      title="$(python -c 'import html; print(html.unescape("'"${title//\"/\\\"}"'").translate({ord("/"):ord("_")})); ')"
      name="$title.html"; ext=".html";
    fi
  fi
  basename="$name";
  let i=0; while [[ -e "$name" && ! -L "$name" && "$(readlink "$name")" != "$2" ]]; do let i+=1; name="${basename%.*}($i)$ext"; done
  #if [[ cont -eq 0 ]];   # on --continue set $name to the first discoverable link
  #  then let i=0; while [[ -e "$name" ]]; do let i+=1; name="${basename%.*}($i)$ext"; done
  #  else let i=0; while [[ -e "$name" && ! -L "$name" ]]; do let i+=1; name="${basename%.*}($i)$ext"; done
  #fi
}

urldecode_localTg() { local save_localTg;   # output: localTg, ltgQuery, input: localTg
  ltgQuery="${localTg##*\?}"; [[ "$ltgQuery" = "$localTg" ]] && ltgQuery=""; localTg="${localTg%\?$ltgQuery}"; ltgQuery="${ltgQuery:+?}${ltgQuery//\//%2F}";
  save_localTg="$localTg"; localTg="$(python -c 'import urllib.parse; print(urllib.parse.unquote("'"${localTg//\"/\\\"}"'")); ')"
  [[ -z "$localTg" ]] && localTg="$save_localTg"
}

exec 9>/dev/null
adir="$1"
wp="$2"
if [[ use_agent -ne 0 ]]; then
  agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
fi
wget_opts="-N"
[[ cont -ne 0 ]] && wget_opts="-c"
mkdir -p "$adir";
if pushd "$adir" >&9; then
  [[ stripRefererUrlPrm -ne 0 ]] && wp="${wp%\?utm_source=*}"
  if [[ ! -e urls ]] || ! grep -q "^$wp\$" urls; then echo "$wp" >>urls; fi
  wp="${wp%#*}"
  localTg="${wp#*://}";   # localTarget
  domain="${localTg%%/*}";
  tld="${domain##*.}"; maindom="${domain%.$tld}"; maindom="${maindom##*.}";
  wlcddomain="$maindom.$tld"; [[ "${domain%$wlcddomain}" = "$domain" ]] && wlcddomain="$domain";   # wget -H ... span hosts given by -D xx,yy,...
  #echo "dom: '$domain', '$maindom.$tld', '$tld', '$maindom', '$wlcddomain'"
  wget -p -r $wget_opts -H -D "$wlcddomain" --exclude-domains "$domain" --tries=10 ${agent:+--user-agent="$agent"} "$wp" -a dld.log;   # -N ... download newer files or files of different size and store old as xy.NN
  urldecode_localTg
  if [[ "${localTg%/}" = "$localTg" ]]; then   # filename given, http://path/xy.html
    basename="${localTg##*/}"; localTg="$localTg$ltgQuery";
  else   # http://path/[?ltgQuery]
    basename="${localTg%/}"; basename="${basename##*/}";   # use name of last dir
    [[ -n "$ltgQuery" ]] && basename="index.html"   # don't do so with weird url, i.e. where url parameters but no file name
    localTg="${localTg}index.html$ltgQuery";
  fi
  #echo "$localTg"
  createLinkName "$basename" "$localTg";   # result in $name
  [[ ! -e "$name" || ! -L "$name" || "$(readlink "$name")" != "$localTg" ]] && ln -s "$localTg" "$name"
  [[ "${localTg%.html}" = "$localTg" && "${localTg%.htm}" = "$localTg" && ! -e "${localTg}.html" ]] && ln -s "${localTg##*/}" "${localTg}.html"
  tail -n 5 dld.log | { read lineA; read lineB; if [[ -n "$lineB" ]]; then echo "$lineB"; else echo "$lineA"; fi; cat; };
  if [[ verbosity -ge 2 ]]; then
    ls -l --color
  else
    ls -l --color "$name"
  fi
  popd >&9
else
  echo "failed to change directory to '$adir'" >&2;
fi
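For reference, two example invocations matching the usage text above (the target folder is just an example):

  # archive the article under ~/webarchive/, using the <title> tag for the visible access link
  wpdld -t ~/webarchive/ https://www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur

  # retry later with --continue and more verbose output, so wget resumes files
  # that are smaller on disk than online instead of comparing timestamps
  wpdld -c -v ~/webarchive/ https://www.esquire.de/life/reisen/schoenste-wasserfaelle-welt-natur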
