ArielGlenn has submitted this change and it was merged.
Change subject: make wget skip urls with query params when retrieving wikitech dumps
......................................................................
make wget skip urls with query params when retrieving wikitech dumps
We don't want those bogus index files anyway, so let's not get them
in the first place.
Change-Id: I9e1d6f952a341803d4ee5d8de253dcf4c565500a
---
M modules/dataset/manifests/cron/wikitech_dumps.pp
1 file changed, 2 insertions(+), 1 deletion(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/modules/dataset/manifests/cron/wikitech_dumps.pp b/modules/dataset/manifests/cron/wikitech_dumps.pp
index 415bb51..4bf9fc4 100644
--- a/modules/dataset/manifests/cron/wikitech_dumps.pp
+++ b/modules/dataset/manifests/cron/wikitech_dumps.pp
@@ -23,6 +23,7 @@
}
$wget = '/usr/bin/wget'
+ $wgetreject = "--reject-regex '(.*)\?(.*)'"
$wgetargs = "-nv -e robots=off -k -nH --wait 30 -np -m ${url} -P ${wikitechdir}"
# the index.html files we get from wikitech are icky,
@@ -32,7 +33,7 @@
cron { 'wikitech-dumps-grab':
ensure => $ensure,
- command => "${wget} ${wgetargs}; ${cleanuphtml}; ${cleanupold}",
+ command => "${wget} ${wgetreject} ${wgetargs}; ${cleanuphtml}; ${cleanupold}",
user => $user,
minute => '20',
hour => '3',
--
To view, visit https://gerrit.wikimedia.org/r/276109
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9e1d6f952a341803d4ee5d8de253dcf4c565500a
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits