ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/355100 )

Change subject: treat wikidata just like enwiki for dumps
......................................................................

treat wikidata just like enwiki for dumps

[WIP] These config changes give wikidatawiki more parallel jobs and start it
first on its dedicated host during a dump run, just like the enwiki dump.

This should not be committed until the current dump run completes;
in addition, it should not be committed until Flow history dumps
run in two passes with prefetch for the content dumps.  Currently
the mediawikiwiki Flow history dumps take 4 days or longer, so
if the second mediawikiwiki dump of the month starts only after
the wikidatawiki dump completes, it might not complete in time.

Change-Id: I2026489388fdeb3483a3ea21ca4c21e5c20f0185
---
M hieradata/hosts/snapshot1005.yaml
M hieradata/hosts/snapshot1006.yaml
M modules/snapshot/manifests/dumps/configs.pp
M modules/snapshot/manifests/dumps/dblists.pp
M modules/snapshot/manifests/dumps/stagesconfig.pp
M modules/snapshot/templates/dumps/dumpstages.erb
M modules/snapshot/templates/dumps/fulldumps.sh.erb
7 files changed, 166 insertions(+), 61 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/00/355100/1

diff --git a/hieradata/hosts/snapshot1005.yaml 
b/hieradata/hosts/snapshot1005.yaml
index 7370ec6..44bb8a2 100644
--- a/hieradata/hosts/snapshot1005.yaml
+++ b/hieradata/hosts/snapshot1005.yaml
@@ -1,4 +1,4 @@
-snapshot::dumps::runtype: hugewikis
+snapshot::dumps::runtype: enwiki
 snapshot::dumps::maxjobs: 28
 snapshot::dumps::monitor: false
 snapshot::cron::misc: false
diff --git a/hieradata/hosts/snapshot1006.yaml 
b/hieradata/hosts/snapshot1006.yaml
index 6027c4f..fa17665 100644
--- a/hieradata/hosts/snapshot1006.yaml
+++ b/hieradata/hosts/snapshot1006.yaml
@@ -1,4 +1,4 @@
-snapshot::dumps::runtype: regular
+snapshot::dumps::runtype: wikidatawiki
 snapshot::dumps::maxjobs: 28
 snapshot::dumps::monitor: false
 snapshot::cron::misc: false
diff --git a/modules/snapshot/manifests/dumps/configs.pp 
b/modules/snapshot/manifests/dumps/configs.pp
index afd4ee5..e7818d7 100644
--- a/modules/snapshot/manifests/dumps/configs.pp
+++ b/modules/snapshot/manifests/dumps/configs.pp
@@ -8,6 +8,9 @@
     $enchunkhistory1 = 
'30303,58141,112065,152180,212624,327599,375779,522388,545343,710090,880349,1113575,1157158,1547206'
     $enchunkhistory2 = 
'1773248,2021218,2153807,2427469,2634193,2467421,2705827,2895677,3679790,3449365,4114387,4596259,6533612'
 
+    $wikidatachunkhistory1 = 
'235321,350222,430401,531179,581039,600373,762298,826545,947305,1076978,1098243,993874,1418919,2399950'
+    $wikidatachunkhistory2 = 
'2587436,951696,942913,837759,1568292,1293747,2018593,1461235,1797642,1487121,2012246,874850,1486799'
+
     $config = {
         smallwikis => {
             dblist        => "${apachedir}/dblists/all.dblist",
@@ -83,15 +86,6 @@
                     chunksForAbstract     => '4',
                     checkpointTime        => '720',
                 },
-                wikidatawiki => {
-                    pagesPerChunkHistory  => '2421529,4883997,8784997,8199134',
-                    pagesPerChunkAbstract => '5800000',
-                    chunksForAbstract     => '4',
-                    checkpointTime        => '720',
-                    orderrevs             => '1',
-                    minpages              => '10',
-                    maxrevs               => '20000',
-                },
                 zhwiki => {
                     pagesPerChunkHistory  => '231819,564192,1300322,3112369',
                     pagesPerChunkAbstract => '1300000',
@@ -104,8 +98,30 @@
                 },
             },
         },
-        hugewikis => {
-            dblist           => "${dblistsdir}/hugewikis.dblist",
+        wikidatawiki => {
+            dblist           => "${dblistsdir}/wikidatawiki.dblist",
+            skipdblist       => "${dblistsdir}/skipnone.dblist",
+            keep             => '7',
+            chunksEnabled    => '1',
+            recombineHistory => '0',
+            checkpointTime        => '720',
+            revsPerJob       => '1500000',
+            retryWait        => '30',
+            maxRetries       => '3',
+            revsMargin       => '100',
+            wikis => {
+                wikidatawiki => {
+                    jobsperbatch          => 'xmlstubsdump=14',
+                    pagesPerChunkHistory  => 
"${wikidatachunkhistory1},${$wikidatachunkhistory2}",
+                    pagesPerChunkAbstract => '2000000',
+                    orderrevs             => '1',
+                    minpages              => '10',
+                    maxrevs               => '20000',
+                },
+            },
+        }
+        enwiki => {
+            dblist           => "${dblistsdir}/enwiki.dblist",
             skipdblist       => "${dblistsdir}/skipnone.dblist",
             keep             => '7',
             chunksEnabled    => '1',
@@ -118,7 +134,7 @@
             wikis => {
                 enwiki => {
                     jobsperbatch          => 'xmlstubsdump=14',
-                  pagesPerChunkHistory  => 
"${enchunkhistory1},${enchunkhistory2}",
+                    pagesPerChunkHistory  => 
"${enchunkhistory1},${enchunkhistory2}",
                     pagesPerChunkAbstract => '2000000',
                 },
             },
@@ -145,8 +161,12 @@
         configtype => 'bigwikis',
         config     => $config,
     }
-    snapshot::dumps::wikiconf { 'wikidump.conf.hugewikis':
-        configtype => 'hugewikis',
+    snapshot::dumps::wikiconf { 'wikidump.conf.enwiki':
+        configtype => 'enwiki',
+        config     => $config,
+    }
+    snapshot::dumps::wikiconf { 'wikidump.conf.wikidatawiki':
+        configtype => 'wikidatawiki',
         config     => $config,
     }
     snapshot::dumps::wikiconf { 'wikidump.conf.monitor':
diff --git a/modules/snapshot/manifests/dumps/dblists.pp 
b/modules/snapshot/manifests/dumps/dblists.pp
index 7df6520..cfc3e1a 100644
--- a/modules/snapshot/manifests/dumps/dblists.pp
+++ b/modules/snapshot/manifests/dumps/dblists.pp
@@ -1,6 +1,9 @@
 class snapshot::dumps::dblists {
-    $hugewikis = ['enwiki']
-    $hugewikis_dblist = join($hugewikis, "\n")
+    $enwiki = ['enwiki']
+    $enwiki_dblist = join($enwiki, "\n")
+
+    $wikidatawiki = ['wikidatawiki']
+    $wikidatawiki_dblist = join($wikidatawiki, "\n")
 
     $bigwikis = ['dewiki', 'eswiki', 'frwiki', 'itwiki', 'jawiki',
                 'metawiki', 'nlwiki', 'plwiki', 'ptwiki', 'ruwiki', 
'commonswiki',
@@ -11,7 +14,7 @@
     $excludewikis = ['labswiki', 'labtestwiki']
     $excludewikis_dblist = join($excludewikis, "\n")
 
-    $skip_dblist = 
"${hugewikis_dblist}\n${bigwikis_dblist}\n${excludewikis_dblist}"
+    $skip_dblist = 
"${enwiki_dblist}\n${wikidatawiki_dblist}\n${bigwikis_dblist}\n${excludewikis_dblist}"
 
     $skipnone_dblist = ''
 
@@ -21,13 +24,21 @@
 
     $dblistsdir = $snapshot::dumps::dirs::dblistsdir
 
-    file { "${dblistsdir}/hugewikis.dblist":
+    file { "${dblistsdir}/enwiki.dblist":
         ensure  => 'present',
-        path    => "${dblistsdir}/hugewikis.dblist",
+        path    => "${dblistsdir}/enwiki.dblist",
         mode    => '0644',
         owner   => 'root',
         group   => 'root',
-        content => "${hugewikis_dblist}\n",
+        content => "${enwiki_dblist}\n",
+    }
+    file { "${dblistsdir}/wikidatawiki.dblist":
+        ensure  => 'present',
+        path    => "${dblistsdir}/wikidatawiki.dblist",
+        mode    => '0644',
+        owner   => 'root',
+        group   => 'root',
+        content => "${wikidatawiki_dblist}\n",
     }
     file { "${dblistsdir}/bigwikis.dblist":
         ensure  => 'present',
diff --git a/modules/snapshot/manifests/dumps/stagesconfig.pp 
b/modules/snapshot/manifests/dumps/stagesconfig.pp
index ac70ace..cb9f10b 100644
--- a/modules/snapshot/manifests/dumps/stagesconfig.pp
+++ b/modules/snapshot/manifests/dumps/stagesconfig.pp
@@ -9,7 +9,8 @@
 
     $args_smallwikis = "${wikiargs} --configfile ${confsdir}/wikidump.conf"
     $args_bigwikis = "${wikiargs} --configfile 
${confsdir}/wikidump.conf.bigwikis"
-    $args_hugewikis = "${wikiargs} --configfile 
${confsdir}/wikidump.conf.hugewikis"
+    $args_enwiki = "${wikiargs} --configfile ${confsdir}/wikidump.conf.enwiki"
+    $args_wikidatawiki = "${wikiargs} --configfile 
${confsdir}/wikidump.conf.wikidatawiki"
 
     $jobs_to_skip = join(['metahistorybz2dump',
                           'metahistorybz2dumprecombine',
@@ -25,9 +26,13 @@
             firststage => "${args_bigwikis} ${firststage_args}",
             rest       => "${args_bigwikis} ${rest_args}",
         },
-        hugewikis    => {
-            firststage => "${args_hugewikis} ${firststage_args}",
-            rest       => "${args_hugewikis} ${rest_args}",
+        enwiki       => {
+            firststage => "${args_enwiki} ${firststage_args}",
+            rest       => "${args_enwiki} ${rest_args}",
+        },
+        wikidatawiki => {
+            firststage => "${args_wikidatawiki} ${firststage_args}",
+            rest       => "${args_wikidatawiki} ${rest_args}",
         },
         skipjob_args => "--skipjobs ${jobs_to_skip}",
     }
@@ -48,20 +53,36 @@
         stagestype => 'partial_nocreate',
         stages     => $stages,
     }
-    snapshot::dumps::stagesconf { 'stages_normal_hugewikis':
-        stagestype => 'normal_huge',
+    snapshot::dumps::stagesconf { 'stages_normal_enwiki':
+        stagestype => 'normal_enwiki',
         stages     => $stages,
     }
-    snapshot::dumps::stagesconf { 'stages_partial_hugewikis':
-        stagestype => 'partial_huge',
+    snapshot::dumps::stagesconf { 'stages_partial_enwiki':
+        stagestype => 'partial_enwiki',
         stages     => $stages,
     }
-    snapshot::dumps::stagesconf { 'stages_normal_nocreate_hugewikis':
-        stagestype => 'normal_nocreate_huge',
+    snapshot::dumps::stagesconf { 'stages_normal_nocreate_enwiki':
+        stagestype => 'normal_nocreate_enwiki',
         stages     => $stages,
     }
-    snapshot::dumps::stagesconf { 'stages_partial_nocreate_hugewikis':
-        stagestype => 'partial_nocreate_huge',
+    snapshot::dumps::stagesconf { 'stages_partial_nocreate_enwiki':
+        stagestype => 'partial_nocreate_enwiki',
+        stages     => $stages,
+    }
+    snapshot::dumps::stagesconf { 'stages_normal_wikidatawiki':
+        stagestype => 'normal_wikidatawiki',
+        stages     => $stages,
+    }
+    snapshot::dumps::stagesconf { 'stages_partial_wikidatawiki':
+        stagestype => 'partial_wikidatawiki',
+        stages     => $stages,
+    }
+    snapshot::dumps::stagesconf { 'stages_normal_nocreate_wikidatawiki':
+        stagestype => 'normal_nocreate_wikidatawiki',
+        stages     => $stages,
+    }
+    snapshot::dumps::stagesconf { 'stages_partial_nocreate_wikidatawiki':
+        stagestype => 'partial_nocreate_wikidatawiki',
         stages     => $stages,
     }
     snapshot::dumps::stagesconf { 'stages_create':
@@ -76,8 +97,12 @@
         stagestype => 'create_big',
         stages     => $stages,
     }
-    snapshot::dumps::stagesconf { 'stages_create_hugewikis':
-        stagestype => 'create_huge',
+    snapshot::dumps::stagesconf { 'stages_create_enwiki':
+        stagestype => 'create_enwiki',
+        stages     => $stages,
+    }
+    snapshot::dumps::stagesconf { 'stages_create_wikidatawiki':
+        stagestype => 'create_wikidatawiki',
         stages     => $stages,
     }
 }
diff --git a/modules/snapshot/templates/dumps/dumpstages.erb 
b/modules/snapshot/templates/dumps/dumpstages.erb
index eb68cfb..d8d9e0c 100644
--- a/modules/snapshot/templates/dumps/dumpstages.erb
+++ b/modules/snapshot/templates/dumps/dumpstages.erb
@@ -7,9 +7,14 @@
 
 # slots_used numcommands on_failure error_notify command
 
-<% if @stagestype == 'normal_huge' or @stagestype == 'partial_huge' or 
@stagestype == 'create_huge' %>
-# mark the start of the run for all huge wikis
-1 1 continue none <%= @stages['hugewikis']['firststage'] -%> --job createdirs 
--sleep 5
+<% if @stagestype == 'normal_enwiki' or @stagestype == 'partial_enwiki' or 
@stagestype == 'create_enwiki' %>
+# mark the start of the run for enwiki
+1 1 continue none <%= @stages['enwiki']['firststage'] -%> --job createdirs 
--sleep 5
+<% end -%>
+
+<% if @stagestype == 'normal_wikidatawiki' or @stagestype == 
'partial_wikidatawiki' or @stagestype == 'create_wikidatawiki' %>
+# mark the start of the run for wikidatawiki
+1 1 continue none <%= @stages['wikidatawiki']['firststage'] -%> --job 
createdirs --sleep 5
 <% end -%>
 
 <% if @stagestype == 'normal' or @stagestype == 'partial' or @stagestype == 
'create' %>
@@ -62,26 +67,50 @@
 
 <% end -%>
 
-<% if @stagestype == 'normal_huge' or @stagestype == 'partial_huge' or 
@stagestype == 'normal_nocreate_huge' or @stagestype == 'partial_nocreate_huge' 
%>
+<% if @stagestype == 'normal_enwiki' or @stagestype == 'partial_enwiki' or 
@stagestype == 'normal_nocreate_enwiki' or @stagestype == 
'partial_nocreate_enwiki' %>
 # stubs, stubs recombine
-27 1 continue none <%= @stages['hugewikis']['rest'] -%> --job 
xmlstubsdump,xmlstubsdumprecombine
+27 1 continue none <%= @stages['enwiki']['rest'] -%> --job 
xmlstubsdump,xmlstubsdumprecombine
 # tables next so inconsistencies between stubs and tables aren't too huge
-27 1 continue none <%= @stages['hugewikis']['rest'] -%> --job tables
+27 1 continue none <%= @stages['enwiki']['rest'] -%> --job tables
 
 # regular articles, recombine, multistream
-27 1 continue none <%= @stages['hugewikis']['rest'] -%> --job 
articlesdump,articlesdumprecombine,articlesmultistreamdump
+27 1 continue none <%= @stages['enwiki']['rest'] -%> --job 
articlesdump,articlesdumprecombine,articlesmultistreamdump
 
 # articles plus meta pages
-27 1 continue none <%= @stages['hugewikis']['rest'] -%> --job 
metacurrentdump,metacurrentdumprecombine
+27 1 continue none <%= @stages['enwiki']['rest'] -%> --job 
metacurrentdump,metacurrentdumprecombine
 
-<% if @stagestype == 'normal_huge' or @stagestype == 'normal_nocreate_huge' %>
+<% if @stagestype == 'normal_enwiki' or @stagestype == 
'normal_nocreate_enwiki' %>
 # all remaining jobs
-27 1 continue none <%= @stages['hugewikis']['rest'] %>
+27 1 continue none <%= @stages['enwiki']['rest'] %>
 <% end -%>
 
-<% if @stagestype == 'partial_huge' or @stagestype == 'partial_nocreate_huge' 
%>
+<% if @stagestype == 'partial_enwiki' or @stagestype == 
'partial_nocreate_enwiki' %>
 # all remaining jobs except for the history revs
-27 1 continue none <%= @stages['hugewikis']['rest'] %> <%= 
@stages['skipjob_args'] %>
+27 1 continue none <%= @stages['enwiki']['rest'] %> <%= 
@stages['skipjob_args'] %>
+<% end -%>
+
+<% end -%>
+
+<% if @stagestype == 'normal_wikidatawiki' or @stagestype == 
'partial_wikidatawiki' or @stagestype == 'normal_nocreate_wikidatawiki' or 
@stagestype == 'partial_nocreate_wikidatawiki' %>
+# stubs, stubs recombine
+27 1 continue none <%= @stages['wikidatawiki']['rest'] -%> --job 
xmlstubsdump,xmlstubsdumprecombine
+# tables next so inconsistencies between stubs and tables aren't too huge
+27 1 continue none <%= @stages['wikidatawiki']['rest'] -%> --job tables
+
+# regular articles, recombine, multistream
+27 1 continue none <%= @stages['wikidatawiki']['rest'] -%> --job 
articlesdump,articlesdumprecombine,articlesmultistreamdump
+
+# articles plus meta pages
+27 1 continue none <%= @stages['wikidatawiki']['rest'] -%> --job 
metacurrentdump,metacurrentdumprecombine
+
+<% if @stagestype == 'normal_wikidatawiki' or @stagestype == 
'normal_nocreate_wikidatawiki' %>
+# all remaining jobs
+27 1 continue none <%= @stages['wikidatawiki']['rest'] %>
+<% end -%>
+
+<% if @stagestype == 'partial_wikidatawiki' or @stagestype == 
'partial_nocreate_wikidatawiki' %>
+# all remaining jobs except for the history revs
+27 1 continue none <%= @stages['wikidatawiki']['rest'] %> <%= 
@stages['skipjob_args'] %>
 <% end -%>
 
 <% end -%>
diff --git a/modules/snapshot/templates/dumps/fulldumps.sh.erb 
b/modules/snapshot/templates/dumps/fulldumps.sh.erb
index 10e1e62..226469e 100644
--- a/modules/snapshot/templates/dumps/fulldumps.sh.erb
+++ b/modules/snapshot/templates/dumps/fulldumps.sh.erb
@@ -15,13 +15,14 @@
 # some if there are errors or parts that need to be rerun,
 # without requiring manual intervention for the next cron run.
 #
-# The script runs on a dedicated snapshot host for huge wikis
-# and on the rest of the snapshot hosts for regular wikis.
+# The script runs on a dedicated snapshot host for enwiki,
+# a dedicated host for wikidatawiki, and on the rest of the
+# snapshot hosts for regular wikis.
 
 usage(){
-    echo "Usage: $0 startdate enddate huge|regular full|partial"
-    echo "where huge or regular is the type of wikis to be dumped"
-    echo "(huge wikis or small/big wikis),"
+    echo "Usage: $0 startdate enddate enwiki|wikidatawiki|regular full|partial"
+    echo "where enwiki, wikidatawiki, or regular is the type of wikis to be 
dumped"
+    echo "(enwiki/wikidatawiki or small/big wikis),"
     echo "full or partial is the the type of wiki dump to be run"
     echo "(all steps including full page content history, or"
     echo "a partial dump excluding that step)"
@@ -150,10 +151,15 @@
 find "$logdir" -maxdepth 1 -name fulldumps_\* -a -mtime +90 -exec rm {} \;
 
 # create directories for the dump run for each group of wikis
-# (small, big, huge) as needed
+# (small, big, enwiki, wikidatawiki) as needed
 case $wikitype in
-    'hugewikis')
-        maybe_do_createdirs "wikidump.conf.hugewikis" 
"stages_create_hugewikis" "hugewikis"
+    'enwiki')
+        maybe_do_createdirs "wikidump.conf.enwiki" "stages_create_enwiki" 
"enwiki"
+        maybe_do_createdirs "wikidump.conf.bigwikis" "stages_create_bigwikis" 
"bigwikis"
+        maybe_do_createdirs "wikidump.conf" "stages_create_smallwikis" 
"smallwikis"
+        ;;
+    'wikidatawiki')
+        maybe_do_createdirs "wikidump.conf.wikidatawiki" 
"stages_create_wikidatawiki" "wikidatawiki"
         maybe_do_createdirs "wikidump.conf.bigwikis" "stages_create_bigwikis" 
"bigwikis"
         maybe_do_createdirs "wikidump.conf" "stages_create_smallwikis" 
"smallwikis"
         ;;
@@ -172,11 +178,13 @@
 
 case $dumptype in
     'full')
-       hugestages="stages_normal_nocreate_hugewikis"
+       enstages="stages_normal_nocreate_enwiki"
+       wikidatastages="stages_normal_nocreate_wikidatawiki"
        regularstages="stages_normal_nocreate"
        ;;
     'partial')
-       hugestages="stages_partial_nocreate_hugewikis"
+       engestages="stages_partial_nocreate_enwiki"
+       wikidatastages="stages_partial_nocreate_wikidatawiki"
        regularstages="stages_partial_nocreate"
        ;;
     '*')
@@ -186,9 +194,21 @@
 esac
 
 case $wikitype in
-    'hugewikis')
-        maybe_do_dumps "wikidump.conf.hugewikis" "$hugestages" "hugewikis" 
"$maxjobs"
-        # After huge wikis are done, check and do some of the rest if needed
+    'enwiki')
+        maybe_do_dumps "wikidump.conf.enwiki" "$enstages" "enwiki" "$maxjobs"
+        # After enwiki is done, check and do some of the rest if needed
+        # If not all the big wikis are done we will start a dump run covering 
small and big wikis
+        # Only wikis without complete dumps will be updated.
+        maybe_do_dumps "wikidump.conf.bigwikis" "$regularstages" "bigwikis" 
"$maxjobs"
+       if [ $? -ne 0 ]; then
+            # If we did not start a dump run above for small and big wikis, 
and the small wikis
+            # are not done, start such a run now.  Only wikis without complete 
dumps will be updated.
+            maybe_do_dumps "wikidump.conf" "$regularstages" "smallwikis" 
"$maxjobs"
+        fi
+        ;;
+    'wikidatawiki')
+        maybe_do_dumps "wikidump.conf.wikidatawiki" "$wikidatastages" 
"wikidatawiki" "$maxjobs"
+        # After wikidatawiki is done, check and do some of the rest if needed
         # If not all the big wikis are done we will start a dump run covering 
small and big wikis
         # Only wikis without complete dumps will be updated.
         maybe_do_dumps "wikidump.conf.bigwikis" "$regularstages" "bigwikis" 
"$maxjobs"

-- 
To view, visit https://gerrit.wikimedia.org/r/355100
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2026489388fdeb3483a3ea21ca4c21e5c20f0185
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to