jenkins-bot has submitted this change and it was merged.

Change subject: Add script to track wikidata dump downloads
......................................................................


Add script to track wikidata dump downloads

Bug: T119070
Change-Id: I8fff0bd875e9dcbd1015858dc2b4b7c830283a89
---
M cron/daily.03.sh
A src/wikidata/dumpDownloads.php
2 files changed, 102 insertions(+), 0 deletions(-)

Approvals:
  Addshore: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/cron/daily.03.sh b/cron/daily.03.sh
index 9e8db9c..f759564 100755
--- a/cron/daily.03.sh
+++ b/cron/daily.03.sh
@@ -37,6 +37,7 @@
 eval "$1/src/wikidata/phabricatorTasks.php"
 eval "$1/src/wikidata/showcaseItems.php"
 eval "$1/src/wikidata/specialEntityData.php"
+eval "$1/src/wikidata/dumpDownloads.php"
 eval "$1/src/catwatch/userprops.php"
 eval "$1/src/betafeatures/counts.php"
 
diff --git a/src/wikidata/dumpDownloads.php b/src/wikidata/dumpDownloads.php
new file mode 100755
index 0000000..51e32a5
--- /dev/null
+++ b/src/wikidata/dumpDownloads.php
@@ -0,0 +1,101 @@
+#!/usr/bin/php
+<?php
+/**
+ * @author Addshore
+ * Track the downloads of Wikidata dumps
+ * https://phabricator.wikimedia.org/T119070
+ */
+
+require_once( __DIR__ . '/../../lib/load.php' );
+Output::startScript( __FILE__ );
+
+// TODO get from puppet / config
+$logDirectory = '/a/log/webrequest/archive/dumps.wikimedia.org';
+
+// Types suffixed with a 1 represent the first part of a split dump (only count the first)
+$weeklyXmlTypes = array(
+       'pages-articles-multistream',
+       'pages-meta-history',
+       'pages-meta-current',
+       'pages-articles',
+);
+
+$regexSnips = array(
+       // latest-all.json.EXT
+       // wikidata-20160101-all.json.EXT
+       'full.json' =>
+               '(latest|wikidata-[0-9]{8})-all\.json\.(gz|bz2)',
+       // latest-all-BETA.ttl.EXT
+       // wikidata-20160101-all-BETA.ttl.EXT
+       'full.ttl_beta' =>
+               '(latest|wikidata-[0-9]{8})-all-BETA\.ttl\.(gz|bz2)',
+       // wikidatawiki-20160701-pages-articles-multistream.xml.bz2
+       // wikidatawiki-20160701-pages-meta-history.xml.bz2
+       // wikidatawiki-20160701-pages-meta-history1.xml-p000000001p000022835.7z
+       // wikidatawiki-20160701-pages-meta-current.xml.bz2
+       // wikidatawiki-20160720-pages-meta-current1.xml-p000000001p002421529.bz2
+       // wikidatawiki-20160720-pages-articles.xml.bz2
+       // wikidatawiki-20160720-pages-articles1.xml-p000000001p002421529.bz2
+       'full.xml' =>
+               // Note the '1?' and optional part range mean the first part of split dumps is also matched!
+               'wikidatawiki-(latest|[0-9]{8})-(' . implode( '|', $weeklyXmlTypes ) . ')1?\.xml(-p[0-9]+p[0-9]+)?\.(gz|bz2|7z)',
+       // wikidatawiki-20160626-pages-meta-hist-incr.xml.EXT
+       'incr.xml' =>
+               'wikidatawiki-[0-9]{8}-pages-meta-hist-incr\.xml\.(gz|bz2)',
+);
+
+// We will get data for 3 days ago. To do this we need the logs from 3 and 2 days ago.
+$targetDate = date('d/M/Y', strtotime('-3 days', time()));// For format [01/Jul/2016:
+$graphiteDate = date('Y-m-d', strtotime('-3 days', time()));// Date formatted for graphite
+$twoDaysAgo = date('Ymd', strtotime('-2 days', time()));
+$threeDaysAgo = date('Ymd', strtotime('-3 days', time()));
+
+$logFiles = array(
+       $logDirectory . DIRECTORY_SEPARATOR . 'access.log-' . $twoDaysAgo . '.gz',
+       $logDirectory . DIRECTORY_SEPARATOR . 'access.log-' . $threeDaysAgo . '.gz',
+);
+
+$counters = array();
+foreach( $logFiles as $logFile ) {
+       $handle = fopen( 'compress.zlib://' . $logFile, 'r' ); // the zlib stream wrapper decompresses the rotated .gz log on the fly
+       if ( $handle === false ) {
+               throw new RuntimeException( 'Failed to open file: ' . $logFile );
+       }
+
+       while ( ( $line = fgets( $handle ) ) !== false ) {
+               if(
+                       // Log line should contain our target date
+                       strpos( $line, "[$targetDate:" ) === false ||
+                       // And contain wikidatawiki in the request URI
+                       strpos( $line, '/wikidatawiki/' ) === false
+               ) {
+                       continue;
+               }
+
+               $statusCode = 0;
+               if( strpos( $line, ' 200 ' ) !== false ) {
+                       $statusCode = 200;
+               } elseif ( strpos( $line, ' 206 ' ) !== false ) {
+                       $statusCode = 206;
+               } else {
+                       // Only count 200 or 206 status codes
+                       continue;
+               }
+
+               foreach( $regexSnips as $type => $regexSnip ) {
+                       if ( preg_match( "/$regexSnip/i", $line ) ) {
+                               @$counters["$type.$statusCode"]++; // @ silences the notice when a counter key is first created
+                               // Once we have matched one type of dump, move on to the next line.
+                               break;
+                       }
+               }
+
+       }
+       fclose( $handle );
+}
+
+// Send everything to graphite!
+foreach( $counters as $type => $value ) {
+       $metricName = 'daily.wikidata.dump_requests.' . $type;
+       WikimediaGraphite::send( $metricName, $value, $graphiteDate );
+}
\ No newline at end of file

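For reference, a minimal sketch of the classification logic in the patch above: each request line is tested against the regex snippets in order, and the first type that matches is counted. The harness below is hypothetical and not part of the merged script; classifyLine() and the sample request line are illustrative only, with the filename taken from the examples in the patch comments.

#!/usr/bin/php
<?php
// Hypothetical harness (not in the repo): classify a request line with the
// same first-match-wins loop that dumpDownloads.php uses.
$regexSnips = array(
        // Same snippets as the script; only two shown here for brevity.
        'full.json' => '(latest|wikidata-[0-9]{8})-all\.json\.(gz|bz2)',
        'incr.xml' => 'wikidatawiki-[0-9]{8}-pages-meta-hist-incr\.xml\.(gz|bz2)',
);

function classifyLine( $line, array $regexSnips ) {
        foreach ( $regexSnips as $type => $regexSnip ) {
                if ( preg_match( "/$regexSnip/i", $line ) ) {
                        // First matching type wins, exactly as in the script's loop.
                        return $type;
                }
        }
        return null; // not a dump download
}

// Sample access-log line (path and layout assumed for illustration):
$line = '1.2.3.4 - - [01/Jul/2016:00:00:00 +0000] "GET /wikidatawiki/entities/latest-all.json.gz HTTP/1.1" 200 1234';
var_dump( classifyLine( $line, $regexSnips ) ); // string(9) "full.json"

Because preg_match() does an unanchored substring search, the snippets can match anywhere in the line, which is why the script first filters on the target date and '/wikidatawiki/' before running them.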
-- 
To view, visit https://gerrit.wikimedia.org/r/301504
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I8fff0bd875e9dcbd1015858dc2b4b7c830283a89
Gerrit-PatchSet: 2
Gerrit-Project: analytics/wmde/scripts
Gerrit-Branch: master
Gerrit-Owner: Addshore <[email protected]>
Gerrit-Reviewer: Addshore <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
