https://www.mediawiki.org/wiki/Special:Code/MediaWiki/105444
Revision: 105444
Author:   diederik
Date:     2011-12-07 17:36:45 +0000 (Wed, 07 Dec 2011)

Log Message:
-----------
Moved reportcard import utils from trunk/wikistats/analytics to trunk/extensions/MetricsReporting

Added Paths:
-----------
    trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl
    trunk/extensions/MetricsReporting/import/_readme.txt
    trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt
    trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh
    trunk/extensions/MetricsReporting/import/analytics_new.sh
    trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt
    trunk/extensions/MetricsReporting/import/analytics_upd.sh

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl (from rev 105441, trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl	                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl	2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,124 @@
+#!/usr/local/bin/perl
+
+# Reads StatisticsPerBinariesExtension.csv for each project and writes one
+# combined csv (analytics_in_binaries.csv) with rows
+# project,language,month,extension,count for import into the analytics database.
+
+  use Getopt::Std ;
+
+  &ParseArguments ;
+
+  print "Write file '$file_csv_out'\n" ;
+  open CSV_OUT, '>', $file_csv_out ;
+
+  foreach $project (qw (wb wk wn wp wq ws wv wx))
+  { &ReadStatisticsPerBinariesExtension ($project) ; }
+
+  close CSV_OUT ;
+
+  print "\n\nReady\n\n" ;
+  exit ;
+
+# Validate -i (input) / -o (output) folder arguments and derive the
+# output csv file name. Sets globals $path_in, $path_out, $file_csv_out.
+sub ParseArguments
+{
+  # Fix: was 'my @options' (an unused array) while getopt filled the
+  # unscoped package hash; declare the hash that is actually used.
+  my %options ;
+  getopt ("io", \%options) ;
+
+  die ("Specify input folder as: -i path")   if (! defined ($options {"i"})) ;
+  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
+
+  $path_in  = $options {"i"} ;
+  $path_out = $options {"o"} ;
+
+  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
+  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+
+  # tests only
+  # $path_in  = "C:/@ Wikimedia/# Out Bayes" ;
+  # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
+
+  print "Input folder: $path_in\n" ;
+  print "Output folder: $path_out\n" ;
+  print "\n" ;
+
+  $file_csv_out = "$path_out/analytics_in_binaries.csv" ;
+}
+
+# Parse one project's StatisticsPerBinariesExtension.csv. The file starts a
+# new wiki section with a '00/0000' header row listing extension names; all
+# later rows carry per-month counts in the same column order.
+sub ReadStatisticsPerBinariesExtension
+{
+  my $project = shift ;
+  my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ;
+  $yyyymm_hi = -1 ;
+
+  if (! -e $file_csv_in)
+  { die "Input file '$file_csv_in' not found" ; }
+
+  print "Read '$file_csv_in'\n" ;
+  open CSV_IN, '<', $file_csv_in ;
+
+  $language_prev = '' ;
+  while ($line = <CSV_IN>)
+  {
+    chomp $line ;
+    next if $line !~ /,.*?,/ ;
+
+    ($language,$date,$data) = split (',', $line, 3) ;
+
+    # for each wiki the first line shows extension names, not counts
+    if ($date eq "00/0000")
+    {
+      # flush the previous wiki's accumulated months before resetting state
+      if ($language_prev ne '')
+      { &WriteMonthlyData ($project, $language_prev) ; }
+      $language_prev = $language ;
+
+      undef %ext_name ;
+      undef %ext_ndx ;
+      undef %ext_cnt ;
+      undef %months ;
+
+      @exts = split (',', $data) ;
+      $ndx = 0 ;
+      foreach $ext (@exts)
+      {
+        $ext_name {$ndx} = $ext ;
+        $ext_ndx {$ext}  = $ndx ;
+        $ndx ++ ;
+      }
+      next ;
+    }
+
+    ($month,$year) = split ('\/', $date) ;
+    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+    if ($yyyymm gt $yyyymm_hi)
+    { $yyyymm_hi = $yyyymm ; }
+    $months {$yyyymm}++ ;
+
+    @counts = split (',', $data) ;
+    $ndx = 0 ;
+    foreach $count (@counts)
+    {
+      $ext_cnt {$yyyymm}{$ext_name {$ndx}} = $count ;
+      $ndx ++ ;
+    }
+  }
+  &WriteMonthlyData ($project, $language_prev) ;
+
+  close CSV_IN ;
+}
+
+# Emit up to 26 rows per month for one wiki, extensions ordered by their
+# count in the most recent month ($yyyymm_hi).
+sub WriteMonthlyData
+{
+  my ($project,$language) = @_ ;
+  # get sorted array of extensions, order by count for most recent month
+  %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ;
+  @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ;
+
+  foreach $month (sort keys %months)
+  {
+    $ndx = 0 ;
+    foreach $ext (@ext_cnt_yyyymm_hi)
+    {
+      # Fix: originally printed $ext_cnt{$yyyymm}{$ext_name{$ndx}} - $yyyymm is
+      # stale (last month parsed) and $ext_name{$ndx} indexes file order while
+      # $ext iterates by-count order, so rows paired an extension with another
+      # extension's count from the wrong month. Use the loop's own keys.
+      print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$month}{$ext}}\n" ;
+      # print "$month,$ext,${ext_cnt{$month}{$ext}}\n" ;
+      last if (++ $ndx > 25) ;
+    }
+  }
+}

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl (from rev 105441, trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl	                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl	2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,564 @@
+#!/usr/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email ezac...@wikimedia.org
+
+# Functionality:
+# comScore data can be downloaded as csv file, which each contain 14 months history
+# This script uses these files to update 'master' csv files which contain all known history
+# Note: only entities which are already in master file will be updated!
+# Then it merges these master files into one csv file which can be loaded into analytics database
+# Data are: reach by region, unique visitors by region, unique visitors by web property
+
+# Parameters (fixed: comment previously documented '-u' although the code reads '-i'):
+# -i (required) folder with 'update' csv files (files with latest 14 months history, produced by comScore)
+# -m (required) folder with 'master' csv files (files with all known history)
+# -o (required) folder for output file (database feed)
+# -r (optional) replace mode: overwrite existing master values instead of only adding new months
+
+# Output:
+# updated master csv files + merged and formatted csv for import in MySQL
+
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
+  use Getopt::Std ;
+  use Cwd;
+
+  # Fix: was 'my $options' (an unused scalar); declare the hash getopt fills.
+  my %options ;
+  getopt ("imo", \%options) ;
+
+  $true  = 1 ;
+  $false = 0 ;
+
+  $script_name    = "AnalyticsPrepComscoreData.pl" ;
+  $script_version = "0.31" ;
+
+# EZ test only
+# $source    = "comscore" ;
+# $server    = "ez_test" ;
+# $generated = "2011-05-06 00:00:00" ;
+# $user      = "ezachte" ;
+
+  $dir_in  = $options {"i"} ;
+  $dir_upd = $options {"m"} ;
+  $dir_out = $options {"o"} ;
+
+  $mode = 'add';
+  if (defined $options {'r'})
+  { $mode = 'replace'; }
+
+  print "Mode is $mode (specify '-r' for replace)\n\n";
+
+  # NOTE(review): on any machine without /home/ this silently overrides the
+  # command line arguments with hard-coded developer paths - confirm before reuse
+  if (! -d "/home/") # EZ test machine
+  {
+    $dir_in  = "C:/@ Wikimedia/@ Report Card/Data" ;
+    $dir_upd = "C:/MySQL/analytics" ;
+    $dir_out = "C:/MySQL/analytics" ;
+    $mode    = 'replace' ;
+  }
+
+  if ($dir_in eq '')
+  { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
+  if ($dir_upd eq '')
+  { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
+  if ($dir_out eq '')
+  { Abort ("Specify folder for output file '-o folder'") ; }
+
+  $file_comscore_reach_master       = "history_comscore_reach_regions.csv" ;
+  $file_comscore_reach_update       = "*reach*by*region*csv" ;
+  $file_comscore_uv_region_master   = "history_comscore_UV_regions.csv" ;
+  $file_comscore_uv_region_update   = "*UVs*by*region*csv" ;
+  $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
+  $file_comscore_uv_property_update = "*UV*trend*csv" ;
+
+  $layout_csv_reach      = 1 ;
+  $layout_csv_regions    = 2 ;
+  $layout_csv_properties = 3 ;
+
+  print "Directories:\n" .
+        "Input (new comScore data): '$dir_in'\n".
+        "Master files (full history): '$dir_upd'\n" .
+        "Output (database feed): '$dir_out'\n\n" ;
+
+  %region_codes = (
+    "Europe"=>"EU",
+    "North America"=>"NA",
+    "Latin America"=>"LA",
+    "World-Wide" => "W",
+    "Middle East - Africa" => "MA",
+    "Asia Pacific"=> "AS",
+    "United States" => "US",
+    "India" => "I",
+    "China" => "C"
+  ) ;
+
+  foreach $region_name (keys %region_codes)
+  { $region_names {$region_codes {$region_name}} = $region_name ; }
+
+  @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
+
+  &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
+  %reach_region_code = %data ;
+
+  &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
+  %visitors_region_code = %data ;
+
+  &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
+  %visitors_web_property = %data ;
+
+  &WriteDataAnalytics ;
+
+  print "\nReady\n\n" ;
+  exit ;
+
+# Load the reach-per-region master csv into %data/%months, merge in the most
+# recent comScore update file, and rewrite the master (old copy kept as .~).
+sub ReadMasterComscoreDataReachPerRegion
+{
+  # Fix: the caller passes 5 arguments (incl. multiplier) but the original
+  # unpacked only 4, so $layout_csv silently received the multiplier value.
+  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
+
+  print "ReadMasterComscoreDataReachPerRegion\n\n" ;
+
+  undef %months ;
+  undef %data ;
+  undef @regions ;
+
+  open IN, '<', "$dir_upd/$file_comscore_master" ;
+
+  $lines = 0 ;
+  while ($line = <IN>)
+  {
+    chomp $line ;
+    $line =~ s/\r//g ;
+
+    ($yyyymm,@data) = split (',', $line) ;
+
+    if ($lines++ == 0)
+    {
+      # first line is the header with region names
+      @regions = @data ;
+      print "Regions found: " . (join ',', @regions) . "\n";
+      next ;
+    }
+
+    $field_ndx = 0 ;
+    foreach (@data)
+    {
+      $region      = $regions [$field_ndx] ;
+      $region_code = $region_codes {$region} ;
+
+      $data = $data [$field_ndx] ;
+      if ($data eq '')
+      { $data = '0' ; }
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$region_code"} = $data ;
+      # print "Old data $yyyymm,$region_code = $data\n" ;
+      $field_ndx++ ;
+    }
+  }
+  close IN ;
+
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
+  return if ! $updates_found ;
+
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT, '>', "$dir_upd/$file_comscore_master" ;
+
+  $line_out = "yyyymm" ;
+  foreach $region_name (@regions)
+  { $line_out .= ",$region_name" ; }
+  print OUT "$line_out" ;
+
+  # newest month first
+  foreach $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    $line_out = "\n$yyyymm" ;
+    foreach $region_name (@regions)
+    {
+      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
+      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+    }
+    print OUT "$line_out" ;
+  }
+
+  close OUT ;
+}
+
+# Same flow as ReadMasterComscoreDataReachPerRegion, for unique visitors per
+# region; input lines are first cleaned with GetNumberOnly and filtered.
+sub ReadMasterComscoreDataVisitorsPerRegion
+{
+  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
+
+  print "ReadMasterComscoreDataVisitorsPerRegion\n\n";
+
+  undef %months ;
+  undef %data ;
+  undef @regions ;
+
+  open IN, '<', "$dir_upd/$file_comscore_master" ;
+
+  $lines = 0 ;
+  $metric = 'unique_visitors' ;
+  while ($line = <IN>)
+  {
+    chomp $line ;
+    $line =~ s/\r//g ;
+    $line = &GetNumberOnly ($line) ;
+
+    # keep only the header line and yyyy-mm data lines
+    next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;
+
+    ($yyyymm,@data) = split (',', $line) ;
+
+    if ($lines++ == 0)
+    {
+      @regions = @data ;
+      print "Regions found: " . (join ',', @regions) . "\n";
+      next ;
+    }
+
+    $field_ndx = 0 ;
+    foreach (@data)
+    {
+      $region      = $regions [$field_ndx] ;
+      $region_code = $region_codes {$region} ;
+
+      $data = $data [$field_ndx] ;
+      if ($data eq '')
+      { $data = '0' ; }
+
+      # print "Old data $yyyymm,$region = $data\n" ;
+
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$region_code"} = $data ;
+
+      $field_ndx++ ;
+    }
+  }
+  close IN ;
+
+  # was a literal 1000 here; pass the multiplier parameter (same value)
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
+  return if ! $updates_found ;
+
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT, '>', "$dir_upd/$file_comscore_master" ;
+
+  $line_out = "yyyymm" ;
+  foreach $region_name (@regions)
+  { $line_out .= ",$region_name" ; }
+  print OUT "$line_out" ;
+
+  foreach $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    $line_out = "\n$yyyymm" ;
+    foreach $region_name (@regions)
+    {
+      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
+      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
+    }
+    print OUT "$line_out" ;
+  }
+
+  close OUT ;
+}
+
+# Same flow again, for unique visitors per web property (Google, Wikimedia, ...).
+sub ReadMasterComscoreDataVisitorsPerProperty
+{
+  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
+
+  print "ReadMasterComscoreDataVisitorsPerProperty\n\n";
+
+  undef %months ;
+  undef %data ;
+  undef @properties ;
+
+  open IN, '<', "$dir_upd/$file_comscore_master" ;
+
+  $lines = 0 ;
+  $metric = 'unique_visitors' ;
+  while ($line = <IN>)
+  {
+    chomp $line ;
+    $line =~ s/\r//g ;
+
+    ($yyyymm,@data) = split (',', $line) ;
+    if ($lines++ == 0)
+    { @properties = @data ; next ; }
+
+    $field_ndx = 0 ;
+    foreach (@data)
+    {
+      $property = $properties [$field_ndx] ;
+      $property =~ s/.*Yahoo.*/Yahoo/ ;
+      $data = $data [$field_ndx] ;
+      if ($data eq '')
+      { $data = '0' ; }
+
+      # print "Old data $yyyymm,$property = $data\n" ;
+
+      $months {$yyyymm} ++ ;
+      $data {"$yyyymm,$property"} = $data ;
+
+      $field_ndx++ ;
+    }
+  }
+  close IN ;
+
+  # was a literal 1000 here; pass the multiplier parameter (same value)
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @properties) ;
+  return if ! $updates_found ;
+
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT, '>', "$dir_upd/$file_comscore_master" ;
+
+  $line_out = "yyyymm" ;
+  foreach $property (@properties)
+  { $line_out .= ",$property" ; }
+  print OUT "$line_out" ;
+
+  foreach $yyyymm (sort {$b cmp $a} keys %months)
+  {
+    $line_out = "\n$yyyymm" ;
+    foreach $property (@properties)
+    {
+      $yyyymm_property = "$yyyymm,$property" ;
+      $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
+    }
+    print OUT "$line_out" ;
+  }
+
+  close OUT ;
+}
+
+# Find the newest comScore download matching $file_comscore_updates in $dir_in,
+# parse it, and merge its values into %data/%months. Only entities already in
+# the master header (@white_list) are accepted. Returns true if %data changed.
+sub UpdateMasterFileFromRecentComscoreData
+{
+  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ;
+
+  print "UpdateMasterFileFromRecentComscoreData\n\n";
+
+  undef %white_list ;
+  undef %not_white_listed ;
+
+  print "White list: ". (join (',', @white_list)) . "\n\n";
+
+  foreach $id (@white_list)
+  { $white_list {$id} = $true ; }
+
+  if (! -e "$dir_upd/$file_comscore_master")
+  { Abort ("File $file_comscore_master not found!") ; }
+
+  $age_all = -M "$dir_upd/$file_comscore_master" ;
+  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;
+
+  my $cwd = getcwd ;
+  chdir $dir_in ;
+
+  # pick the most recently modified file that matches the glob pattern
+  @files = glob($file_comscore_updates) ;
+  $min_age_upd = 999999 ;
+  $file_comscore_updates_latest = '' ;
+  foreach $file (@files)
+  {
+    $age = -M $file ;
+    if ($age < $min_age_upd)
+    {
+      $min_age_upd = $age ;
+      $file_comscore_updates_latest = $file ;
+    }
+  }
+  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;
+
+  if ($min_age_upd == 999999)
+  {
+    print "No valid update file found. Nothing to update." ;
+    return ;
+  }
+
+  #if ($age_all > $min_age_upd)
+  #{
+  #  print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
+  #  return ;
+  #}
+
+  my $updates_found = $false ;
+
+  open CSV, '<', $file_comscore_updates_latest ;
+  binmode CSV ;
+  while ($line = <CSV>)
+  {
+    chomp $line ;
+    $line =~ s/\r//g ;
+    $line = &GetNumberOnly ($line) ;
+
+    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
+    {
+      if ($layout_csv == $layout_csv_properties)
+      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
+      else
+      { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files
+
+      @months = &mmm_yyyy2yyyy_mm (@months) ;
+    }
+
+    if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
+    {
+      if ($layout_csv == $layout_csv_properties)
+      {
+        ($index,$dummy,$property,@data) = split (',', $line) ;
+        $property =~ s/^\s+// ;
+        $property =~ s/\s+$// ;
+
+        # normalize comScore's verbose property names to canonical short names
+        $property =~ s/.*Total Internet.*/Total Internet/i ;
+        $property =~ s/.*Google.*/Google/i ;
+        $property =~ s/.*Microsoft.*/Microsoft/i ;
+        $property =~ s/.*FACEBOOK.*/Facebook/i ;
+        $property =~ s/.*Yahoo.*/Yahoo/i ;
+        $property =~ s/.*Amazon.*/Amazon/i ;
+        $property =~ s/.*Apple.*/Apple/i ;
+        $property =~ s/.*AOL.*/AOL/i ;
+        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
+        $property =~ s/.*Tencent.*/Tencent/i ;
+        $property =~ s/.*Baidu.*/Baidu/i ;
+        $property =~ s/.*CBS.*/CBS/i ;
+
+        if (! $white_list {$property})
+        {
+          $not_white_listed {$property}++ ;
+          next ;
+        }
+
+        $id = $property ;
+      }
+      else
+      {
+        ($index,$region,@data) = split (',', $line) ;
+        $region =~ s/^\s+// ;
+        $region =~ s/\s+$// ;
+
+        if (! $white_list {$region})
+        {
+          $not_white_listed {$region}++ ;
+          next ;
+        }
+
+        $id = $region_codes {$region} ;
+      }
+
+      for ($m = 0 ; $m <= $#months ; $m++)
+      {
+        $yyyymm = $months [$m] ;
+        $months {$yyyymm} ++ ;
+        $yyyymm_id = "$yyyymm,$id" ;
+        $data = $data [$m] * $multiplier ;
+
+        if ($mode eq 'add')
+        {
+          # add mode: only fill months the master does not have yet
+          if (! defined $data {$yyyymm_id})
+          {
+            $updates_found = $true ;
+            print "New data found: $yyyymm_id = $data\n" ;
+            $data {$yyyymm_id} = $data ;
+          }
+        }
+        else
+        {
+          # replace mode: always take the comScore value
+          $updates_found = $true ;
+          print "Data found: $yyyymm_id = $data\n" ;
+          $data {$yyyymm_id} = $data ;
+        }
+      }
+    }
+  }
+
+  $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
+  if ($entities_not_white_listed ne '')
+  { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }
+
+  if (! $updates_found)
+  { print "No new updates found\n" ; }
+  else
+  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }
+
+  return ($updates_found) ;
+}
+
+# Merge the three metric hashes into one analytics_in_comscore.csv feed:
+# yyyymm,country,region,property,project,reach,visitors (-1 = not available).
+sub WriteDataAnalytics
+{
+  print "WriteDataAnalytics\n\n";
+
+  open OUT, '>', "$dir_out/analytics_in_comscore.csv" ;
+
+  $metric = 'unique_visitors' ;
+  foreach $yyyymm (sort keys %months)
+  {
+    # store meta data elsewhere
+    # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;
+    foreach $region_code (sort values %region_codes)
+    {
+      $country_code = '-' ;
+      $property     = '-' ;
+      $project      = '-' ;
+      $reach    = $reach_region_code    {"$yyyymm,$region_code"} ;
+      $visitors = $visitors_region_code {"$yyyymm,$region_code"} ;
+
+      if (! defined $reach)    { $reach    = -1 ; }
+      if (! defined $visitors) { $visitors = -1 ; }
+
+      next if $reach == -1 and $visitors == -1 ;
+
+      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+      print OUT $line ;
+      print $line ;
+    }
+
+    foreach $property (sort @properties)
+    {
+      $country_code = '-' ;
+      $region_code  = '-' ;
+      $project      = '-' ;
+      $reach        = '-1' ;
+      $visitors = $visitors_web_property {"$yyyymm,$property"} ;
+
+      next if ! defined $visitors ;
+
+      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
+      print OUT $line ;
+      # print $line ;
+    }
+  }
+}
+
+# Strip quoting from a csv line: removes commas inside double-quoted fields,
+# then removes the quotes themselves.
+sub GetNumberOnly
+{
+  my $line = shift ;
+  $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes
+  $line =~ s/"//g ;
+  return $line ;
+}
+
+# Convert month labels like 'Jan-2010' to '2010-01' (in place, returns list).
+sub mmm_yyyy2yyyy_mm
+{
+  my @months = @_ ;
+  my ($m) ;
+  # Jan -> 01, etc
+  foreach $month (@months)
+  {
+    my ($mmm,$yyyy) = split ('-', $month) ;
+    for ($m = 0 ; $m <= $#months_short ; $m++)
+    {
+      if ($mmm eq $months_short [$m])
+      {
+        $month = "$yyyy-" . sprintf ("%02d", $m+1) ;
+        last ;
+      }
+    }
+  }
+  return @months ;
+}
+
+# Print a reason and terminate the script.
+sub Abort
+{
+  $msg = shift ;
+
+  print "\nAbort, reason: $msg\n\n" ;
+  exit ;
+}

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl (from rev 105441, trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl	                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl	2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,162 @@
+#!/usr/bin/perl
+
+# This module prepares a csv file with language names for feed into the analytics database
+# The main work of collecting these names from different sources is done by
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
+# which is part of wikistats reporting phase and once each month updates local master csv files from all sources
+
+# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization
+# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle
+
+# Sources for language names:
+# - php files
+# - translatewiki
+# - English Wikipedia API (interwikilinks)
+
+# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations
+
+  use Getopt::Std ;
+
+  $true  = 1 ;
+  $false = 0 ;
+
+  $max_language_name = 50 ; # truncate if longer
+
+  $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ;
+  $file_csv_language_names_wp  = "LanguageNamesViaWpEnEdited.csv" ;
+  $file_csv_analytics_in       = "analytics_in_language_names.csv" ;
+
+  $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting
+  $languages_filter = "de|en" ;
+  foreach $language (split '\|', $languages_filter)
+  { $include_target_language {$language} = $true ; }
+
+  &ParseArguments ;
+  &ReadCsvFiles ;
+  &WriteCsvFile ;
+
+# if ($language eq "id") # swap which file takes precedence
+
+  print "\nReady\n\n" ;
+  exit ;
+
+# Validate -i/-o arguments, strip trailing slashes and build full file paths.
+sub ParseArguments
+{
+  my (@options, $arguments) ;
+
+  getopt ("io", \%options) ;
+
+  foreach $arg (sort keys %options)
+  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
+  print ("\nArguments\n$arguments\n") ;
+
+  # Fix: these EZ-test overrides were active unconditionally and clobbered
+  # whatever -i/-o the user supplied on the command line; now commented out.
+  # $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test
+  # $options {"o"} = "c:/MySQL/analytics" ;    # EZ test
+
+  die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
+  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
+
+  ($path_in  = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
+  ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
+
+  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
+  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+
+  print "Input folder: $path_in\n" ;
+  print "Output folder: $path_out\n\n" ;
+
+  $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
+  $file_csv_language_names_wp  = "$path_in/$file_csv_language_names_wp" ;
+  $file_csv_analytics_in       = "$path_out/$file_csv_analytics_in" ;
+
+  die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ;
+  die "Input file '$file_csv_language_names_wp' not found"  if (! -e $file_csv_language_names_wp) ;
+}
+
+# Fill %out_languages{target}{code} from both source csv files.
+sub ReadCsvFiles
+{
+  # first read definitions from php message files, then overwrite with definitions from interwiki links when available
+  # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable
+
+  open CSV_IN, "<", $file_csv_language_names_php ;
+  while ($line = <CSV_IN>)
+  {
+    chomp ($line) ;
+    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
+
+    next if ! $include_target_language {$target_language} ;
+
+    $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
+  }
+  close CSV_IN ;
+
+  open CSV_IN, "<", $file_csv_language_names_wp ;
+  while ($line = <CSV_IN>)
+  {
+    chomp ($line) ;
+    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
+
+    next if ! $include_target_language {$target_language} ;
+
+    next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ;
+
+    # $name_unicode_php = $out_languages {$target_language} {$code} ;           # test only
+    # $name_unicode_wp  = &FormatName ($target_language, $name_unicode) ;       # test only
+    # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only
+    # { print "$name_unicode_php => $name_unicode_wp\n" ; }                     # test only
+
+    $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
+  }
+  close CSV_IN ;
+}
+
+# Normalize one language name: strip German '-sprache' suffix, then apply
+# per-target-language casing rules.
+sub FormatName
+{
+  my ($target_language, $name_unicode) = @_ ;
+
+  $name_unicode2 = $name_unicode ;
+
+  if ($target_language eq "de")
+  { $name_unicode =~ s/e?\s*\-?sprache//i ; }
+
+  if ($target_language =~ /^(?:$languages_force_case_uc)/)
+  { $name_unicode = ucfirst $name_unicode ; }
+  else
+  { $name_unicode = lc $name_unicode ; }
+
+# Test only
+# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2))
+# { print "$name_unicode2 => $name_unicode\n" ; }
+
+  return ($name_unicode) ;
+}
+
+# Write analytics_in_language_names.csv: target_language,code,name (name
+# truncated to $max_language_name chars and quoted when it contains a comma).
sub WriteCsvFile
+{
+  # Fix: '|| die' bound to the file name (always true), so a failed open could
+  # never trigger die; low-precedence 'or' restores the intended check.
+  open CSV_OUT, ">", $file_csv_analytics_in or die ("File '$file_csv_analytics_in' could not be opened") ;
+  binmode CSV_OUT ; # force Unix style linebreak \012
+
+  foreach $target_language (sort keys %out_languages)
+  {
+    print "\nTarget language '$target_language'\n" ;
+    %translations = %{$out_languages {$target_language}} ;
+
+    foreach $code (sort keys %translations)
+    {
+      $language_name  = $translations{$code} ;
+      $language_name2 = substr ($language_name,0,$max_language_name) ;
+
+      if ($language_name ne $language_name2)
+      { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; }
+
+      if ($language_name2 =~ /,/)
+      { $language_name2 = "\"$language_name2\"" ; }
+      # test only
+      print CSV_OUT "$target_language,$code,$language_name2\n" ;
+    }
+  }
+
+  close CSV_OUT ;
+}

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl (from rev 105441, trunk/wikistats/analytics/AnalyticsPrepPageViews.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl	                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl	2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,11 @@
+#!/usr/bin/perl
+
+# This file is just a place holder (alias)
+# Page views per day/week/month/etc, per project, per language, normalized and not normalized are collected by
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
+#
+# Added May 2011:
+# For analytics database one file is written for all projects and languages combined,
+# with per month, not normalized and normalized page view counts in one row.
+# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl +# and needs to be copied to analytics folder later \ No newline at end of file Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl (from rev 105441, trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl) =================================================================== --- trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl (rev 0) +++ trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,334 @@ +#!/usr/local/bin/perl + +# Copyright (C) 2011 Wikimedia Foundation +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License version 2 +# as published by the Free Software Foundation. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# See the GNU General Public License for more details, at +# http://www.fsf.org/licenses/gpl.html + +# Author: +# Erik Zachte, email ezac...@wikimedia.org +# loosely based on predecessor +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl + +# Functionality: +# tba + +# Parameters: +# tba + +# Output: +# updated csv file for import in MySQL + +# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ + + use Getopt::Std ; + + $true = 1 ; + $false = 0 ; + + @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ; + + $file_csv_monthly_data = "StatisticsMonthly.csv" ; + $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ; + $file_csv_analytics_in = "analytics_in_wikistats.csv" ; + + &ParseArguments ; + &ReadStatisticsMonthly ; + &FindLargestWikis ; + &WriteMonthlyData ; + + print "\nReady\n\n" ; + exit ; + +sub ParseArguments +{ + print "ParseArguments\n" ; + my (@options, $arguments) ; + + getopt ("io", \%options) ; + + foreach $arg (sort keys %options) + { $arguments .= " -$arg " . $options {$arg} . "\n" ; } + print ("\nArguments\n$arguments\n") ; + + if (! -d '/mnt/') # EZ test + { + $path_in = "c:/\@ wikimedia/# out bayes" ; + $path_out = "c:/MySQL/analytics" ; + } + else + { + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; + + $path_in = $options {"i"} ; + $path_out = $options {"o"} ; + } + + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; + die "Output folder '$path_out' does not exist" if (! 
-d $path_out) ; + + print "Input folder: $path_in\n" ; + print "Output folder: $path_out\n\n" ; + + $file_csv_out = "$path_out/analytics_in_wikistats.csv" ; +} + +sub ReadStatisticsMonthly +{ + print "ReadStatisticsMonthly\n" ; + &ReadStatisticsMonthlyForProject ("wb") ; + &ReadStatisticsMonthlyForProject ("wk") ; + &ReadStatisticsMonthlyForProject ("wn") ; + &ReadStatisticsMonthlyForProject ("wp") ; + &ReadStatisticsMonthlyForProject ("wq") ; + &ReadStatisticsMonthlyForProject ("ws") ; + &ReadStatisticsMonthlyForProject ("wv") ; + &ReadStatisticsMonthlyForProject ("wx") ; + +# &ReadStatisticsPerBinariesExtensionCommons ; +} + +sub ReadStatisticsMonthlyForProject +{ + my $project = shift; + $all_projects = "*" ; + + my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ; + my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ; + + if (! -e $file_csv_in_1) + { &Abort ("Input file '$file_csv_in_1' not found") ; } + if (! -e $file_csv_in_2) + { &Abort ("Input file '$file_csv_in_2' not found") ; } + + my $yyyymm ; + + print "Read '$file_csv_in_1'\n" ; + open CSV_IN, '<', $file_csv_in_1 ; + while ($line = <CSV_IN>) + { + chomp $line ; + ($language,$date,$counts) = split (',', $line, 3) ; + @fields = split (',', $counts) ; + + next if ! &AcceptWiki ($project,$language) ; + + ($month,$day,$year) = split ('\/', $date) ; + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; + + foreach $field (@fields) + { + if ($field eq '-') + { $field = 0 ; } + } + + $data = $fields [0] . ',' . # contributors all time + $fields [1] . ',' . # new contributors + 'data2,' . # place holder for more data, to be inserted later + $fields [4] . ',' . # articles + $fields [6] . ',' . # articles new per day + $fields [9] . ',' . # larger than 0.5 kB + $fields [10] . ',' . # larger than 2.0 kB + $fields [7] . ',' . # mean edits per article + $fields [8] . ',' . # mean bytes per article + $fields [11] . ',' . # edits + $fields [12] . ',' . 
# size in bytes + $fields [13] . ',' . # size in words + $fields [14] . ',' . # links internal + $fields [15] . ',' . # links interwiki + $fields [16] . ',' . # links images + $fields [17] . ',' . # links external + $fields [18] ; # redirects + + $data1 {"$project,$language,$yyyymm"} = $data ; + } + close CSV_IN ; + + # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv) + + print "Read '$file_csv_in_2'\n" ; + open CSV_IN, '<', $file_csv_in_2 ; + while ($line = <CSV_IN>) + { + chomp $line ; + ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ; + + next if ! &AcceptWiki ($project,$language) ; + + if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot + if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces + + ($month,$day,$year) = split ('\/', $date) ; + $yyyymm = sprintf ("%04d-%02d", $year, $month) ; + $months {$yyyymm} ++ ; +# print "YYYYMM $yyyymm\n" ; + + # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm + # count user with over x edits + # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc + # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ; + $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ; + $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ; + $edits_ge_100 = @counts [7] > 0 ? 
@counts [7] : 0 ; + $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ; + + $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ; + $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ; + $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ; + + # prep string with right amount of comma's + if ($data2_default eq '') + { + $data2_default = $data2 {"$project,$language,$yyyymm"} ; + $data2_default =~ s/[^,]+/0/g ; + } + } + close CSV_IN ; +} + +#sub ReadStatisticsPerBinariesExtensionCommons +#{ +# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ; +# my $mmax = -1 ; + +# if (! -e $file_csv_in) +# { &Abort ("Input file '$file_csv_in' not found") ; } + +# print "Read '$file_csv_in'\n" ; +# open CSV_IN, '<', $file_csv_in ; +# while ($line = <CSV_IN>) +# { +# chomp $line ; +# ($language,$date,$counts) = split (',', $line, 3) ; + +# if ($language ne "commons") { next ; } + +# if ($date eq "00/0000") +# { +# @fields = split (',', $counts) ; +# $field_ndx = 0 ; +# foreach $field (@fields) +# { +# $ext_cnt {-1}{$field_ndx} = $field ; +# # print "EXT_CNT $field_ndx : $field\n" ; +# $field_ndx ++ ; +# } +# next ; +# } + +# ($month,$year) = split ('\/', $date) ; +# my $m = &months_since_2000_01 ($year,$month) ; +# next if $m < $m_start ; + +# if ($m > $mmax) +# { $mmax = $m ; } + +# @fields = split (',', $counts) ; +# $field_ndx = 0 ; +# foreach $field (@fields) +# { +# $ext_cnt {$m}{$field_ndx} = $field ; +# $ext_tot {$m} += $field ; +# $field_ndx ++ ; +# } +# } +# close CSV_IN ; + +# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ; +# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ; + +# $extcnt = 0 ; +# foreach $extndx (@ext_cnt_mmax) +# { +# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ; +# push @extndxs, $extndx ; +# if ($extcnt++ >= 9) { last ; } +# } +#} + +sub FindLargestWikis +{ + print "FindLargestWikis\n" ; + print "Largest projects (most 
accumulated very active editors):\n"; + @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ; + $rank = 0 ; + foreach $project_language (@total_edits_ge_100) + { + $largest_projects {$project_language} = $rank++ ; + print "$project_language," ; + last if $rank > 10 ; + } + print "\n\n" ; + + foreach $yyyymm (sort keys %months) + { + next if $yyyymm lt '2011' ; + foreach $project_language (keys %largest_projects) + { + ($project,$language) = split (',', $project_language) ; + if ($data2 {"$project,$language,$yyyymm"} eq '') + { + print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ; + $months {$yyyymm} = 0 ; + } + } + } +} + +sub WriteMonthlyData +{ + print "WriteMonthlyData\n" ; + my $file_csv_out = "$path_out/$file_csv_analytics_in" ; + open CSV_OUT, '>', $file_csv_out ; + foreach $project_wiki_month (sort keys %data1) + { + ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ; + + # recent month misses one or more large wikis? 
+ next if $months {$yyyymm} == 0 ; + + $data1 = $data1 {$project_wiki_month} ; + $data2 = $data2 {$project_wiki_month} ; + if ($data2 eq '') + { + print "Editor data missing for $project_wiki_month\n" ; + $data2 = $data2_default ; + } + $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together + print CSV_OUT "$project_wiki_month,$data1\n" ; + } + $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ; + $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ; + $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ; + close CSV_OUT ; +} + +sub AcceptWiki +{ + my ($project,$language) = @_ ; + + return $false if $language eq 'commons' and $project ne 'wx' ; # commons also in wikipedia csv files (bug, hard to cleanup, just skip) + return $false if $language eq 'sr' and $project eq 'wn' ; # ignore insane bot spam on + return $false if $language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i ; + + return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) but site not, tlh=Klingon, ru-sib=Siberian + return $false if $project eq 'wk' and ($language eq "als" or $language eq "tlh") ; + + return $true ; +} + +sub Abort +{ + my $msg = shift ; + print "$msg\nExecution aborted." ; + # to do: log also to file + exit ; +} + + Copied: trunk/extensions/MetricsReporting/import/_readme.txt (from rev 105441, trunk/wikistats/analytics/_readme.txt) =================================================================== --- trunk/extensions/MetricsReporting/import/_readme.txt (rev 0) +++ trunk/extensions/MetricsReporting/import/_readme.txt 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,21 @@ +This folder contains bash and perl files to create and fill database 'analytics', a.o. for monthly dashboard. + +== analytics_new.sh == +Defines the database and tables and loads data from existing csv files. 
+It executes SQL from analytics_create_and_load_from_csv.txt + +== analytics_upd.sh == +Prepares new csv files (delegated to analytics_generate_csv_files.sh), +and empties/reloads all tables for which csv files are in this folder. +It executes SQL from analytics_refresh_from_csv.txt + +== CSV files == +CSV files and where they are generated: + +analytics_in_binaries.csv <- AnalyticsPrepBinariesData.pl +analytics_in_comscore.csv <- AnalyticsPrepComscoreData.pl +analytics_in_comscore_regions.csv <- manual +analytics_in_language_names.csv +analytics_in_offline.csv <- manual +analytics_in_page_views.csv <- /home/ezachte/wikistats/pageviews_monthly.sh (file copy after that) +analytics_in_wikistats.csv <- AnalyticsPrepWikiCountsOutput.pl Copied: trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt (from rev 105441, trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt) =================================================================== --- trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt (rev 0) +++ trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,181 @@ +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) + +-- tables implemented: +-- comscore +-- comscore_regions +-- wikistats +-- page_views +-- language names +-- binaries + +-- more tables planned (O= optional, not needed for report card stage) +-- project_names +-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) +-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) + + +-- open issues: +-- only store basic data in database and calculate all aggregates on the fly or do some aggregation before 
hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) +-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) + +-- Create database and two tables from scratch +DROP DATABASE IF EXISTS `analytics` ; + +CREATE DATABASE `analytics` ; + +USE `analytics` ; + +CREATE TABLE `comscore` ( + `date` date NOT NULL, + `country_code` varchar (3), + `region_code` varchar (3), + `web_property` varchar (20), + `project_code` varchar (10), + `reach` decimal (4,1) DEFAULT NULL, + `visitors` decimal (15) DEFAULT NULL, + PRIMARY KEY (date,country_code,region_code,project_code,web_property), + KEY (`country_code`) +) ; + +CREATE TABLE `comscore_regions` ( + `region_code` varchar (2), + `report_language` varchar (10), + `region_name` varchar (18), + PRIMARY KEY (region_code,report_language) +) ; + +CREATE TABLE `wikistats` ( + `date` date NOT NULL, + `project_code` varchar (10), + `language_code` varchar (15), + `editors_all_time` int (10) DEFAULT NULL, + `editors_new` int (7) DEFAULT NULL, + `editors_ge_5` int (7) DEFAULT NULL, + `editors_ge_25` int (7) DEFAULT NULL, + `editors_ge_100` int (7) DEFAULT NULL, + `articles` int (12) DEFAULT NULL, + `articles_new_per_day` int (9) DEFAULT NULL, + `articles_over_bytes_500` int (12) DEFAULT NULL, + `articles_over_bytes_2000` int (12) DEFAULT NULL, + `edits_per_article` decimal (9,1) DEFAULT NULL, + `bytes_per_article` decimal (9,1) DEFAULT NULL, + `edits` int (12) DEFAULT NULL, + `size_in_bytes` int (15) DEFAULT NULL, + `size_in_words` int (15) DEFAULT NULL, + `links_internal` int (15) DEFAULT NULL, + `links_interwiki` int (15) DEFAULT NULL, + `links_image` int (15) DEFAULT NULL, + `links_external` int (15) DEFAULT NULL, + `redirects` int (15) DEFAULT NULL, + PRIMARY KEY (date,project_code,language_code) +) ; + +CREATE TABLE `page_views` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + 
`views_non_mobile_raw` bigint (15), + `views_mobile_raw` bigint (15), + `views_non_mobile_normalized` bigint (15), + `views_mobile_normalized` bigint (15), + `views_raw` bigint (15), + `views_normalized` bigint (15), + PRIMARY KEY (date,project_code,language_code), + KEY (date,language_code,project_code) +) ; + +CREATE TABLE `language_names` ( + `report_language` varchar (15), + `language_code` varchar (15), + `language_name` varchar (50), + PRIMARY KEY (report_language,language_code) +) ; + +CREATE TABLE `binaries` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + `extension` varchar (10), + `binaries` bigint (15), + PRIMARY KEY (date,project_code,language_code,extension) +) ; + +CREATE TABLE `offline` ( + `date` date NOT NULL, + `source` varchar (255), + `readers` bigint (12), + PRIMARY KEY (date,source) +) ; + +-- SHOW TABLES ; +-- DESCRIBE comscore ; +-- DESCRIBE comscore_regions ; +-- DESCRIBE wikistats ; +-- DESCRIBE page_views ; +-- DESCRIBE language_names ; +-- DESCRIBE binaries ; +-- DESCRIBE offline ; + +-- Database Manipulation +-- Obviously in real world this is a separate script + +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' + INTO TABLE comscore + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (@date,country_code,region_code,web_property,project_code,reach,visitors) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' + INTO TABLE comscore_regions + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,region_code,region_name) ; + +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' + INTO TABLE wikistats + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + 
(project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' + INTO TABLE page_views + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + + +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' + INTO TABLE language_names + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,language_code,language_name) ; + +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' + INTO TABLE binaries + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,extension,binaries) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' + INTO TABLE offline + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (@date,readers) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + + +-- show contents (debugging only) +-- SELECT * FROM comscore ; +-- SELECT * FROM comscore_regions ; +-- SELECT * FROM wikistats ; +-- SELECT * FROM page_views ; +-- SELECT * FROM language_names ; +-- SELECT * FROM binaries +-- WHERE project_code = 'commons' ; + SELECT * FROM offline ; + + Copied: trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh (from rev 105441, trunk/wikistats/analytics/analytics_generate_csv_files.sh) =================================================================== --- 
trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh (rev 0) +++ trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,47 @@ +#!/bin/sh + +# Prepare several csv files, ready for importing into analytics database +# All generated files have _in_ in name signalling these contain data ready for importing into database +# One input record corresponds to one database record + +ulimit -v 8000000 + +clear +cd /a/analytics + + +# AnalyticsPrepBinariesData.pl read counts for binaries which were generated by wikistats +# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv +# It filters and reorganizes data and produces analytics_in_binaries.csv +# Output csv contains: project code, language, month, extension name, count + +perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/ + +# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore csv files (with data for last 14 months) +# parses those csv files, adds/replaces data from these csv files into master files (containing full history) +# and generates input csv file analytics_in_comscore.csv ready for importing into database +# +# note : these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx +# and given more descriptive names, script finds newest files based on partial name search +# +# -r replace (default is add only) +# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv) +# -m master files with full history +# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into database + +perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics + +# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files from wikistats process +# It filters and reorganizes data and 
produces analytics_in_wikistats.csv, ready for import into analytics database + +perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics + +# analytics_in_page_views.csv is written daily as part of WikiCountsSummarizeProjectCounts.pl +# part of (/home/ezachte/pageviews_monthly.sh job) +# which processes hourly projectcounts files (per wiki page view totals for one hour) from http://dammit.lt/wikistats +# and generates several files on different aggregation levels +# only action here is to copy data to this folder to have everything in one place +# note: unlike folder name suggests this file contains stats for all projects + +cp /a/wikistats/csv_wp/analytics_in_page_views.csv . + Copied: trunk/extensions/MetricsReporting/import/analytics_new.sh (from rev 105441, trunk/wikistats/analytics/analytics_new.sh) =================================================================== --- trunk/extensions/MetricsReporting/import/analytics_new.sh (rev 0) +++ trunk/extensions/MetricsReporting/import/analytics_new.sh 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,5 @@ +clear +cd /a/analytics +# rm *test*.csv +# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt + mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt Copied: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt (from rev 105441, trunk/wikistats/analytics/analytics_refresh_from_csv.txt) =================================================================== --- trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt (rev 0) +++ trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,55 @@ +USE `analytics` ; + +TRUNCATE TABLE comscore ; +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' + INTO TABLE comscore + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + 
(@date,country_code,region_code,web_property,project_code,reach,visitors) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE comscore_regions ; +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' + INTO TABLE comscore_regions + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,region_code,region_name) ; + +TRUNCATE TABLE wikistats ; +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' + INTO TABLE wikistats + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE page_views ; +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' + INTO TABLE page_views + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE language_names ; +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' + INTO TABLE language_names + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,language_code,language_name) ; + +TRUNCATE TABLE binaries ; +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' + INTO TABLE binaries + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,extension,binaries) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE offline ; +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' + INTO TABLE offline + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + 
(@date, readers) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; Copied: trunk/extensions/MetricsReporting/import/analytics_upd.sh (from rev 105441, trunk/wikistats/analytics/analytics_upd.sh) =================================================================== --- trunk/extensions/MetricsReporting/import/analytics_upd.sh (rev 0) +++ trunk/extensions/MetricsReporting/import/analytics_upd.sh 2011-12-07 17:36:45 UTC (rev 105444) @@ -0,0 +1,6 @@ +clear +cd /a/analytics + +./analytics_generate_csv_files.sh + +mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs