https://www.mediawiki.org/wiki/Special:Code/MediaWiki/105445
Revision: 105445 Author: diederik Date: 2011-12-07 17:39:50 +0000 (Wed, 07 Dec 2011) Log Message: ----------- Moved files to trunk/extensions/MetricsReporting/import Added Paths: ----------- trunk/wikistats/analytics/README Removed Paths: ------------- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl trunk/wikistats/analytics/AnalyticsPrepPageViews.pl trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl trunk/wikistats/analytics/_readme.txt trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt trunk/wikistats/analytics/analytics_generate_csv_files.sh trunk/wikistats/analytics/analytics_new.sh trunk/wikistats/analytics/analytics_refresh_from_csv.txt trunk/wikistats/analytics/analytics_upd.sh Deleted: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,124 +0,0 @@ -#!/usr/local/bin/perl - - use Getopt::Std ; - - &ParseArguments ; - - print "Write file '$file_csv_out'\n" ; - open CSV_OUT, '>', $file_csv_out ; - - foreach $project (qw (wb wk wn wp wq ws wv wx)) - { &ReadStatisticsPerBinariesExtension ($project) ; } - - close CSV_OUT ; - - print "\n\nReady\n\n" ; - exit ; - -sub ParseArguments -{ - my @options ; - getopt ("io", \%options) ; - - die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ; - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; - - $path_in = $options {"i"} ; - $path_out = $options {"o"} ; - - die "Input folder '$path_in' does not exist" if (! -d $path_in) ; - die "Output folder '$path_out' does not exist" if (! -d $path_out) ; - - # tests only - # $path_in = "C:/@ Wikimedia/# Out Bayes" ; - # $path_out = "C:/analytics" ; # "w:/@ report card/data" ; - - print "Input folder: $path_in\n" ; - print "Output folder: $path_out\n" ; - print "\n" ; - - $file_csv_out = "$path_out/analytics_in_binaries.csv" ; -} - - -sub ReadStatisticsPerBinariesExtension -{ - my $project = shift ; - my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ; - $yyyymm_hi = -1 ; - - if (! -e $file_csv_in) - { die "Input file '$file_csv_in' not found" ; } - - - print "Read '$file_csv_in'\n" ; - open CSV_IN, '<', $file_csv_in ; - - $language_prev = '' ; - while ($line = <CSV_IN>) - { - chomp $line ; - next if $line !~ /,.*?,/ ; - - ($language,$date,$data) = split (',', $line, 3) ; - - # for each wiki first line shows ext names, no tcounts - if ($date eq "00/0000") - { - if ($language_prev ne '') - { &WriteMonthlyData ($project, $language_prev) ; } - $language_prev = $language ; - - undef %ext_name ; - undef %ext_ndx ; - undef %ext_cnt ; - undef %months ; - - @exts = split (',', $data) ; - $ndx = 0 ; - foreach $ext (@exts) - { - $ext_name {$ndx} = $ext ; - $ext_ndx {$ext} = $ndx ; - $ndx ++ ; - } - next ; - } - - ($month,$year) = split ('\/', $date) ; - $yyyymm = sprintf ("%04d-%02d", $year, $month) ; - if ($yyyymm gt $yyyymm_hi) - { $yyyymm_hi = $yyyymm ; } - $months {$yyyymm}++ ; - - @counts = split (',', $data) ; - $ndx = 0 ; - foreach $count (@counts) - { - $ext_cnt {$yyyymm}{$ext_name {$ndx}} = $count ; - $ndx ++ ; - } - } - &WriteMonthlyData ($project, $language_prev) ; - - close CSV_IN ; -} - -sub WriteMonthlyData -{ - my ($project,$language) = @_ ; - # get sorted array of extensions, order by count for most recent month - %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ; - @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ; - - foreach $month (sort keys %months) - { - $ndx = 0 ; - foreach $ext (@ext_cnt_yyyymm_hi) - { - print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ; - # print "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ; - last if (++ $ndx > 25) ; - } - } -} Deleted: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,564 +0,0 @@ -#!/usr/bin/perl - -# Copyright (C) 2011 Wikimedia Foundation -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License version 2 -# as published by the Free Software Foundation. -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the GNU General Public License for more details, at -# http://www.fsf.org/licenses/gpl.html - -# Author: -# Erik Zachte, email ezac...@wikimedia.org - -# Functionality: -# comScore data can be downloaded as csv file, which each contain 14 months history -# This script uses these files to update 'master' csv files which contain all known history -# Note: only entities which are already in master file will be updated! -# Then it merges these master files into one csv file which can be loaded into analytics database -# Data are: reach by region, unique visitors by region, unique visitors by web property - -# Parameters: -# -m (required) folder with 'master' csv files (files with all known history) -# -u (required) folder with 'update' csv files (files with lastest 14 months history, produced by comScore) - -# Output: -# updated master csv files + merged and formatted csv for import in MySQL - -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ - - use Getopt::Std ; - use Cwd; - - my $options ; - getopt ("imo", \%options) ; - - $true = 1 ; - $false = 0 ; - - $script_name = "AnalyticsPrepComscoreData.pl" ; - $script_version = "0.31" ; - -# EZ test only -# $source = "comscore" ; -# $server = "ez_test" ; -# $generated = "2011-05-06 00:00:00" ; -# $user = "ezachte" ; - - $dir_in = $options {"i"} ; - $dir_upd = $options {"m"} ; - $dir_out = $options {"o"} ; - $mode = 'add'; - if (defined $options {'r'}) - { $mode = 'replace'; } - - print "Mode is $mode (specify '-r' for replace)\n\n"; - - if (! -d "/home/") # EZ test machine - { - $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ; - $dir_upd = "C:/MySQL/analytics" ; - $dir_out = "C:/MySQL/analytics" ; - $mode = 'replace' ; - } - - if ($dir_in eq '') - { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; } - if ($dir_upd eq '') - { Abort ("Specify folder for master files (full history) as '-m folder'") ; } - if ($dir_out eq '') - { Abort ("Specify folder for output file '-o folder'") ; } - - $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; - $file_comscore_reach_update = "*reach*by*region*csv" ; - $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; - $file_comscore_uv_region_update = "*UVs*by*region*csv" ; - $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ; - $file_comscore_uv_property_update = "*UV*trend*csv" ; - - $layout_csv_reach = 1 ; - $layout_csv_regions = 2 ; - $layout_csv_properties = 3 ; - - print "Directories:\n" . - "Input (new comScore data): '$dir_in'\n". - "Master files (full history): '$dir_upd'\n" . - "Output (database feed): '$dir_out'\n\n" ; - - %region_codes = ( - "Europe"=>"EU", - "North America"=>"NA", - "Latin America"=>"LA", - "World-Wide" => "W", - "Middle East - Africa" => "MA", - "Asia Pacific"=> "AS", - "United States" => "US", - "India" => "I", - "China" => "C" - ) ; - - foreach $region_name (keys %region_codes) - { $region_names {$region_codes {$region_name}} = $region_name ; } - - @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; - - &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; - %reach_region_code = %data ; - - &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; - %visitors_region_code = %data ; - - &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; - %visitors_web_property = %data ; - - &WriteDataAnalytics ; - - print "\nReady\n\n" ; - exit ; - -sub ReadMasterComscoreDataReachPerRegion -{ - my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; - - print "ReadMasterComscoreDataReachPerRegion\n\n" ; - - undef %months ; - undef %data ; - undef @regions ; - - open IN, '<', "$dir_upd/$file_comscore_master" ; - - $lines = 0 ; - while ($line = <IN>) - { - chomp $line ; - $line =~ s/\r//g ; - - ($yyyymm,@data) = split (',', $line) ; - - if ($lines++ == 0) - { - @regions = @data ; - print "Regions found: " . (join ',', @regions) . "\n"; - next ; - } - - $field_ndx = 0 ; - foreach (@data) - { - $region = $regions [$field_ndx] ; - $region_code = $region_codes {$region} ; - - $data = $data [$field_ndx] ; - if ($data eq '') - { $data = '0' ; } - $months {$yyyymm} ++ ; - $data {"$yyyymm,$region_code"} = $data ; - # print "Old data $yyyymm,$region_code = $data\n" ; - $field_ndx++ ; - } - } - close IN ; - - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; - return if ! $updates_found ; - - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; - open OUT, '>', "$dir_upd/$file_comscore_master" ; - - $line_out = "yyyymm" ; - foreach $region_name (@regions) - { $line_out .= ",$region_name" ; } - print OUT "$line_out" ; - - foreach $yyyymm (sort {$b cmp $a} keys %months) - { - $line_out = "\n$yyyymm" ; - foreach $region_name (@regions) - { - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; - } - print OUT "$line_out" ; - } - - close OUT ; -} - -sub ReadMasterComscoreDataVisitorsPerRegion -{ - my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; - - print "ReadMasterComscoreDataVisitorsPerRegion\n\n"; - - undef %months ; - undef %data ; - undef @regions ; - - open IN, '<', "$dir_upd/$file_comscore_master" ; - - $lines = 0 ; - $metric = 'unique_visitors' ; - while ($line = <IN>) - { - chomp $line ; - $line =~ s/\r//g ; - $line = &GetNumberOnly ($line) ; - - next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ; - - ($yyyymm,@data) = split (',', $line) ; - - if ($lines++ == 0) - { - @regions = @data ; - print "Regions found: " . (join ',', @regions) . "\n"; - next ; - } - - $field_ndx = 0 ; - foreach (@data) - { - $region = $regions [$field_ndx] ; - $region_code = $region_codes {$region} ; - - $data = $data [$field_ndx] ; - if ($data eq '') - { $data = '0' ; } - - # print "Old data $yyyymm,$region = $data\n" ; - - $months {$yyyymm} ++ ; - $data {"$yyyymm,$region_code"} = $data ; - - $field_ndx++ ; - } - } - close IN ; - - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; - return if ! $updates_found ; - - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; - open OUT, '>', "$dir_upd/$file_comscore_master" ; - - $line_out = "yyyymm" ; - foreach $region_name (@regions) - { $line_out .= ",$region_name" ; } - print OUT "$line_out" ; - - foreach $yyyymm (sort {$b cmp $a} keys %months) - { - $line_out = "\n$yyyymm" ; - foreach $region_name (@regions) - { - $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ; - $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ; - } - print OUT "$line_out" ; - } - - close OUT ; -} - -sub ReadMasterComscoreDataVisitorsPerProperty -{ - my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; - - print "ReadMasterComscoreDataVisitorsPerProperty\n\n"; - - undef %months ; - undef %data ; - undef @properties ; - - open IN, '<', "$dir_upd/$file_comscore_master" ; - - $lines = 0 ; - $metric = 'unique_visitors' ; - while ($line = <IN>) - { - chomp $line ; - $line =~ s/\r//g ; - - ($yyyymm,@data) = split (',', $line) ; - if ($lines++ == 0) - { @properties = @data ; next ; } - - $field_ndx = 0 ; - foreach (@data) - { - $property = $properties [$field_ndx] ; - $property =~ s/.*Yahoo.*/Yahoo/ ; - $data = $data [$field_ndx] ; - if ($data eq '') - { $data = '0' ; } - - # print "Old data $yyyymm,$property = $data\n" ; - - $months {$yyyymm} ++ ; - $data {"$yyyymm,$property"} = $data ; - - $field_ndx++ ; - } - } - close IN ; - - my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; - return if ! $updates_found ; - - rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; - open OUT, '>', "$dir_upd/$file_comscore_master" ; - - $line_out = "yyyymm" ; - foreach $property (@properties) - { $line_out .= ",$property" ; } - print OUT "$line_out" ; - - foreach $yyyymm (sort {$b cmp $a} keys %months) - { - $line_out = "\n$yyyymm" ; - foreach $property (@properties) - { - $yyyymm_property = "$yyyymm,$property" ; - $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ; - } - print OUT "$line_out" ; - } - - close OUT ; -} - -sub UpdateMasterFileFromRecentComscoreData -{ - my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ; - - print "UpdateMasterFileFromRecentComscoreData\n\n"; - - undef %white_list ; - undef %not_white_listed ; - - print "White list: ". (join (',', @white_list)) . "\n\n"; - - foreach $id (@white_list) - { $white_list {$id} = $true ; } - - if (! -e "$dir_upd/$file_comscore_master") - { Abort ("File $file_comscore_master not found!") ; } - - $age_all = -M "$dir_upd/$file_comscore_master" ; - print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; - - my $cwd = getcwd ; - chdir $dir_in ; - - @files = glob($file_comscore_updates) ; - $min_age_upd = 999999 ; - $file_comscore_updates_latest = '' ; - foreach $file (@files) - { - $age = -M $file ; - if ($age < $min_age_upd) - { - $min_age_upd = $age ; - $file_comscore_updates_latest = $file ; - } - } - print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; - - if ($min_age_upd == 999999) - { - print "No valid update file found. Nothing to update." ; - return ; - } - - #if ($age_all > $min_age_upd) - #{ - # print "File with master data more recent than latest update csv from comScore. Nothing to update." ; - # return ; - #} - - my $updates_found = $false ; - - open CSV, '<', $file_comscore_updates_latest ; - binmode CSV ; - while ($line = <CSV>) - { - chomp $line ; - $line =~ s/\r//g ; - $line = &GetNumberOnly ($line) ; - - if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' - { - if ($layout_csv == $layout_csv_properties) - { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file - else - { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files - - @months = &mmm_yyyy2yyyy_mm (@months) ; - } - - if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/)) - { - if ($layout_csv == $layout_csv_properties) - { - ($index,$dummy,$property,@data) = split (',', $line) ; - $property =~ s/^\s+// ; - $property =~ s/\s+$// ; - - $property =~ s/.*Total Internet.*/Total Internet/i ; - $property =~ s/.*Google.*/Google/i ; - $property =~ s/.*Microsoft.*/Microsoft/i ; - $property =~ s/.*FACEBOOK.*/Facebook/i ; - $property =~ s/.*Yahoo.*/Yahoo/i ; - $property =~ s/.*Amazon.*/Amazon/i ; - $property =~ s/.*Apple.*/Apple/i ; - $property =~ s/.*AOL.*/AOL/i ; - $property =~ s/.*Wikimedia.*/Wikimedia/i ; - $property =~ s/.*Tencent.*/Tencent/i ; - $property =~ s/.*Baidu.*/Baidu/i ; - $property =~ s/.*CBS.*/CBS/i ; - - if (! $white_list {$property}) - { - $not_white_listed {$property}++ ; - next ; - } - - $id = $property ; - } - else - { - ($index,$region,@data) = split (',', $line) ; - $region =~ s/^\s+// ; - $region =~ s/\s+$// ; - - if (! $white_list {$region}) - { - $not_white_listed {$region}++ ; - next ; - } - - $id = $region_codes {$region} ; - } - - for ($m = 0 ; $m <= $#months ; $m++) - { - $yyyymm = $months [$m] ; - $months {$yyyymm} ++ ; - $yyyymm_id = "$yyyymm,$id" ; - $data = $data [$m] * $multiplier ; - - if ($mode eq 'add') - { - if (! defined $data {$yyyymm_id}) - { - $updates_found = $true ; - print "New data found: $yyyymm_id = $data\n" ; - $data {$yyyymm_id} = $data ; - } - } - else - { - $updates_found = $true ; - print "Data found: $yyyymm_id = $data\n" ; - $data {$yyyymm_id} = $data ; - } - } - } - } - - $entities_not_white_listed = join (', ', sort keys %not_white_listed) ; - if ($entities_not_white_listed ne '') - { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; } - - if (! $updates_found) - { print "No new updates found\n" ; } - else - { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } - - return ($updates_found) ; -} - -sub WriteDataAnalytics -{ - print "WriteDataAnalytics\n\n"; - - open OUT, '>', "$dir_out/analytics_in_comscore.csv" ; - - $metric = 'unique_visitors' ; - foreach $yyyymm (sort keys %months) - { - # store meta data elsewhere - # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ; - foreach $region_code (sort values %region_codes) - { - $country_code = '-' ; - $property = '-' ; - $project = '-' ; - $reach = $reach_region_code {"$yyyymm,$region_code"} ; - $visitors = $visitors_region_code {"$yyyymm,$region_code"} ; - - if (! defined $reach) { $reach = -1 ; } - if (! defined $visitors) { $visitors = -1 ; } - - next if $reach == -1 and $visitors == -1 ; - - $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; - print OUT $line ; - print $line ; - } - - foreach $property (sort @properties) - { - $country_code = '-' ; - $region_code = '-' ; - $project = '-' ; - $reach = '-1' ; - $visitors = $visitors_web_property {"$yyyymm,$property"} ; - - next if ! defined $visitors ; - - $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; - print OUT $line ; - # print $line ; - } - } -} - -sub GetNumberOnly -{ - my $line = shift ; - $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove comma's inside double quotes - $line =~ s/"//g ; - return $line ; -} - -sub mmm_yyyy2yyyy_mm -{ - my @months = @_ ; - my ($m) ; - # Jan -> 01, etc - foreach $month (@months) - { - my ($mmm,$yyyy) = split ('-', $month) ; - for ($m = 0 ; $m <= $#months_short ; $m++) - { - if ($mmm eq $months_short [$m]) - { - $month = "$yyyy-" . sprintf ("%02d", $m+1) ; - last ; - } - } - } - return @months ; -} - -sub Abort -{ - $msg = shift ; - - print "\nAbort, reason: $msg\n\n" ; - exit ; -} Deleted: trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,162 +0,0 @@ -#!/usr/bin/perl - -# This module prepares a csv file with language names for feed into the analytics database -# The main work of collecting these names from different sources is done by -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm -# which is part of wikistats reporting phase and once each month updates local master csv files from all sources - -# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization -# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle - -# Sources for language names: -# - php files -# - translatewiki -# - English Wikipedia API (interwikilinks) - -# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations - - use Getopt::Std ; - - $true = 1 ; - $false = 0 ; - - $max_language_name = 50 ; # truncate if longer - - $file_csv_language_names_php = "LanguageNamesViaPhp.csv" ; - $file_csv_language_names_wp = "LanguageNamesViaWpEnEdited.csv" ; - $file_csv_analytics_in = "analytics_in_language_names.csv" ; - - $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting - $languages_filter = "de|en" ; - foreach $language (split '\|', $languages_filter) - { $include_target_language {$language} = $true ; } - - &ParseArguments ; - &ReadCsvFiles ; - &WriteCsvFile ; - - -# if ($language eq "id") # swap which file takes precedence - - print "\nReady\n\n" ; - exit ; - -sub ParseArguments -{ - my (@options, $arguments) ; - - getopt ("io", \%options) ; - - foreach $arg (sort keys %options) - { $arguments .= " -$arg " . $options {$arg} . "\n" ; } - print ("\nArguments\n$arguments\n") ; - - $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test - $options {"o"} = "c:/MySQL/analytics" ; # EZ test - - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; - - ($path_in = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any - ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any - - die "Input folder '$path_in' does not exist" if (! -d $path_in) ; - die "Output folder '$path_out' does not exist" if (! -d $path_out) ; - - print "Input folder: $path_in\n" ; - print "Output folder: $path_out\n\n" ; - - $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ; - $file_csv_language_names_wp = "$path_in/$file_csv_language_names_wp" ; - $file_csv_analytics_in = "$path_out/$file_csv_analytics_in" ; - - die "Input file '$file_csv_language_names_php' not found" if (! -e $file_csv_language_names_php) ; - die "Input file '$file_csv_language_names_wp' not found" if (! -e $file_csv_language_names_wp) ; -} - -sub ReadCsvFiles -{ - #first read definitions from php message files, then overwrite with definitions from interwiki links when available - # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable - - open CSV_IN, "<", $file_csv_language_names_php ; - while ($line = <CSV_IN>) - { - chomp ($line) ; - ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; - - next if ! $include_target_language {$target_language} ; - - $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database - } - close CSV_IN ; - - open CSV_IN, "<", $file_csv_language_names_wp ; - while ($line = <CSV_IN>) - { - chomp ($line) ; - ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ; - - next if ! $include_target_language {$target_language} ; - - next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ; - - # $name_unicode_php = $out_languages {$target_language} {$code} ; # test only - # $name_unicode_wp = &FormatName ($target_language, $name_unicode) ; # test only - # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only - # { print "$name_unicode_php => $name_unicode_wp\n" ; } # test only - - $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database - } - close CSV_IN ; -} - -sub FormatName -{ - my ($target_language, $name_unicode) = @_ ; - - $name_unicode2 = $name_unicode ; - - if ($target_language eq "de") - { $name_unicode =~ s/e?\s*\-?sprache//i ; } - - if ($target_language =~ /^(?:$languages_force_case_uc)/) - { $name_unicode = ucfirst $name_unicode ; } - else - { $name_unicode = lc $name_unicode ; } - -# Test only -# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2)) -# { print "$name_unicode2 => $name_unicode\n" ; } - - return ($name_unicode) ; -} - -sub WriteCsvFile -{ - open CSV_OUT, ">", $file_csv_analytics_in || die ("File '$file_csv_analytics_in' could not be opened") ; - binmode CSV_OUT ; # force Unix style linebreak \012 - - foreach $target_language (sort keys %out_languages) - { - print "\nTarget language '$target_language'\n" ; - %translations = %{$out_languages {$target_language}} ; - - foreach $code (sort keys %translations) - { - $language_name = $translations{$code} ; - $language_name2 = substr ($language_name,0,$max_language_name) ; - - if ($language_name ne $language_name2) - { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; } - - if ($language_name2 =~ /,/) - { $language_name2 = "\"$language_name2\"" ; } - # test only - print CSV_OUT "$target_language,$code,$language_name2\n" ; - } - } - - close CSV_OUT ; -} - Deleted: trunk/wikistats/analytics/AnalyticsPrepPageViews.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepPageViews.pl 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/AnalyticsPrepPageViews.pl 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,11 +0,0 @@ -#!/usr/bin/perl - -# This file is just a place holder (alias) -# Page views per day/week/month/etc, per project, per language, nrmalized and not normalized are collected by -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl -# -# Added May 2001: -# For analytics database one file is written for all projects and languages combined, -# with per month, not normalized and normalized page view counts in one row. -# This file is written to same output folder as other csv files generated by WikiCountsSummarizeProjectCounts.pl -# and needs to be copied to analytics folder later \ No newline at end of file Deleted: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,334 +0,0 @@ -#!/usr/local/bin/perl - -# Copyright (C) 2011 Wikimedia Foundation -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License version 2 -# as published by the Free Software Foundation. -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# See the GNU General Public License for more details, at -# http://www.fsf.org/licenses/gpl.html - -# Author: -# Erik Zachte, email ezac...@wikimedia.org -# loosely based on predecessor -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl - -# Functionality: -# tba - -# Parameters: -# tba - -# Output: -# updated csv file for import in MySQL - -# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/ - - use Getopt::Std ; - - $true = 1 ; - $false = 0 ; - - @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ; - - $file_csv_monthly_data = "StatisticsMonthly.csv" ; - $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ; - $file_csv_analytics_in = "analytics_in_wikistats.csv" ; - - &ParseArguments ; - &ReadStatisticsMonthly ; - &FindLargestWikis ; - &WriteMonthlyData ; - - print "\nReady\n\n" ; - exit ; - -sub ParseArguments -{ - print "ParseArguments\n" ; - my (@options, $arguments) ; - - getopt ("io", \%options) ; - - foreach $arg (sort keys %options) - { $arguments .= " -$arg " . $options {$arg} . "\n" ; } - print ("\nArguments\n$arguments\n") ; - - if (! -d '/mnt/') # EZ test - { - $path_in = "c:/\@ wikimedia/# out bayes" ; - $path_out = "c:/MySQL/analytics" ; - } - else - { - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; - - $path_in = $options {"i"} ; - $path_out = $options {"o"} ; - } - - die "Input folder '$path_in' does not exist" if (! -d $path_in) ; - die "Output folder '$path_out' does not exist" if (! -d $path_out) ; - - print "Input folder: $path_in\n" ; - print "Output folder: $path_out\n\n" ; - - $file_csv_out = "$path_out/analytics_in_wikistats.csv" ; -} - -sub ReadStatisticsMonthly -{ - print "ReadStatisticsMonthly\n" ; - &ReadStatisticsMonthlyForProject ("wb") ; - &ReadStatisticsMonthlyForProject ("wk") ; - &ReadStatisticsMonthlyForProject ("wn") ; - &ReadStatisticsMonthlyForProject ("wp") ; - &ReadStatisticsMonthlyForProject ("wq") ; - &ReadStatisticsMonthlyForProject ("ws") ; - &ReadStatisticsMonthlyForProject ("wv") ; - &ReadStatisticsMonthlyForProject ("wx") ; - -# &ReadStatisticsPerBinariesExtensionCommons ; -} - -sub ReadStatisticsMonthlyForProject -{ - my $project = shift; - $all_projects = "*" ; - - my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ; - my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ; - - if (! -e $file_csv_in_1) - { &Abort ("Input file '$file_csv_in_1' not found") ; } - if (! -e $file_csv_in_2) - { &Abort ("Input file '$file_csv_in_2' not found") ; } - - my $yyyymm ; - - print "Read '$file_csv_in_1'\n" ; - open CSV_IN, '<', $file_csv_in_1 ; - while ($line = <CSV_IN>) - { - chomp $line ; - ($language,$date,$counts) = split (',', $line, 3) ; - @fields = split (',', $counts) ; - - next if ! &AcceptWiki ($project,$language) ; - - ($month,$day,$year) = split ('\/', $date) ; - $yyyymm = sprintf ("%04d-%02d", $year, $month) ; - - foreach $field (@fields) - { - if ($field eq '-') - { $field = 0 ; } - } - - $data = $fields [0] . ',' . # contributors all time - $fields [1] . ',' . # new contributors - 'data2,' . # place holder for more data, to be inserted later - $fields [4] . ',' . # articles - $fields [6] . ',' . # articles new per day - $fields [9] . ',' . # larger than 0.5 kB - $fields [10] . ',' . # larger than 2.0 kB - $fields [7] . ',' . # mean edits per article - $fields [8] . ',' . # mean bytes per article - $fields [11] . ',' . # edits - $fields [12] . ',' . # size in bytes - $fields [13] . ',' . # size in words - $fields [14] . ',' . # links internal - $fields [15] . ',' . # links interwiki - $fields [16] . ',' . # links images - $fields [17] . ',' . # links external - $fields [18] ; # redirects - - $data1 {"$project,$language,$yyyymm"} = $data ; - } - close CSV_IN ; - - # now read (very) active editors from newer more accurate file (split data for reg users and bots, unlike StatisticsMonthly.csv) - - print "Read '$file_csv_in_2'\n" ; - open CSV_IN, '<', $file_csv_in_2 ; - while ($line = <CSV_IN>) - { - chomp $line ; - ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ; - - next if ! &AcceptWiki ($project,$language) ; - - if ($reguser_bot ne "R") { next ; } # R: reg user, B: bot - if ($group ne "A") { next ; } # A: articles, T: talk pages, O: other namespaces - - ($month,$day,$year) = split ('\/', $date) ; - $yyyymm = sprintf ("%04d-%02d", $year, $month) ; - $months {$yyyymm} ++ ; -# print "YYYYMM $yyyymm\n" ; - - # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm - # count user with over x edits - # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 1000xSQRT(10), etc - # @thresholds = (1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000) ; - $edits_ge_5 = @counts [2] > 0 ? @counts [2] : 0 ; - $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ; - $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ; - $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ; - - $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ; - $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ; - $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ; - - # prep string with right amount of comma's - if ($data2_default eq '') - { - $data2_default = $data2 {"$project,$language,$yyyymm"} ; - $data2_default =~ s/[^,]+/0/g ; - } - } - close CSV_IN ; -} - -#sub ReadStatisticsPerBinariesExtensionCommons -#{ -# my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ; -# my $mmax = -1 ; - -# if (! -e $file_csv_in) -# { &Abort ("Input file '$file_csv_in' not found") ; } - -# print "Read '$file_csv_in'\n" ; -# open CSV_IN, '<', $file_csv_in ; -# while ($line = <CSV_IN>) -# { -# chomp $line ; -# ($language,$date,$counts) = split (',', $line, 3) ; - -# if ($language ne "commons") { next ; } - -# if ($date eq "00/0000") -# { -# @fields = split (',', $counts) ; -# $field_ndx = 0 ; -# foreach $field (@fields) -# { -# $ext_cnt {-1}{$field_ndx} = $field ; -# # print "EXT_CNT $field_ndx : $field\n" ; -# $field_ndx ++ ; -# } -# next ; -# } - -# ($month,$year) = split ('\/', $date) ; -# my $m = &months_since_2000_01 ($year,$month) ; -# next if $m < $m_start ; - -# if ($m > $mmax) -# { $mmax = $m ; } - -# @fields = split (',', $counts) ; -# $field_ndx = 0 ; -# foreach $field (@fields) -# { -# $ext_cnt {$m}{$field_ndx} = $field ; -# $ext_tot {$m} += $field ; -# $field_ndx ++ ; -# } -# } -# close CSV_IN ; - -# %ext_cnt_mmax = %{$ext_cnt {$mmax}} ; -# @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys %ext_cnt_mmax) ; - -# $extcnt = 0 ; -# foreach $extndx (@ext_cnt_mmax) -# { -# # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax {$extndx}}\n" ; -# push @extndxs, $extndx ; -# if ($extcnt++ >= 9) { last ; } -# } -#} - -sub FindLargestWikis -{ - print "FindLargestWikis\n" ; - print "Largest projects (most accumulated very active editors):\n"; - @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ; - $rank = 0 ; - foreach $project_language (@total_edits_ge_100) - { - $largest_projects {$project_language} = $rank++ ; - print "$project_language," ; - last if $rank > 10 ; - } - print "\n\n" ; - - foreach $yyyymm (sort keys %months) - { - next if $yyyymm lt '2011' ; - foreach $project_language (keys %largest_projects) - { - ($project,$language) = split (',', $project_language) ; - if ($data2 {"$project,$language,$yyyymm"} eq '') - { - print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ; - $months {$yyyymm} = 0 ; - } - } - } -} - -sub WriteMonthlyData -{ - print "WriteMonthlyData\n" ; - my $file_csv_out = "$path_out/$file_csv_analytics_in" ; - open CSV_OUT, '>', $file_csv_out ; - foreach $project_wiki_month (sort keys %data1) - { - ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ; - - # recent month misses on eor more large wikis? - next if $months {$yyyymm} == 0 ; - - $data1 = $data1 {$project_wiki_month} ; - $data2 = $data2 {$project_wiki_month} ; - if ($data2 eq '') - { - print "Editor data missing for $project_wiki_month\n" ; - $data2 = $data2_default ; - } - $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together - print CSV_OUT "$project_wiki_month,$data1\n" ; - } - $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ; - $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ; - $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ; - close CSV_OUT ; -} - -sub AcceptWiki -{ - my ($project,$language) = @_ ; - - return $false if $language eq 'commons' and $project ne 'wx' ; # commons also in wikipedia csv files (bug, hard to cleanup, just skip) - return $false if $language eq 'sr' and $project eq 'wn' ; # ignore insane bot spam on - return $false if $language =~ /mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i ; - - return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) but site not, tlh=Klignon, ru-sib=Siberian - return $false if $project eq 'wk' and ($language eq "als" or $language eq "tlh") ; - - return $true ; -} - -sub Abort -{ - my $msg = shift ; - print "$msg\nExecution aborted." ; - # to do: log also to file - exit ; -} - - Added: trunk/wikistats/analytics/README =================================================================== --- trunk/wikistats/analytics/README (rev 0) +++ trunk/wikistats/analytics/README 2011-12-07 17:39:50 UTC (rev 105445) @@ -0,0 +1 @@ +The files in this folder have been moved to trunk/extensions/MetricReporting/import. Property changes on: trunk/wikistats/analytics/README ___________________________________________________________________ Added: svn:eol-style + native Deleted: trunk/wikistats/analytics/_readme.txt =================================================================== --- trunk/wikistats/analytics/_readme.txt 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/_readme.txt 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,21 +0,0 @@ -This folder contain bash and perl files to create and fill database 'analytics', a.o. for monthly dashboard. - -== analytics_new.sh == -Defines the database and tables and loads data from existing csv files. -It executes SQL from analytics_create_and_load_from_csv.txt - -== analytics_upd.sh == -Prepares new csv files (delegated to analytics_generate_csv_files.sh), -and empties/reloads all tables for which csv files are in this folder. -It executes SQL from analytics_refresh_from_csv.txt - -== CSV files == -CSV files and where they are generated: - -analytics_in_binaries.csv <- AnalyticsPrepBinariesData.pl -analytics_in_comscore.csv <- AnalyticsPrepComscoreData.pl -analytics_in_comscore_regions.csv <- manual -analytics_in_language_names.csv -analytics_in_offline.csv <- manual -analytics_in_page_views.csv <- /home/ezachte/wikistats/pageviews_monthly.sh (file copy after that) -analytics_in_wikistats.csv <- AnalyticsPrepWikiCountsOutput.pl Deleted: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt =================================================================== --- trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,181 +0,0 @@ --- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) - --- tables implemented: --- comscore --- comscore_regions --- wikistats --- page_views --- language names --- binaries - --- more tables planned (O= optional, not needed for report card stage) --- project_names --- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) --- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) - - --- open issues: --- only store basic data in database and calculate all aggregates on the fly or do some aggregation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) --- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) - --- Create database and two tables from scratch -DROP DATABASE IF EXISTS `analytics` ; - -CREATE DATABASE `analytics` ; - -USE `analytics` ; - -CREATE TABLE `comscore` ( - `date` date NOT NULL, - `country_code` varchar (3), - `region_code` varchar (3), - `web_property` varchar (20), - `project_code` varchar (10), - `reach` decimal (4,1) DEFAULT NULL, - `visitors` decimal (15) DEFAULT NULL, - PRIMARY KEY (date,country_code,region_code,project_code,web_property), - KEY (`country_code`) -) ; - -CREATE TABLE `comscore_regions` ( - `region_code` varchar (2), - `report_language` varchar (10), - `region_name` varchar (18), - PRIMARY KEY (region_code,report_language) -) ; - -CREATE TABLE `wikistats` ( - `date` date NOT NULL, - `project_code` varchar (10), - `language_code` varchar (15), - `editors_all_time` int (10) DEFAULT NULL, - `editors_new` int (7) DEFAULT NULL, - `editors_ge_5` int (7) DEFAULT NULL, - `editors_ge_25` int (7) DEFAULT NULL, - `editors_ge_100` int (7) DEFAULT NULL, - `articles` int (12) DEFAULT NULL, - `articles_new_per_day` int (9) DEFAULT NULL, - `articles_over_bytes_500` int (12) DEFAULT NULL, - `articles_over_bytes_2000` int (12) DEFAULT NULL, - `edits_per_article` decimal (9,1) DEFAULT NULL, - `bytes_per_article` decimal (9,1) DEFAULT NULL, - `edits` int (12) DEFAULT NULL, - `size_in_bytes` int (15) DEFAULT NULL, - `size_in_words` int (15) DEFAULT NULL, - `links_internal` int (15) DEFAULT NULL, - `links_interwiki` int (15) DEFAULT NULL, - `links_image` int (15) DEFAULT NULL, - `links_external` int (15) DEFAULT NULL, - `redirects` int (15) DEFAULT NULL, - PRIMARY KEY (date,project_code,language_code) -) ; - -CREATE TABLE `page_views` ( - `date` date NOT NULL, - `project_code` char (2), - `language_code` char (15), - `views_non_mobile_raw` bigint (15), - `views_mobile_raw` bigint (15), - `views_non_mobile_normalized` bigint (15), - `views_mobile_normalized` bigint (15), - `views_raw` bigint (15), - `views_normalized` bigint (15), - PRIMARY KEY (date,project_code,language_code), - KEY (date,language_code,project_code) -) ; - -CREATE TABLE `language_names` ( - `report_language` varchar (15), - `language_code` varchar (15), - `language_name` varchar (50), - PRIMARY KEY (report_language,language_code) -) ; - -CREATE TABLE `binaries` ( - `date` date NOT NULL, - `project_code` char (2), - `language_code` char (15), - `extension` varchar (10), - `binaries` bigint (15), - PRIMARY KEY (date,project_code,language_code,extension) -) ; - -CREATE TABLE `offline` ( - `date` date NOT NULL, - `source` varchar (255), - `readers` bigint (12), - PRIMARY KEY (date,source) -) ; - --- SHOW TABLES ; --- DESCRIBE comscore ; --- DESCRIBE comscore_regions ; --- DESCRIBE wikistats ; --- DESCRIBE page_views ; --- DESCRIBE language_names ; --- DESCRIBE binaries ; --- DESCRIBE offline ; - --- Database Manipulation --- Obviously in real world this is a separate script - -LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' - INTO TABLE comscore - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (@date,country_code,region_code,web_property,project_code,reach,visitors) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' - INTO TABLE comscore_regions - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (report_language,region_code,region_name) ; - -LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' - INTO TABLE wikistats - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' - INTO TABLE page_views - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - - -LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' - INTO TABLE language_names - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (report_language,language_code,language_name) ; - -LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' - INTO TABLE binaries - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,extension,binaries) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' - INTO TABLE offline - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (@date,readers) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - - --- show contents (debugging only) --- SELECT * FROM comscore ; --- SELECT * FROM comscore_regions ; --- SELECT * FROM wikistats ; --- SELECT * FROM page_views ; --- SELECT * FROM language_names ; --- SELECT * FROM binaries --- WHERE project_code = 'commons' ; - SELECT * FROM offline ; - - Deleted: trunk/wikistats/analytics/analytics_generate_csv_files.sh =================================================================== --- trunk/wikistats/analytics/analytics_generate_csv_files.sh 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/analytics_generate_csv_files.sh 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,47 +0,0 @@ -#!/bin/sh - -# Prepare several csv files, ready for importing into analytics database -# All generated files have _in_ in name signalling these contain data ready for importing into database -# One input record corresponds to one database record - -ulimit -v 8000000 - -clear -cd /a/analytics - - -# AnalyticsPrepBinariesData.pl read counts for binaries which were generated by wikistats -# and which reside in /a/wikistats/csv_[project code]/StatisticsPerBinariesExtension.csv -# It filters and reorganizes data and produces analytics_in_binaries.csv -# Output csv contains: project code, language, month, extension name, count - -perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/ - -# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore csv files (with data for last 14 months) -# parses those csv files, adds/replaces data from these csv files into master files (containing full history) -# and generates input csv file analytics_in_comscore.csv ready for importing into database -# -# note : these csv files were manually downloaded from http://mymetrix.comscore.com/app/report.aspx -# and given more descriptive names, script finds newest files based on partial name search -# -# -r replace (default is add only) -# -i input folder, contains manually downloaded csv files from comScore (or xls files manually converted to csv) -# -m master files with full history -# -o output csv file, with reach per region, UV's per region and UV's per top web property, ready for import into database - -perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics - -# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv files from wikistats process -# It filters and reorganizes data and produces analytics_in_wikistats.csv, ready for import into analytics database - -perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics - -# analytics_in_page_views.csv is written daily as part of WikiCountsSummarizeProjectCounts.pl -# part of (/home/ezachte/pageviews_monthly.sh job) -# which processes hourly projectcounts files (per wiki page view totals for one hour) from http://dammit.lt/wikistats -# and generates several files on different aggregation levels -# only action here is to copy data to this folder to have everything in one place -# note: unlike folder name suggests this file contains stats for all projects - -cp /a/wikistats/csv_wp/analytics_in_page_views.csv . - Deleted: trunk/wikistats/analytics/analytics_new.sh =================================================================== --- trunk/wikistats/analytics/analytics_new.sh 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/analytics_new.sh 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,5 +0,0 @@ -clear -cd /a/analytics -# rm *test*.csv -# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt - mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt Deleted: trunk/wikistats/analytics/analytics_refresh_from_csv.txt =================================================================== --- trunk/wikistats/analytics/analytics_refresh_from_csv.txt 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/analytics_refresh_from_csv.txt 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,55 +0,0 @@ -USE `analytics` ; - -TRUNCATE TABLE comscore ; -LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' - INTO TABLE comscore - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (@date,country_code,region_code,web_property,project_code,reach,visitors) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -TRUNCATE TABLE comscore_regions ; -LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' - INTO TABLE comscore_regions - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (report_language,region_code,region_name) ; - -TRUNCATE TABLE wikistats ; -LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' - INTO TABLE wikistats - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -TRUNCATE TABLE page_views ; -LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' - INTO TABLE page_views - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -TRUNCATE TABLE language_names ; -LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' - INTO TABLE language_names - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (report_language,language_code,language_name) ; - -TRUNCATE TABLE binaries ; -LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' - INTO TABLE binaries - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (project_code,language_code,@date,extension,binaries) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; - -TRUNCATE TABLE offline ; -LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' - INTO TABLE offline - FIELDS TERMINATED BY ',' - OPTIONALLY ENCLOSED BY '"' - (@date, readers) - SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; Deleted: trunk/wikistats/analytics/analytics_upd.sh =================================================================== --- trunk/wikistats/analytics/analytics_upd.sh 2011-12-07 17:36:45 UTC (rev 105444) +++ trunk/wikistats/analytics/analytics_upd.sh 2011-12-07 17:39:50 UTC (rev 105445) @@ -1,6 +0,0 @@ -clear -cd /a/analytics - -./analytics_generate_csv_files.sh - -mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs