http://www.mediawiki.org/wiki/Special:Code/MediaWiki/91509
Revision: 91509 Author: ezachte Date: 2011-07-06 00:37:49 +0000 (Wed, 06 Jul 2011) Log Message: ----------- new MySQL tables and fields, updated perl files, more bash scripts Modified Paths: -------------- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl Added Paths: ----------- trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt trunk/wikistats/analytics/analytics_generate_csv_files.sh trunk/wikistats/analytics/analytics_new.sh trunk/wikistats/analytics/analytics_refresh_from_csv.txt trunk/wikistats/analytics/analytics_upd.sh Modified: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl 2011-07-06 00:24:59 UTC (rev 91508) +++ trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl 2011-07-06 00:37:49 UTC (rev 91509) @@ -17,21 +17,21 @@ sub ParseArguments { -# my @options ; -# getopt ("io", \%options) ; + my @options ; + getopt ("io", \%options) ; -# die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ; -# die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; + die ("Specify input folder as: -i path") if (! defined ($options {"i"})) ; + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; -# $path_in = $options {"i"} ; -# $path_out = $options {"o"} ; + $path_in = $options {"i"} ; + $path_out = $options {"o"} ; -# die "Input folder '$path_in' does not exist" if (! -d $path_in) ; -# die "Output folder '$path_out' does not exist" if (! -d $path_out) ; + die "Input folder '$path_in' does not exist" if (! -d $path_in) ; + die "Output folder '$path_out' does not exist" if (! -d $path_out) ; # tests only - $path_in = "C:/@ Wikimedia/# Out Bayes" ; - $path_out = "C:/analytics" ; # "w:/@ report card/data" ; + # $path_in = "C:/@ Wikimedia/# Out Bayes" ; + # $path_out = "C:/analytics" ; # "w:/@ report card/data" ; print "Input folder: $path_in\n" ; print "Output folder: $path_out\n" ; Modified: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-07-06 00:24:59 UTC (rev 91508) +++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl 2011-07-06 00:37:49 UTC (rev 91509) @@ -33,13 +33,13 @@ use Cwd; my $options ; - getopt ("mu", \%options) ; + getopt ("imo", \%options) ; $true = 1 ; $false = 0 ; $script_name = "AnalyticsPrepComscoreData.pl" ; - $script_version = "0.32" ; + $script_version = "0.31" ; # EZ test only # $source = "comscore" ; @@ -47,21 +47,34 @@ # $generated = "2011-05-06 00:00:00" ; # $user = "ezachte" ; - $dir_analytics = $options {"m"} ; - $dir_comscore_updates = $options {"u"} ; + $dir_in = $options {"i"} ; + $dir_upd = $options {"m"} ; + $dir_out = $options {"o"} ; + $mode = 'add'; + if (defined $options {'r'}) + { $mode = 'replace'; } - $dir_analytics = "C:/@ Wikimedia/! MySQL/analytics" ; # EZ test only - $dir_comscore_updates = "C:/@ Wikimedia/@ Report Card/Data" ; # EZ test only + print "Mode is $mode (specify '-r' for replace)\n\n"; - if (($dir_analytics eq '') || ($dir_comscore_updates eq '')) - { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 'update' csv files as -u folder'") ; } + if (! -d "/home/") # EZ test machine + { + $dir_in = "C:/@ Wikimedia/@ Report Card/Data" ; + $dir_upd = "C:/MySQL/analytics" ; + $dir_out = "C:/MySQL/analytics" ; + $mode = 'replace' ; + } - $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; - $file_comscore_reach_update = "*reach*by*region*csv" ; + if ($dir_in eq '') + { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; } + if ($dir_upd eq '') + { Abort ("Specify folder for master files (full history) as '-m folder'") ; } + if ($dir_out eq '') + { Abort ("Specify folder for output file '-o folder'") ; } - $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; - $file_comscore_uv_region_update = "*UVs*by*region*csv" ; - + $file_comscore_reach_master = "history_comscore_reach_regions.csv" ; + $file_comscore_reach_update = "*reach*by*region*csv" ; + $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ; + $file_comscore_uv_region_update = "*UVs*by*region*csv" ; $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ; $file_comscore_uv_property_update = "*UV*trend*csv" ; @@ -69,7 +82,10 @@ $layout_csv_regions = 2 ; $layout_csv_properties = 3 ; - print "Directories:\nAnalytics '$dir_analytics'\nUpdates '$dir_comscore_updates'\n\n" ; + print "Directories:\n" . + "Input (new comScore data): '$dir_in'\n". + "Master files (full history): '$dir_upd'\n" . + "Output (database feed): '$dir_out'\n\n" ; %region_codes = ( "Europe"=>"EU", @@ -88,13 +104,13 @@ @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ; - &ReadDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; + &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ; %reach_region_code = %data ; - &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; + &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ; %visitors_region_code = %data ; - &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; + &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ; %visitors_web_property = %data ; &WriteDataAnalytics ; @@ -102,153 +118,32 @@ print "\nReady\n\n" ; exit ; -sub UpdateFromLatestComscoreData +sub ReadMasterComscoreDataReachPerRegion { - my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ; + my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; - undef %update_only ; - undef %do_not_update ; + print "ReadMasterComscoreDataReachPerRegion\n\n" ; - foreach $id (@update_only) - { $update_only {$id} = $true ; } - - if (! -e "$dir_analytics/$file_comscore_master") - { Abort ("File $file_comscore_master not found!") ; } - - $age_master = -M "$dir_analytics/$file_comscore_master" ; - print "\nLatest comscore master file is " . sprintf ("%.0f", $age_master) . " days old: '$file_comscore_master'\n" ; - - my $cwd = getcwd ; - chdir $dir_comscore_updates ; - - @files = glob($file_comscore_updates) ; - $age_update = 999999 ; - $file_comscore_updates_latest = '' ; - foreach $file (@files) - { - $age = -M $file ; - if ($age < $age_update) - { - $age_update = $age ; - $file_comscore_updates_latest = $file ; - } - } - print "\nLatest comscore update file is " . sprintf ("%.0f", $age_update) . " days old: '$file_comscore_updates_latest'\n" ; - - if ($age_update == 999999) - { - print "No valid update file found. Nothing to update." ; - return ; - } - - if ($age_master < $age_update) - { - print "File with master data more recent than latest update csv from comScore. Nothing to update." ; - return ; - } - - my $updates_found = $false ; - - print "\nRead updates\n\n" ; - open CSV, '<', $file_comscore_updates_latest ; - while ($line = <CSV>) - { - chomp $line ; - $line = &GetNumberOnly ($line) ; - - if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' - { - if ($layout_csv == $layout_csv_properties) - { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file - else - { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files - - @months = &mmm_yyyy2yyyy_mm (@months) ; - } - - if ($line =~ /^\d+,/) - { - if ($layout_csv == $layout_csv_properties) - { - ($index,$dummy,$property,@data) = split (',', $line) ; - $property =~ s/^\s+// ; - $property =~ s/\s+$// ; - - $property =~ s/.*Google.*/Google/i ; - $property =~ s/.*Microsoft.*/Microsoft/i ; - $property =~ s/.*FACEBOOK.*/Facebook/i ; - $property =~ s/.*Yahoo.*/Yahoo/i ; - $property =~ s/.*Amazon.*/Amazon/i ; - $property =~ s/.*Apple.*/Apple/i ; - $property =~ s/.*AOL.*/AOL/i ; - $property =~ s/.*Wikimedia.*/Wikimedia/i ; - $property =~ s/.*Tencent.*/Tencent/i ; - $property =~ s/.*Baidu.*/Baidu/i ; - $property =~ s/.*CBS.*/CBS/i ; - - $id = $property ; - } - else - { - ($index,$region,@data) = split (',', $line) ; - $region =~ s/^\s+// ; - $region =~ s/\s+$// ; - $id = $region_codes {$region} ; - } - - if ($update_only {$id} == 0) - { - $do_not_update {$id}++ ; - next ; - } - - for ($m = 0 ; $m <= $#months ; $m++) - { - $yyyymm = $months [$m] ; - $months {$yyyymm} ++ ; - $yyyymm_id = "$yyyymm,$id" ; - $data = $data [$m] * $multiplier ; - - if (! defined $data {$yyyymm_id}) - { - $updates_found = $true ; - print "New data found: $yyyymm_id = $data\n" ; - $data {$yyyymm_id} = $data ; - } - } - } - } - - $ignored = join ', ', sort keys %do_not_update ; - print "\nEntities ignored:\n$ignored\n\n" ; - - if (! $updates_found) - { print "No new updates found\n" ; } - else - { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } - - return ($updates_found) ; -} - -sub ReadDataReachPerRegion -{ - my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ; - undef %months ; undef %data ; undef @regions ; - open IN, '<', "$dir_analytics/$file_comscore_master" ; + open IN, '<', "$dir_upd/$file_comscore_master" ; $lines = 0 ; while ($line = <IN>) { chomp $line ; + $line =~ s/\r//g ; ($yyyymm,@data) = split (',', $line) ; if ($lines++ == 0) - { @regions = @data ; next ; } + { + @regions = @data ; + print "Regions found: " . (join ',', @regions) . "\n"; + next ; + } $field_ndx = 0 ; foreach (@data) @@ -267,11 +162,11 @@ } close IN ; - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ; return if ! $updates_found ; - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; - open OUT, '>', "$dir_analytics/$file_comscore_master" ; + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; + open OUT, '>', "$dir_upd/$file_comscore_master" ; $line_out = "yyyymm" ; foreach $region_name (@regions) @@ -292,27 +187,36 @@ close OUT ; } -sub ReadDataVisitorsPerRegion +sub ReadMasterComscoreDataVisitorsPerRegion { my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; + print "ReadMasterComscoreDataVisitorsPerRegion\n\n"; + undef %months ; undef %data ; undef @regions ; - open IN, '<', "$dir_analytics/$file_comscore_master" ; + open IN, '<', "$dir_upd/$file_comscore_master" ; $lines = 0 ; $metric = 'unique_visitors' ; while ($line = <IN>) { chomp $line ; + $line =~ s/\r//g ; $line = &GetNumberOnly ($line) ; + next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ; + ($yyyymm,@data) = split (',', $line) ; if ($lines++ == 0) - { @regions = @data ; next ; } + { + @regions = @data ; + print "Regions found: " . (join ',', @regions) . "\n"; + next ; + } $field_ndx = 0 ; foreach (@data) @@ -334,11 +238,11 @@ } close IN ; - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ; return if ! $updates_found ; - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; - open OUT, '>', "$dir_analytics/$file_comscore_master" ; + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; + open OUT, '>', "$dir_upd/$file_comscore_master" ; $line_out = "yyyymm" ; foreach $region_name (@regions) @@ -359,21 +263,24 @@ close OUT ; } -sub ReadDataVisitorsPerProperty +sub ReadMasterComscoreDataVisitorsPerProperty { my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ; + print "ReadMasterComscoreDataVisitorsPerProperty\n\n"; + undef %months ; undef %data ; undef @properties ; - open IN, '<', "$dir_analytics/$file_comscore_master" ; + open IN, '<', "$dir_upd/$file_comscore_master" ; $lines = 0 ; $metric = 'unique_visitors' ; while ($line = <IN>) { chomp $line ; + $line =~ s/\r//g ; ($yyyymm,@data) = split (',', $line) ; if ($lines++ == 0) @@ -398,11 +305,11 @@ } close IN ; - my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; + my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ; return if ! $updates_found ; - rename "$dir_analytics/$file_comscore_master", "$dir_analytics/$file_comscore_master.~" ; - open OUT, '>', "$dir_analytics/$file_comscore_master" ; + rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ; + open OUT, '>', "$dir_upd/$file_comscore_master" ; $line_out = "yyyymm" ; foreach $property (@properties) @@ -423,10 +330,163 @@ close OUT ; } +sub UpdateMasterFileFromRecentComscoreData +{ + my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ; + + print "UpdateMasterFileFromRecentComscoreData\n\n"; + + undef %white_list ; + undef %not_white_listed ; + + print "White list: ". (join (',', @white_list)) . "\n\n"; + + foreach $id (@white_list) + { $white_list {$id} = $true ; } + + if (! -e "$dir_upd/$file_comscore_master") + { Abort ("File $file_comscore_master not found!") ; } + + $age_all = -M "$dir_upd/$file_comscore_master" ; + print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ; + + my $cwd = getcwd ; + chdir $dir_in ; + + @files = glob($file_comscore_updates) ; + $min_age_upd = 999999 ; + $file_comscore_updates_latest = '' ; + foreach $file (@files) + { + $age = -M $file ; + if ($age < $min_age_upd) + { + $min_age_upd = $age ; + $file_comscore_updates_latest = $file ; + } + } + print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ; + + if ($min_age_upd == 999999) + { + print "No valid update file found. Nothing to update." ; + return ; + } + + #if ($age_all > $min_age_upd) + #{ + # print "File with master data more recent than latest update csv from comScore. Nothing to update." ; + # return ; + #} + + my $updates_found = $false ; + + open CSV, '<', $file_comscore_updates_latest ; + binmode CSV ; + while ($line = <CSV>) + { + chomp $line ; + $line =~ s/\r//g ; + $line = &GetNumberOnly ($line) ; + + if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...' + { + if ($layout_csv == $layout_csv_properties) + { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file + else + { ($dummy1,$dummy2,@months) = split (',', $line) ; } # uv / reach csv files + + @months = &mmm_yyyy2yyyy_mm (@months) ; + } + + if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/)) + { + if ($layout_csv == $layout_csv_properties) + { + ($index,$dummy,$property,@data) = split (',', $line) ; + $property =~ s/^\s+// ; + $property =~ s/\s+$// ; + + $property =~ s/.*Total Internet.*/Total Internet/i ; + $property =~ s/.*Google.*/Google/i ; + $property =~ s/.*Microsoft.*/Microsoft/i ; + $property =~ s/.*FACEBOOK.*/Facebook/i ; + $property =~ s/.*Yahoo.*/Yahoo/i ; + $property =~ s/.*Amazon.*/Amazon/i ; + $property =~ s/.*Apple.*/Apple/i ; + $property =~ s/.*AOL.*/AOL/i ; + $property =~ s/.*Wikimedia.*/Wikimedia/i ; + $property =~ s/.*Tencent.*/Tencent/i ; + $property =~ s/.*Baidu.*/Baidu/i ; + $property =~ s/.*CBS.*/CBS/i ; + + if (! $white_list {$property}) + { + $not_white_listed {$property}++ ; + next ; + } + + $id = $property ; + } + else + { + ($index,$region,@data) = split (',', $line) ; + $region =~ s/^\s+// ; + $region =~ s/\s+$// ; + + if (! $white_list {$region}) + { + $not_white_listed {$region}++ ; + next ; + } + + $id = $region_codes {$region} ; + } + + for ($m = 0 ; $m <= $#months ; $m++) + { + $yyyymm = $months [$m] ; + $months {$yyyymm} ++ ; + $yyyymm_id = "$yyyymm,$id" ; + $data = $data [$m] * $multiplier ; + + if ($mode eq 'add') + { + if (! defined $data {$yyyymm_id}) + { + $updates_found = $true ; + print "New data found: $yyyymm_id = $data\n" ; + $data {$yyyymm_id} = $data ; + } + } + else + { + $updates_found = $true ; + print "Data found: $yyyymm_id = $data\n" ; + $data {$yyyymm_id} = $data ; + } + } + } + } + + $entities_not_white_listed = join (', ', sort keys %not_white_listed) ; + if ($entities_not_white_listed ne '') + { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; } + + if (! $updates_found) + { print "No new updates found\n" ; } + else + { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; } + + return ($updates_found) ; +} + sub WriteDataAnalytics { - open OUT, '>', "$dir_analytics/analytics_in_comscore.csv" ; + print "WriteDataAnalytics\n\n"; + open OUT, '>', "$dir_out/analytics_in_comscore.csv" ; + $metric = 'unique_visitors' ; foreach $yyyymm (sort keys %months) { @@ -447,7 +507,7 @@ $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ; print OUT $line ; - # print $line ; + print $line ; } foreach $property (sort @properties) @@ -478,14 +538,18 @@ sub mmm_yyyy2yyyy_mm { my @months = @_ ; + my ($m) ; # Jan -> 01, etc - foreach my $month (@months) + foreach $month (@months) { my ($mmm,$yyyy) = split ('-', $month) ; for ($m = 0 ; $m <= $#months_short ; $m++) { if ($mmm eq $months_short [$m]) - { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; } + { + $month = "$yyyy-" . sprintf ("%02d", $m+1) ; + last ; + } } } return @months ; Modified: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl =================================================================== --- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl 2011-07-06 00:24:59 UTC (rev 91508) +++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl 2011-07-06 00:37:49 UTC (rev 91509) @@ -39,6 +39,7 @@ &ParseArguments ; &ReadStatisticsMonthly ; + &FindLargestWikis ; &WriteMonthlyData ; print "\nReady\n\n" ; @@ -54,15 +55,20 @@ { $arguments .= " -$arg " . $options {$arg} . "\n" ; } print ("\nArguments\n$arguments\n") ; -# $options {"i"} = "w:/# out bayes" ; # EZ test -# $options {"o"} = "c:/MySQL/analytics" ; # EZ test + if (! -d '/mnt/') # EZ test + { + $path_in = "c:/\@ wikimedia/# out bayes" ; + $path_out = "c:/MySQL/analytics" ; + } + else + { + die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; + die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; - die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ; - die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ; + $path_in = $options {"i"} ; + $path_out = $options {"o"} ; + } - $path_in = $options {"i"} ; - $path_out = $options {"o"} ; - die "Input folder '$path_in' does not exist" if (! -d $path_in) ; die "Output folder '$path_out' does not exist" if (! -d $path_out) ; @@ -158,6 +164,8 @@ ($month,$day,$year) = split ('\/', $date) ; $yyyymm = sprintf ("%04d-%02d", $year, $month) ; + $months {$yyyymm} ++ ; +# print "YYYYMM $yyyymm\n" ; # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm # count user with over x edits @@ -167,6 +175,11 @@ $edits_ge_25 = @counts [4] > 0 ? @counts [4] : 0 ; $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ; $data2 {"$project,$language,$yyyymm"} = "$edits_ge_5,$edits_ge_25,$edits_ge_100" ; + + $total_edits_ge_5 {"$project,$language"} += $edits_ge_5 ; + $total_edits_ge_25 {"$project,$language"} += $edits_ge_25 ; + $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ; + # prep string with right amount of comma's if ($data2_default eq '') { @@ -237,12 +250,46 @@ # } #} +sub FindLargestWikis +{ + print "Largest projects (most accumulated very active editors):\n"; + @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ; + $rank = 0 ; + foreach $project_language (@total_edits_ge_100) + { + $largest_projects {$project_language} = $rank++ ; + print "$project_language," ; + last if $rank > 10 ; + } + print "\n\n" ; + + foreach $yyyymm (sort keys %months) + { + next if $yyyymm lt '2011' ; + foreach $project_language (keys %largest_projects) + { + ($project,$language) = split (',', $project_language) ; + if ($data2 {"$project,$language,$yyyymm"} eq '') + { + print "No data yet for large wiki $project_language for $yyyymm-> skip month $yyyymm\n" ; + $months {$yyyymm} = 0 ; + } + } + } + exit ; +} + sub WriteMonthlyData { my $file_csv_out = "$path_out/$file_csv_analytics_in" ; open CSV_OUT, '>', $file_csv_out ; foreach $project_wiki_month (sort keys %data1) { + ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ; + + # recent month misses on eor more large wikis? + next if $months {$yyyymm} == 0 ; + $data1 = $data1 {$project_wiki_month} ; $data2 = $data2 {$project_wiki_month} ; if ($data2 eq '') @@ -250,9 +297,12 @@ print "Editor data missing for $project_wiki_month\n" ; $data2 = $data2_default ; } - $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields follow each other + $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together print CSV_OUT "$project_wiki_month,$data1\n" ; } + $total_edits_ge_5 {"$project,*,$yyyymm"} += $edits_ge_5 ; + $total_edits_ge_25 {"$project,*,$yyyymm"} += $edits_ge_25 ; + $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ; close CSV_OUT ; } Added: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt =================================================================== --- trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt (rev 0) +++ trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt 2011-07-06 00:37:49 UTC (rev 91509) @@ -0,0 +1,179 @@ +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) + +-- tables implemented: +-- comscore +-- comscore_regions +-- wikistats +-- page_views +-- language names +-- binaries + +-- more tables planned (O= optional, not needed for report card stage) +-- project_names +-- O edits per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) +-- O editors per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other) + + +-- open issues: +-- only store basic data in database and calculate all aggregates on the fly or do some aggragation before hand ? (e.g. count for mobile / non-mobile / ==> total of both ? <==) +-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month) + +-- Create database and two tables from scratch +DROP DATABASE IF EXISTS `analytics` ; + +CREATE DATABASE `analytics` ; + +USE `analytics` ; + +CREATE TABLE `comscore` ( + `date` date NOT NULL, + `country_code` varchar (3), + `region_code` varchar (3), + `web_property` varchar (20), + `project_code` varchar (10), + `reach` decimal (4,1) DEFAULT NULL, + `visitors` decimal (15) DEFAULT NULL, + PRIMARY KEY (date,country_code,region_code,project_code,web_property), + KEY (`country_code`) +) ; + +CREATE TABLE `comscore_regions` ( + `region_code` varchar (2), + `report_language` varchar (10), + `region_name` varchar (18), + PRIMARY KEY (report_language,region_code) +) ; + +CREATE TABLE `wikistats` ( + `date` date NOT NULL, + `project_code` varchar (10), + `language_code` varchar (15), + `editors_all_time` int (10) DEFAULT NULL, + `editors_new` int (7) DEFAULT NULL, + `editors_ge_5` int (7) DEFAULT NULL, + `editors_ge_25` int (7) DEFAULT NULL, + `editors_ge_100` int (7) DEFAULT NULL, + `articles` int (12) DEFAULT NULL, + `articles_new_per_day` int (9) DEFAULT NULL, + `articles_over_bytes_500` int (12) DEFAULT NULL, + `articles_over_bytes_2000` int (12) DEFAULT NULL, + `edits_per_article` decimal (9,1) DEFAULT NULL, + `bytes_per_article` decimal (9,1) DEFAULT NULL, + `edits` int (12) DEFAULT NULL, + `size_in_bytes` int (15) DEFAULT NULL, + `size_in_words` int (15) DEFAULT NULL, + `links_internal` int (15) DEFAULT NULL, + `links_interwiki` int (15) DEFAULT NULL, + `links_image` int (15) DEFAULT NULL, + `links_external` int (15) DEFAULT NULL, + `redirects` int (15) DEFAULT NULL, + PRIMARY KEY (date,project_code,language_code) +) ; + +CREATE TABLE `page_views` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + `views_non_mobile_raw` bigint (15), + `views_mobile_raw` bigint (15), + `views_non_mobile_normalized` bigint (15), + `views_mobile_normalized` bigint (15), + `views_raw` bigint (15), + `views_normalized` bigint (15), + PRIMARY KEY (date,project_code,language_code) +) ; + +CREATE TABLE `language_names` ( + `report_language` varchar (15), + `language_code` varchar (15), + `language_name` varchar (50), + PRIMARY KEY (report_language,language_code) +) ; + +CREATE TABLE `binaries` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + `extension` varchar (10), + `binaries` bigint (15), + PRIMARY KEY (date,project_code,language_code,extension) +) ; + +CREATE TABLE `offline` ( + `date` date NOT NULL, + `readers` bigint (12), + PRIMARY KEY (date,readers) +) ; + +-- SHOW TABLES ; +-- DESCRIBE comscore ; +-- DESCRIBE comscore_regions ; +-- DESCRIBE wikistats ; +-- DESCRIBE page_views ; +-- DESCRIBE language_names ; +-- DESCRIBE binaries ; +-- DESCRIBE offline ; + +-- Database Manipulation +-- Obviously in real world this is a separate script + +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' + INTO TABLE comscore + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (@date,country_code,region_code,web_property,project_code,reach,visitors) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' + INTO TABLE comscore_regions + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,region_code,region_name) ; + +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' + INTO TABLE wikistats + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' + INTO TABLE page_views + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + + +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' + INTO TABLE language_names + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,language_code,language_name) ; + +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' + INTO TABLE binaries + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,extension,binaries) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' + INTO TABLE offline + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (@date,readers) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + + +-- show contents (debugging only) +-- SELECT * FROM comscore ; +-- SELECT * FROM comscore_regions ; +-- SELECT * FROM wikistats ; +-- SELECT * FROM page_views ; +-- SELECT * FROM language_names ; +-- SELECT * FROM binaries +-- WHERE project_code = 'commons' ; + SELECT * FROM offline ; + + Added: trunk/wikistats/analytics/analytics_generate_csv_files.sh =================================================================== --- trunk/wikistats/analytics/analytics_generate_csv_files.sh (rev 0) +++ trunk/wikistats/analytics/analytics_generate_csv_files.sh 2011-07-06 00:37:49 UTC (rev 91509) @@ -0,0 +1,21 @@ +#!/bin/sh +ulimit -v 8000000 + +clear +cd /a/analytics + +perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/test/ + +# add or replace data from newest comScore csv files (last 14 months) into master files (full history) +# and generate database input csv file from it + +# -r replace (default is add only) +# -i input folder, contains manually downloaded csv files from comScore (or xls, converted to csv) +# -m master files with full history +# -o output csv file, with reach and UV's per region and UV's per top web property, ready for import into database +perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics + +perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics + +cp /a/wikistats/csv_wp/analytics_in_page_views.csv . + Added: trunk/wikistats/analytics/analytics_new.sh =================================================================== --- trunk/wikistats/analytics/analytics_new.sh (rev 0) +++ trunk/wikistats/analytics/analytics_new.sh 2011-07-06 00:37:49 UTC (rev 91509) @@ -0,0 +1,5 @@ +clear +cd /a/analytics +# rm *test*.csv +# mysql --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt + mysql -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt Added: trunk/wikistats/analytics/analytics_refresh_from_csv.txt =================================================================== --- trunk/wikistats/analytics/analytics_refresh_from_csv.txt (rev 0) +++ trunk/wikistats/analytics/analytics_refresh_from_csv.txt 2011-07-06 00:37:49 UTC (rev 91509) @@ -0,0 +1,145 @@ +-- make sure to delete output files *test*.csv first if any exist (MySQL on purpose forbids overwrite) + +USE `analytics` ; + +CREATE TABLE `comscore` ( + `date` date NOT NULL, + `country_code` varchar (3), + `region_code` varchar (3), + `web_property` varchar (20), + `project_code` varchar (10), + `reach` decimal (4,1) DEFAULT NULL, + `visitors` decimal (15) DEFAULT NULL, + PRIMARY KEY (date,country_code,region_code,project_code,web_property), + KEY (`country_code`) +) ; + +CREATE TABLE `comscore_regions` ( + `region_code` varchar (2), + `report_language` varchar (10), + `region_name` varchar (18), + PRIMARY KEY (report_language,region_code) +) ; + +CREATE TABLE `wikistats` ( + `date` date NOT NULL, + `project_code` varchar (10), + `language_code` varchar (15), + `editors_all_time` int (10) DEFAULT NULL, + `editors_new` int (7) DEFAULT NULL, + `editors_ge_5` int (7) DEFAULT NULL, + `editors_ge_25` int (7) DEFAULT NULL, + `editors_ge_100` int (7) DEFAULT NULL, + `articles` int (12) DEFAULT NULL, + `articles_new_per_day` int (9) DEFAULT NULL, + `articles_over_bytes_500` int (12) DEFAULT NULL, + `articles_over_bytes_2000` int (12) DEFAULT NULL, + `edits_per_article` decimal (9,1) DEFAULT NULL, + `bytes_per_article` decimal (9,1) DEFAULT NULL, + `edits` int (12) DEFAULT NULL, + `size_in_bytes` int (15) DEFAULT NULL, + `size_in_words` int (15) DEFAULT NULL, + `links_internal` int (15) DEFAULT NULL, + `links_interwiki` int (15) DEFAULT NULL, + `links_image` int (15) DEFAULT NULL, + `links_external` int (15) DEFAULT NULL, + `redirects` int (15) DEFAULT NULL, + PRIMARY KEY (date,project_code,language_code) +) ; + +CREATE TABLE `page_views` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + `views_non_mobile_raw` bigint (15), + `views_mobile_raw` bigint (15), + `views_non_mobile_normalized` bigint (15), + `views_mobile_normalized` bigint (15), + `views_raw` bigint (15), + `views_normalized` bigint (15), + PRIMARY KEY (date,project_code,language_code) +) ; + +CREATE TABLE `language_names` ( + `report_language` varchar (15), + `language_code` varchar (15), + `language_name` varchar (50), + PRIMARY KEY (report_language,language_code) +) ; + +CREATE TABLE `binaries` ( + `date` date NOT NULL, + `project_code` char (2), + `language_code` char (15), + `extension` varchar (10), + `binaries` bigint (15), + PRIMARY KEY (date,project_code,language_code,extension) +) ; + +CREATE TABLE `offline` ( + `date` date NOT NULL, + `readers` bigint (12), + PRIMARY KEY (date,readers) +) ; + + +-- SHOW TABLES ; +-- DESCRIBE comscore ; +-- DESCRIBE comscore_regions ; +-- DESCRIBE wikistats ; +-- DESCRIBE page_views ; +-- DESCRIBE language_names ; +-- DESCRIBE binaries ; +-- DESCRIBE offline ; + +-- Database Manipulation +-- Obviously in real world this is a separate script + +TRUNCATE TABLE 'analytics_in_comscore.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' + INTO TABLE comscore + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (@date,country_code,region_code,web_property,project_code,reach,visitors) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE 'analytics_in_comscore_regions.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' + INTO TABLE comscore_regions + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,region_code,region_name) ; + +TRUNCATE TABLE 'analytics_in_wikistats.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' + INTO TABLE wikistats + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE 'analytics_in_page_views.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' + INTO TABLE page_views + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + + +TRUNCATE TABLE 'analytics_in_language_names.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' + INTO TABLE language_names + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (report_language,language_code,language_name) ; + +TRUNCATE TABLE 'analytics_in_binaries.csv' ; +LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' + INTO TABLE binaries + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + (project_code,language_code,@date,extension,binaries) + SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ; + +TRUNCATE TABLE 'analytics_in_offline.csv' ; Added: trunk/wikistats/analytics/analytics_upd.sh =================================================================== --- trunk/wikistats/analytics/analytics_upd.sh (rev 0) +++ trunk/wikistats/analytics/analytics_upd.sh 2011-07-06 00:37:49 UTC (rev 91509) @@ -0,0 +1,6 @@ +clear +cd /a/analytics + +./analytics_generate_csv_files.sh + +mysql -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt \ No newline at end of file _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs