https://www.mediawiki.org/wiki/Special:Code/MediaWiki/105444

Revision: 105444
Author:   diederik
Date:     2011-12-07 17:36:45 +0000 (Wed, 07 Dec 2011)
Log Message:
-----------
Moved reportcard import utils from trunk/wikistats/analytics to 
trunk/extensions/MetricsReporting

Added Paths:
-----------
    trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl
    trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl
    trunk/extensions/MetricsReporting/import/_readme.txt
    
trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt
    trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh
    trunk/extensions/MetricsReporting/import/analytics_new.sh
    trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt
    trunk/extensions/MetricsReporting/import/analytics_upd.sh

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl 
(from rev 105441, trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl       
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepBinariesData.pl       
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,124 @@
#!/usr/local/bin/perl

# Collect binary-upload (media file) statistics per project family and write
# them as one csv file (analytics_in_binaries.csv) for import into the
# analytics database. Input/output folders come from -i / -o (see ParseArguments).

  use Getopt::Std ;

  &ParseArguments ;

  print "Write file '$file_csv_out'\n" ;
  # abort right away when the output file cannot be created (open was unchecked)
  open CSV_OUT, '>', $file_csv_out
    or die "Cannot write file '$file_csv_out': $!" ;

  # one input csv per project family (wikibooks, wiktionary, wikinews, wikipedia, ...)
  foreach $project (qw (wb wk wn wp wq ws wv wx))
  { &ReadStatisticsPerBinariesExtension ($project) ; }

  close CSV_OUT  ;

  print "\n\nReady\n\n" ;
  exit ;
+
sub ParseArguments
{
  # Parse -i (input folder) and -o (output folder), validate both, and set
  # the file-level globals $path_in, $path_out and $file_csv_out.
  # Fix: the original declared 'my @options' (a dead array) while getopt
  # filled the package-global %options; declare the hash lexically instead.
  my %options ;
  getopt ("io", \%options) ;

  die ("Specify input folder as: -i path")   if (! defined ($options {"i"})) ;
  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;

  $path_in  = $options {"i"} ;
  $path_out = $options {"o"} ;

  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;

  # tests only
  # $path_in  = "C:/@ Wikimedia/# Out Bayes" ;
  # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;

  print "Input  folder: $path_in\n" ;
  print "Output folder: $path_out\n" ;
  print "\n" ;

  $file_csv_out = "$path_out/analytics_in_binaries.csv" ;
}
+
+
sub ReadStatisticsPerBinariesExtension
{
  # Read one project's StatisticsPerBinariesExtension.csv and emit monthly
  # per-extension counts for each language wiki via &WriteMonthlyData.
  # Reads file-level global $path_in; writes to the already-open CSV_OUT handle
  # (indirectly, via &WriteMonthlyData). Fills globals %ext_name, %ext_ndx,
  # %ext_cnt, %months, $yyyymm, $yyyymm_hi as side effects.
  my $project = shift ;
  my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" ;
  $yyyymm_hi = -1 ; # highest (latest) month seen; also read by &WriteMonthlyData

  if (! -e $file_csv_in)
  { die "Input file '$file_csv_in' not found" ; }


  print "Read '$file_csv_in'\n" ;
  open CSV_IN, '<', $file_csv_in ; # NOTE(review): open is unchecked; a failure silently yields an empty loop

  $language_prev = '' ;
  while ($line = <CSV_IN>)
  {
    chomp $line ;
    next if $line !~ /,.*?,/ ; # need at least language,date,data

    ($language,$date,$data) = split (',', $line, 3) ;

    # for each wiki the first line lists extension names, not counts
    if ($date eq "00/0000")
    {
      # starting a new wiki: flush the data collected for the previous one
      if ($language_prev ne '')
      { &WriteMonthlyData ($project, $language_prev) ; }
      $language_prev = $language ;

      undef %ext_name ;
      undef %ext_ndx ;
      undef %ext_cnt ;
      undef %months ;

      # build column-index <-> extension-name maps for this wiki
      @exts = split (',', $data) ;
      $ndx = 0 ;
      foreach $ext (@exts)
      {
        $ext_name {$ndx} = $ext ;
        $ext_ndx  {$ext} = $ndx ;
        $ndx ++ ;
      }
      next ;
    }

    # data line: date is mm/yyyy, convert to sortable yyyy-mm form
    ($month,$year) = split ('\/', $date) ;
    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
    if ($yyyymm gt $yyyymm_hi)
    { $yyyymm_hi = $yyyymm ; }
    $months {$yyyymm}++ ;

    # store one count per extension column for this month
    @counts = split (',', $data) ;
    $ndx = 0 ;
    foreach $count (@counts)
    {
      $ext_cnt {$yyyymm}{$ext_name {$ndx}}  = $count ;
      $ndx ++ ;
    }
  }
  # flush the last wiki in the file
  &WriteMonthlyData ($project, $language_prev) ;

  close CSV_IN ;
}
+
sub WriteMonthlyData
{
  # Write one csv row per (month, extension) for the given wiki to CSV_OUT,
  # extensions ordered by their count in the most recent month ($yyyymm_hi),
  # capped at 26 extensions. Reads file-level globals %ext_cnt, %ext_name,
  # %months, $yyyymm_hi and $yyyymm.
  my ($project,$language) = @_ ;
  # get sorted array of extensions, order by count for most recent month
  %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ;
  @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi {$a}} keys %ext_cnt_yyyymm_hi) ;

  foreach $month (sort keys %months)
  {
    $ndx = 0 ;
    foreach $ext (@ext_cnt_yyyymm_hi)
    {
      # NOTE(review): the printed count is $ext_cnt{$yyyymm}{$ext_name{$ndx}},
      # i.e. keyed on the *last parsed* month ($yyyymm, not the loop's $month)
      # and on the positional name $ext_name{$ndx} rather than $ext itself.
      # This looks like it was meant to be $ext_cnt{$month}{$ext} — confirm
      # against expected output before changing.
      print CSV_OUT "$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
    # print         "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
      last if (++ $ndx > 25) ;
    }
  }
}

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl 
(from rev 105441, trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl       
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepComscoreData.pl       
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,564 @@
#!/usr/bin/perl

# Copyright (C) 2011 Wikimedia Foundation
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details, at
# http://www.fsf.org/licenses/gpl.html

# Author:
# Erik Zachte, email ezac...@wikimedia.org

# Functionality:
# comScore data can be downloaded as csv files, each of which contains 14 months of history
# This script uses these files to update 'master' csv files which contain all known history
# Note: only entities which are already in the master file will be updated!
# Then it merges these master files into one csv file which can be loaded into the analytics database
# Data are: reach by region, unique visitors by region, unique visitors by web property

# Parameters (doc fixed to match the code, which reads -i/-m/-o/-r):
# -i (required) folder with 'update' csv files (latest 14 months history, produced by comScore)
# -m (required) folder with 'master' csv files (files with all known history)
# -o (required) folder for the output file (database feed)
# -r (optional) replace existing data points instead of only adding new ones

# Output:
# updated master csv files + merged and formatted csv for import in MySQL

# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/

  use Getopt::Std ;
  use Cwd;

  # Fix: the original declared 'my $options' (a dead scalar) while getopt
  # filled the package-global %options; declare the hash itself.
  my %options ;
  getopt ("imo", \%options) ;

  $true  = 1 ;
  $false = 0 ;

  $script_name    = "AnalyticsPrepComscoreData.pl" ;
  $script_version = "0.31" ;

# EZ test only
# $source       = "comscore" ;
# $server       = "ez_test" ;
# $generated    = "2011-05-06 00:00:00" ;
# $user         = "ezachte" ;

  $dir_in   = $options {"i"} ;
  $dir_upd  = $options {"m"} ;
  $dir_out  = $options {"o"} ;
  # -r (a bare flag) switches from 'only add new data points' to 'replace existing'
  $mode = 'add';
  if (defined $options {'r'})
  { $mode = 'replace'; }

  print "Mode is $mode (specify '-r' for replace)\n\n";

  if (! -d "/home/") # EZ test machine
  {
    $dir_in  = "C:/@ Wikimedia/@ Report Card/Data" ;
    $dir_upd = "C:/MySQL/analytics" ;
    $dir_out = "C:/MySQL/analytics" ;
    $mode = 'replace' ;
  }

  if ($dir_in eq '')
  { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
  if ($dir_upd eq '')
  { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
  if ($dir_out eq '')
  { Abort ("Specify folder for output file '-o folder'") ; }

  # master file names (full history) and glob patterns for comScore updates
  $file_comscore_reach_master     = "history_comscore_reach_regions.csv" ;
  $file_comscore_reach_update     = "*reach*by*region*csv" ;
  $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
  $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
  $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
  $file_comscore_uv_property_update = "*UV*trend*csv" ;

  # the three csv layouts comScore delivers
  $layout_csv_reach      = 1 ;
  $layout_csv_regions    = 2 ;
  $layout_csv_properties = 3 ;

  print "Directories:\n" .
        "Input (new comScore data): '$dir_in'\n".
        "Master files (full history): '$dir_upd'\n" .
        "Output (database feed): '$dir_out'\n\n" ;

  %region_codes = (
    "Europe"=>"EU",
    "North America"=>"NA",
    "Latin America"=>"LA",
    "World-Wide" => "W",
    "Middle East - Africa" => "MA",
    "Asia Pacific"=> "AS",
    "United States" => "US",
    "India" => "I",
    "China" => "C"
  ) ;

  # reverse lookup: code -> region name
  foreach $region_name (keys %region_codes)
  { $region_names {$region_codes {$region_name}} = $region_name ; }

  @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;

  # each ReadMaster... call fills %data; copy it before the next call overwrites it
  &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, $file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
  %reach_region_code = %data ;

  &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, $file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
  %visitors_region_code = %data ;

  &ReadMasterComscoreDataVisitorsPerProperty ($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
  %visitors_web_property = %data ;

  &WriteDataAnalytics ;

  print "\nReady\n\n" ;
  exit ;
+
sub ReadMasterComscoreDataReachPerRegion
{
  # Load the master csv with reach-%-per-region history into %data
  # (key "yyyymm,region_code"), merge in the newest comScore update file,
  # and rewrite the master file (most recent month first) when updates exist.
  # Fix: the call site passes five arguments (..., $precision, $multiplier,
  # $layout_csv) but the original signature omitted $multiplier, so
  # $layout_csv silently received the multiplier value. Signature now matches
  # the sibling ReadMaster... subs. Both opens are now checked.
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataReachPerRegion\n\n" ;

  undef %months ;
  undef %data ;
  undef @regions ;

  open IN,  '<', "$dir_upd/$file_comscore_master"
    or die "Cannot read master file '$dir_upd/$file_comscore_master': $!" ;

  $lines = 0 ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;

    ($yyyymm,@data) = split (',', $line) ;

    # first line is the header with region names
    if ($lines++ == 0)
    {
      @regions = @data ;
      print "Regions found: " . (join ',', @regions) . "\n";
      next ;
    }

    $field_ndx = 0 ;
    foreach (@data)
    {
      $region      = $regions [$field_ndx] ;
      $region_code = $region_codes {$region} ;

      $data      = $data [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; } # missing value -> 0
      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$region_code"} = $data ;
      # print "Old data $yyyymm,$region_code = $data\n" ;
      $field_ndx++ ;
    }
  }
  close IN ;

  # merge the latest update csv; reach values are percentages ($multiplier is 1)
  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @regions) ;
  return if ! $updates_found ;

  # keep the previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT,  '>', "$dir_upd/$file_comscore_master"
    or die "Cannot write master file '$dir_upd/$file_comscore_master': $!" ;

  $line_out = "yyyymm" ;
  foreach $region_name (@regions)
  { $line_out .= ",$region_name" ; }
  print OUT "$line_out" ;

  # most recent month first
  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $region_name (@regions)
    {
      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
+
sub ReadMasterComscoreDataVisitorsPerRegion
{
  # Load the master csv with unique-visitor counts per region into %data
  # (key "yyyymm,region_code"), merge in the newest comScore update file,
  # and rewrite the master file (most recent month first) when updates exist.
  # NOTE(review): $multiplier is accepted but never applied to master data;
  # scaling happens in UpdateMasterFileFromRecentComscoreData (hardcoded 1000
  # below) — confirm that is intended.
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataVisitorsPerRegion\n\n";

  undef %months ;
  undef %data ;
  undef @regions ;

  open IN,  '<', "$dir_upd/$file_comscore_master" ; # NOTE(review): open is unchecked

  $lines  = 0 ;
  $metric = 'unique_visitors' ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;
    $line = &GetNumberOnly ($line) ; # strip quotes and embedded thousands separators

    # keep only the header line ('yyyymm...') and data lines starting with yyyy-mm
    next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;

    ($yyyymm,@data) = split (',', $line) ;

    # first line is the header with region names
    if ($lines++ == 0)
    {
      @regions = @data ;
      print "Regions found: " . (join ',', @regions) . "\n";
      next ;
    }

    $field_ndx = 0 ;
    foreach (@data)
    {
      $region      = $regions [$field_ndx] ;
      $region_code = $region_codes {$region} ;

      $data      = $data    [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; } # missing value -> 0

      # print "Old data $yyyymm,$region = $data\n" ;

      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$region_code"} = $data ;

      $field_ndx++ ;
    }
  }
  close IN ;

  # merge the latest comScore update (update values are in thousands, hence 1000)
  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ;
  return if ! $updates_found ;

  # keep the previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT,  '>', "$dir_upd/$file_comscore_master" ;

  $line_out = "yyyymm" ;
  foreach $region_name (@regions)
  { $line_out .= ",$region_name" ; }
  print OUT "$line_out" ;

  # most recent month first
  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $region_name (@regions)
    {
      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
+
sub ReadMasterComscoreDataVisitorsPerProperty
{
  # Load the master csv with unique-visitor counts per web property into
  # %data (key "yyyymm,property"), merge in the newest comScore update file,
  # and rewrite the master file (most recent month first) when updates exist.
  # NOTE(review): $multiplier is accepted but never applied here; scaling is
  # done in UpdateMasterFileFromRecentComscoreData (hardcoded 1000 below).
  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;

  print "ReadMasterComscoreDataVisitorsPerProperty\n\n";

  undef %months ;
  undef %data ;
  undef @properties ;

  open IN,  '<', "$dir_upd/$file_comscore_master" ; # NOTE(review): open is unchecked

  $lines = 0 ;
  $metric       = 'unique_visitors' ;
  while ($line = <IN>)
  {
    chomp $line ;
    $line =~ s/\r//g ;

    ($yyyymm,@data) = split (',', $line) ;
    # first line is the header with property names
    if ($lines++ == 0)
    { @properties = @data ; next ; }

    $field_ndx = 0 ;
    foreach (@data)
    {
      $property = $properties [$field_ndx] ;
      # normalize comScore's verbose Yahoo label for the %data key only;
      # NOTE(review): @properties itself keeps the verbose name, so the
      # rewrite loop below looks up $data with the *unnormalized* name —
      # verify Yahoo history survives a rewrite.
      $property =~ s/.*Yahoo.*/Yahoo/ ;
      $data      = $data    [$field_ndx] ;
      if ($data eq '')
      { $data = '0' ; } # missing value -> 0

      # print "Old data $yyyymm,$property = $data\n" ;

      $months {$yyyymm} ++ ;
      $data {"$yyyymm,$property"} = $data ;

      $field_ndx++ ;
    }
  }
  close IN ;

  # merge the latest comScore update (update values are in thousands, hence 1000)
  my $updates_found = &UpdateMasterFileFromRecentComscoreData ($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) ;
  return if ! $updates_found ;

  # keep the previous master as backup, then rewrite with merged data
  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
  open OUT,  '>', "$dir_upd/$file_comscore_master" ;

  $line_out = "yyyymm" ;
  foreach $property (@properties)
  { $line_out .= ",$property" ; }
  print OUT "$line_out" ;

  # most recent month first
  foreach $yyyymm (sort {$b cmp $a} keys %months)
  {
    $line_out = "\n$yyyymm" ;
    foreach $property (@properties)
    {
      $yyyymm_property = "$yyyymm,$property" ;
      $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
    }
    print OUT "$line_out" ;
  }

  close OUT ;
}
+
sub UpdateMasterFileFromRecentComscoreData
{
  # Merge the newest comScore update csv (found in $dir_in via glob pattern
  # $file_comscore_updates) into the global %data / %months maps. Only
  # entities present in @white_list are accepted. In 'add' mode (global
  # $mode) only missing (yyyymm,id) data points are added; in 'replace' mode
  # every data point in the update file is taken.
  # Returns true when %data was changed, false/undef otherwise.
  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ;

  print "UpdateMasterFileFromRecentComscoreData\n\n";

  undef %white_list ;
  undef %not_white_listed ;

  print "White list: ". (join (',', @white_list)) . "\n\n";

  foreach $id (@white_list)
  { $white_list {$id} = $true ; }

  if (! -e "$dir_upd/$file_comscore_master")
  { Abort ("File $file_comscore_master not found!") ; }

  $age_all = -M "$dir_upd/$file_comscore_master" ;
  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " days old: '$file_comscore_master'\n" ;

  # NOTE(review): $cwd is saved but never restored after this chdir, so the
  # process stays in $dir_in for the remainder of the run.
  my $cwd = getcwd ;
  chdir $dir_in ;

  # pick the newest file matching the update glob pattern
  @files = glob($file_comscore_updates) ;
  $min_age_upd = 999999 ;
  $file_comscore_updates_latest = '' ;
  foreach $file (@files)
  {
    $age = -M $file ;
    if ($age < $min_age_upd)
    {
      $min_age_upd = $age ;
      $file_comscore_updates_latest = $file ;
    }
  }
  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " days old: '$file_comscore_updates_latest'\n" ;

  # sentinel unchanged -> no file matched the pattern
  if ($min_age_upd == 999999)
  {
    print "No valid update file found. Nothing to update." ;
    return ;
  }

  #if ($age_all > $min_age_upd)
  #{
  #  print "File with master data more recent than latest update csv from comScore. Nothing to update." ;
  #  return ;
  #}

  my $updates_found = $false ;

  open CSV, '<', $file_comscore_updates_latest ; # NOTE(review): open is unchecked and CSV is never closed
  binmode CSV ;
  while ($line = <CSV>)
  {
    chomp $line ;
    $line =~ s/\r//g ;
    $line = &GetNumberOnly ($line) ;

    # header row carries the month labels
    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
    {
      if ($layout_csv == $layout_csv_properties)
      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
      else
      { ($dummy1,$dummy2,@months) = split (',', $line) ; }         # uv / reach csv files

      @months = &mmm_yyyy2yyyy_mm (@months) ; # 'Jan-2010' -> '2010-01'
    }

    # data rows start with a rank number, or contain 'Total Internet'
    if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
    {
      if ($layout_csv == $layout_csv_properties)
      {
        ($index,$dummy,$property,@data) = split (',', $line) ;
        $property =~ s/^\s+// ;
        $property =~ s/\s+$// ;

        # collapse comScore's verbose property labels to canonical names
        $property =~ s/.*Total Internet.*/Total Internet/i ;
        $property =~ s/.*Google.*/Google/i ;
        $property =~ s/.*Microsoft.*/Microsoft/i ;
        $property =~ s/.*FACEBOOK.*/Facebook/i ;
        $property =~ s/.*Yahoo.*/Yahoo/i ;
        $property =~ s/.*Amazon.*/Amazon/i ;
        $property =~ s/.*Apple.*/Apple/i ;
        $property =~ s/.*AOL.*/AOL/i ;
        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
        $property =~ s/.*Tencent.*/Tencent/i ;
        $property =~ s/.*Baidu.*/Baidu/i ;
        $property =~ s/.*CBS.*/CBS/i ;

        if (! $white_list {$property})
        {
          $not_white_listed {$property}++ ;
          next ;
        }

        $id = $property ;
      }
      else
      {
        ($index,$region,@data) = split (',', $line) ;
        $region =~ s/^\s+// ;
        $region =~ s/\s+$// ;

        if (! $white_list {$region})
        {
          $not_white_listed {$region}++ ;
          next ;
        }

        $id = $region_codes {$region} ;
      }

      # store one data point per month column
      for ($m = 0 ; $m <= $#months ; $m++)
      {
        $yyyymm = $months [$m] ;
        $months {$yyyymm} ++ ;
        $yyyymm_id = "$yyyymm,$id" ;
        $data = $data [$m] * $multiplier ;

        if ($mode eq 'add')
        {
          # add mode: never overwrite existing history
          if (! defined $data {$yyyymm_id})
          {
            $updates_found = $true ;
            print "New data found: $yyyymm_id = $data\n" ;
            $data {$yyyymm_id} = $data ;
          }
        }
        else
        {
          # replace mode: always take the update file's value
          $updates_found = $true ;
          print "Data found: $yyyymm_id = $data\n" ;
          $data {$yyyymm_id} = $data ;
        }
      }
    }
  }

  $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
  if ($entities_not_white_listed ne '')
  { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }

  if (! $updates_found)
  { print "No new updates found\n" ; }
  else
  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; }

  return ($updates_found) ;
}
+
sub WriteDataAnalytics
{
  # Write the merged comScore metrics to analytics_in_comscore.csv: one row
  # per month per region (reach + unique visitors) and one row per month per
  # web property (unique visitors only). -1 marks 'metric not available'.
  # Reads globals filled earlier in the run: %months, %reach_region_code,
  # %visitors_region_code, %visitors_web_property, %region_codes, and
  # @properties (left over from the last master file read).
  print "WriteDataAnalytics\n\n";

  open OUT, '>', "$dir_out/analytics_in_comscore.csv" ; # NOTE(review): open unchecked, OUT never closed (flushed at exit)

  $metric = 'unique_visitors' ;
  foreach $yyyymm (sort keys %months)
  {
    # store meta data elsewhere
    # $line = "$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n" ;
    foreach $region_code (sort values %region_codes)
    {
      $country_code = '-' ;
      $property     = '-' ;
      $project      = '-' ;
      $reach        = $reach_region_code     {"$yyyymm,$region_code"} ;
      $visitors     = $visitors_region_code  {"$yyyymm,$region_code"} ;

      if (! defined $reach)    { $reach    = -1 ; }
      if (! defined $visitors) { $visitors = -1 ; }

      # skip months for which this region has no data at all
      next if $reach == -1 and $visitors == -1 ;

      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
      print OUT $line ;
      print     $line ;
    }

    foreach $property (sort @properties)
    {
      $country_code = '-' ;
      $region_code  = '-' ;
      $project      = '-' ;
      $reach        = '-1' ; # reach is not reported per web property
      $visitors     = $visitors_web_property {"$yyyymm,$property"} ;

      next if ! defined $visitors ;

      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
      print OUT $line ;
      # print     $line ;
    }
  }
}
+
sub GetNumberOnly
{
  # Normalize one csv line from comScore: remove thousands-separator commas
  # that appear inside double-quoted fields, then drop the quotes themselves,
  # e.g. 'a,"1,234",b' -> 'a,1234,b'.
  # Fix: the original used the package variable $a (sort's comparator
  # variable) as a scratch variable inside the s///e; use a lexical instead.
  my $line = shift ;
  $line =~ s{("[^"]+")}{ (my $field = $1) =~ s/,//g ; $field }ge ;
  $line =~ s/"//g ;
  return $line ;
}
+
sub mmm_yyyy2yyyy_mm
{
  # Convert month labels like 'Jan-2010' to sortable 'yyyy-mm' form
  # ('2010-01'). Labels with an unrecognized month name are left unchanged
  # (same as the original). Fix: uses an internal month table instead of the
  # file-level global @months_short, removing a hidden global dependency.
  my @months = @_ ;
  my %month_number ;
  my @short = qw (Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec) ;
  @month_number {@short} = (1..12) ;

  foreach my $month (@months)
  {
    my ($mmm,$yyyy) = split ('-', $month) ;
    if (defined $month_number {$mmm})
    { $month = "$yyyy-" . sprintf ("%02d", $month_number {$mmm}) ; }
  }
  return @months ;
}
+
sub Abort
{
  # Print the failure reason and terminate the script.
  # Fix: $msg was an undeclared package global; make it lexical.
  my $msg = shift ;

  print  "\nAbort, reason: $msg\n\n" ;
  exit ;
}

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl 
(from rev 105441, trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl      
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepLanguageNames.pl      
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,162 @@
#!/usr/bin/perl

# This module prepares a csv file with language names for feed into the analytics database
# The main work of collecting these names from different sources is done by
# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
# which is part of wikistats reporting phase and once each month updates local master csv files from all sources

# The following code to merge and filter these master csv files is based on parts of the code in WikiReportsLocalizations.pm, function Localization
# To do some day: make a completely independent script out of this code and code from WikiReportsLocalizations.pm which covers the whole production cycle

# Sources for language names:
# - php files
# - translatewiki
# - English Wikipedia API (interwikilinks)

# Right now multilingual support for the analytics database is just a nice idea, so to speed up data feeds, just keep English and German translations

  use Getopt::Std ;

  $true  = 1 ;
  $false = 0 ;

  $max_language_name = 50 ; # truncate if longer

  # input master csv files (produced by wikistats) and the output feed file
  $file_csv_language_names_php   = "LanguageNamesViaPhp.csv" ;
  $file_csv_language_names_wp    = "LanguageNamesViaWpEnEdited.csv" ;
  $file_csv_analytics_in         = "analytics_in_language_names.csv" ;

  # target languages whose names keep an upper-cased first letter (see FormatName)
  $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, based on languages supported by wikistats reporting
  # only these target languages are kept in the output feed (see note above)
  $languages_filter        = "de|en" ;
  foreach $language (split '\|', $languages_filter)
  {  $include_target_language {$language} = $true ; }

  &ParseArguments ;
  &ReadCsvFiles ;
  &WriteCsvFile ;


#  if ($language eq "id") # swap which file takes precedence

  print "\nReady\n\n" ;
  exit ;
+
sub ParseArguments
{
  # Parse -i (input folder) / -o (output folder), validate them, and turn the
  # bare csv file names (file-level globals) into full paths.
  # Fix: also 'my (@options, ...)' declared a dead array; use a hash.
  my (%options, $arguments) ;

  getopt ("io", \%options) ;

  foreach $arg (sort keys %options)
  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
  print ("\nArguments\n$arguments\n") ;

  # EZ test defaults: the original assigned these *unconditionally*, which
  # silently discarded any -i / -o given on the command line and made the
  # 'Specify ...' checks below dead code; apply them only as fallbacks.
  if (! defined ($options {"i"}))
  { $options {"i"} = "w:/# out bayes/csv_wp" ; } # EZ test
  if (! defined ($options {"o"}))
  { $options {"o"} = "c:/MySQL/analytics" ; }    # EZ test

  die ("Specify input folder for projectcounts files as: -i path") if (! defined ($options {"i"})) ;
  die ("Specify output folder as: -o path'")                       if (! defined ($options {"o"})) ;

  ($path_in  = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any
  ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash if any

  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;

  print "Input  folder: $path_in\n" ;
  print "Output folder: $path_out\n\n" ;

  $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
  $file_csv_language_names_wp  = "$path_in/$file_csv_language_names_wp" ;
  $file_csv_analytics_in       = "$path_out/$file_csv_analytics_in" ;

  die "Input file '$file_csv_language_names_php' not found"   if (! -e $file_csv_language_names_php) ;
  die "Input file '$file_csv_language_names_wp' not found"    if (! -e $file_csv_language_names_wp) ;
}
+
sub ReadCsvFiles
{
  # Fill %out_languages: {target_language}{code} -> formatted language name.
  # first read definitions from php message files, then overwrite with definitions from interwiki links when available
  # except for target language 'id' (Indonesian) where quality of php file has been deemed more reliable
  # Reads file-level globals: $file_csv_language_names_php / _wp (full paths,
  # set by ParseArguments) and %include_target_language (language filter).

  open CSV_IN, "<", $file_csv_language_names_php ; # NOTE(review): open is unchecked
  while ($line = <CSV_IN>)
  {
    chomp ($line) ;
    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;

    # keep only the configured target languages (de, en)
    next if ! $include_target_language {$target_language} ;

    $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
  }
  close CSV_IN ;

  open CSV_IN, "<", $file_csv_language_names_wp ; # NOTE(review): open is unchecked
  while ($line = <CSV_IN>)
  {
    chomp ($line) ;
    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;

    next if ! $include_target_language {$target_language} ;

    # for Indonesian keep the php-file definition when one already exists
    next if $target_language eq 'id' and $out_languages {$target_language} {$code} ne '' ;

  # $name_unicode_php = $out_languages {$target_language} {$code} ;           # test only
  # $name_unicode_wp  = &FormatName ($target_language, $name_unicode) ;       # test only
  # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) # test only
  # { print "$name_unicode_php => $name_unicode_wp\n" ; }                     # test only

    $out_languages {$target_language} {$code} = &FormatName ($target_language, $name_unicode) ; # forget about html for analytics database
  }
  close CSV_IN ;
}
+
sub FormatName
{
  # Normalize the case of one language name for the given target language:
  # target languages listed in the file-level global $languages_force_case_uc
  # get an upper-cased first letter, all others are forced to lower case.
  # For German the '(e) [-]sprache' suffix is stripped first.
  # Fix: the scratch copy was an undeclared package global ($name_unicode2)
  # used only by commented-out debug code; it is lexical now.
  my ($target_language, $name_unicode) = @_ ;

  my $name_unicode_orig = $name_unicode ; # kept for the debug comparison below

  if ($target_language eq "de")
  { $name_unicode =~ s/e?\s*\-?sprache//i ; }

  if ($target_language =~ /^(?:$languages_force_case_uc)/)
  { $name_unicode = ucfirst $name_unicode ; }
  else
  { $name_unicode = lc $name_unicode ; }

# Test only
# if (($target_language eq 'de') && ($name_unicode ne $name_unicode_orig))
# { print "$name_unicode_orig => $name_unicode\n" ; }

  return ($name_unicode) ;
}
+
sub WriteCsvFile
{
  # Write the merged language names as csv rows "target_language,code,name"
  # to $file_csv_analytics_in. Names longer than $max_language_name are
  # truncated; names containing a comma are double-quoted.
  # Fix: the original 'open ... || die' bound || to the filename (always
  # true), so the die could never fire; use low-precedence 'or' instead.
  open CSV_OUT, ">", $file_csv_analytics_in
    or die ("File '$file_csv_analytics_in' could not be opened") ;
  binmode CSV_OUT ; # force Unix style linebreak \012

  foreach $target_language (sort keys %out_languages)
  {
    print "\nTarget language '$target_language'\n" ;
    %translations = %{$out_languages {$target_language}} ;

    foreach $code (sort keys %translations)
    {
      $language_name  = $translations{$code} ;
      $language_name2 = substr ($language_name,0,$max_language_name) ;

      if ($language_name ne $language_name2)
      { print "Language name truncated to $max_language_name chars: '$language_name' -> '$language_name2'\n" ; }

      # quote names that contain the csv field separator
      if ($language_name2 =~ /,/)
      { $language_name2 = "\"$language_name2\"" ; }
      print CSV_OUT "$target_language,$code,$language_name2\n" ;
    }
  }

  close CSV_OUT ;
}
+

Copied: trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl 
(from rev 105441, trunk/wikistats/analytics/AnalyticsPrepPageViews.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl          
                (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepPageViews.pl  
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,11 @@
+#!/usr/bin/perl
+
+# This file is just a place holder (alias)
+# Page views per day/week/month/etc, per project, per language, normalized and not normalized, are collected by
+# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
+#
+# Added May 2011:
+# For analytics database one file is written for all projects and languages 
combined, 
+# with per month, not normalized and normalized page view counts in one row.
+# This file is written to same output folder as other csv files generated by 
WikiCountsSummarizeProjectCounts.pl
+# and needs to be copied to analytics folder later
\ No newline at end of file

Copied: 
trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl (from 
rev 105441, trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl)
===================================================================
--- trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl   
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/AnalyticsPrepWikiCountsOutput.pl   
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,334 @@
+#!/usr/local/bin/perl
+
+# Copyright (C) 2011 Wikimedia Foundation
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU General Public License for more details, at
+# http://www.fsf.org/licenses/gpl.html
+
+# Author:
+# Erik Zachte, email ezac...@wikimedia.org
+# loosely based on predecessor
+# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
+
+# Functionality:
+# tba
+
+# Parameters:
+# tba
+
+# Output:
+# updated csv file for import in MySQL
+
+# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
+
+  use Getopt::Std ;
+
+  $true  = 1 ;
+  $false = 0 ;
+
+  @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
+
+  $file_csv_monthly_data         = "StatisticsMonthly.csv" ;
+  $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
+  $file_csv_analytics_in         = "analytics_in_wikistats.csv" ;
+
+  &ParseArguments ;
+  &ReadStatisticsMonthly ;
+  &FindLargestWikis ;
+  &WriteMonthlyData ;
+
+  print "\nReady\n\n" ;
+  exit ;
+
+sub ParseArguments
+{
+  print "ParseArguments\n" ;
+  my (@options, $arguments) ;
+
+  getopt ("io", \%options) ;
+
+  foreach $arg (sort keys %options)
+  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
+  print ("\nArguments\n$arguments\n") ;
+
+  if (! -d '/mnt/') # EZ test
+  {
+    $path_in  = "c:/\@ wikimedia/# out bayes" ;
+    $path_out = "c:/MySQL/analytics" ;
+  }
+  else
+  {
+    die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
+    die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
+
+    $path_in  = $options {"i"} ;
+    $path_out = $options {"o"} ;
+  }
+
+  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
+  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+
+  print "Input  folder: $path_in\n" ;
+  print "Output folder: $path_out\n\n" ;
+
+  $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
+}
+
+sub ReadStatisticsMonthly
+{
+  print "ReadStatisticsMonthly\n" ;
+  &ReadStatisticsMonthlyForProject ("wb") ;
+  &ReadStatisticsMonthlyForProject ("wk") ;
+  &ReadStatisticsMonthlyForProject ("wn") ;
+  &ReadStatisticsMonthlyForProject ("wp") ;
+  &ReadStatisticsMonthlyForProject ("wq") ;
+  &ReadStatisticsMonthlyForProject ("ws") ;
+  &ReadStatisticsMonthlyForProject ("wv") ;
+  &ReadStatisticsMonthlyForProject ("wx") ;
+
+# &ReadStatisticsPerBinariesExtensionCommons ;
+}
+
+sub ReadStatisticsMonthlyForProject
+{
+  my $project = shift;
+  $all_projects = "*" ;
+
+  my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
+  my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
+
+  if (! -e $file_csv_in_1)
+  { &Abort ("Input file '$file_csv_in_1' not found") ; }
+  if (! -e $file_csv_in_2)
+  { &Abort ("Input file '$file_csv_in_2' not found") ; }
+
+  my $yyyymm ;
+
+  print "Read '$file_csv_in_1'\n" ;
+  open CSV_IN, '<', $file_csv_in_1 ;
+  while ($line = <CSV_IN>)
+  {
+    chomp $line ;
+    ($language,$date,$counts) = split (',', $line, 3) ;
+    @fields = split (',', $counts) ;
+
+    next if ! &AcceptWiki ($project,$language) ;
+
+    ($month,$day,$year) = split ('\/', $date) ;
+    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+
+    foreach $field (@fields)
+    {
+      if ($field eq '-')
+      { $field = 0 ; }
+    }
+
+    $data = $fields  [0] . ',' . # contributors all time
+            $fields  [1] . ',' . # new contributors
+            'data2,'           . # place holder for more data, to be inserted 
later
+            $fields  [4] . ',' . # articles
+            $fields  [6] . ',' . # articles new per day
+            $fields  [9] . ',' . # larger than 0.5 kB
+            $fields [10] . ',' . # larger than 2.0 kB
+            $fields  [7] . ',' . # mean edits per article
+            $fields  [8] . ',' . # mean bytes per article
+            $fields [11] . ',' . # edits
+            $fields [12] . ',' . # size in bytes
+            $fields [13] . ',' . # size in words
+            $fields [14] . ',' . # links internal
+            $fields [15] . ',' . # links interwiki
+            $fields [16] . ',' . # links images
+            $fields [17] . ',' . # links external
+            $fields [18] ;       # redirects
+
+    $data1 {"$project,$language,$yyyymm"} = $data ;
+  }
+  close CSV_IN ;
+
+  # now read (very) active editors from newer more accurate file (split data 
for reg users and bots, unlike StatisticsMonthly.csv)
+
+  print "Read '$file_csv_in_2'\n" ;
+  open CSV_IN, '<', $file_csv_in_2 ;
+  while ($line = <CSV_IN>)
+  {
+    chomp $line ;
+    ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
+
+    next if ! &AcceptWiki ($project,$language) ;
+
+    if ($reguser_bot ne "R") { next ; }  # R: reg user, B: bot
+    if ($group       ne "A") { next ; }  # A: articles, T: talk pages, O: 
other namespaces
+
+    ($month,$day,$year) = split ('\/', $date) ;
+    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+    $months {$yyyymm} ++ ;
+#    print "YYYYMM $yyyymm\n" ;
+
+    # data have been collected in WikiCountsProcess.pm and been written in 
WikiCountsOutput.pm
+    # count user with over x edits
+    # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 
1000xSQRT(10), etc
+    # @thresholds = 
(1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000)
 ;
+    $edits_ge_5   = @counts [2] > 0 ? @counts [2] : 0 ;
+    $edits_ge_25  = @counts [4] > 0 ? @counts [4] : 0 ;
+    $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
+    $data2 {"$project,$language,$yyyymm"} = 
"$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
+
+    $total_edits_ge_5   {"$project,$language"} += $edits_ge_5 ;
+    $total_edits_ge_25  {"$project,$language"} += $edits_ge_25 ;
+    $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
+
+    # prep string with right amount of comma's
+    if ($data2_default eq '')
+    {
+      $data2_default = $data2 {"$project,$language,$yyyymm"} ;
+      $data2_default =~ s/[^,]+/0/g ;
+    }
+  }
+  close CSV_IN ;
+}
+
+#sub ReadStatisticsPerBinariesExtensionCommons
+#{
+#  my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
+#  my $mmax = -1 ;
+
+#  if (! -e $file_csv_in)
+#  { &Abort ("Input file '$file_csv_in' not found") ; }
+
+#  print "Read '$file_csv_in'\n" ;
+#  open CSV_IN, '<', $file_csv_in ;
+#  while ($line = <CSV_IN>)
+#  {
+#    chomp $line ;
+#    ($language,$date,$counts) = split (',', $line, 3) ;
+
+#    if ($language ne "commons") { next ; }
+
+#    if ($date eq "00/0000")
+#    {
+#      @fields = split (',', $counts) ;
+#      $field_ndx = 0 ;
+#      foreach $field (@fields)
+#      {
+#        $ext_cnt {-1}{$field_ndx} = $field ;
+#      # print "EXT_CNT $field_ndx : $field\n" ;
+#        $field_ndx ++ ;
+#      }
+#      next ;
+#    }
+
+#    ($month,$year) = split ('\/', $date) ;
+#    my $m = &months_since_2000_01 ($year,$month) ;
+#    next if $m < $m_start ;
+
+#    if ($m > $mmax)
+#    { $mmax = $m ; }
+
+#    @fields = split (',', $counts) ;
+#    $field_ndx = 0 ;
+#    foreach $field (@fields)
+#    {
+#      $ext_cnt {$m}{$field_ndx}  = $field ;
+#      $ext_tot {$m}             += $field ;
+#      $field_ndx ++ ;
+#    }
+#  }
+#  close CSV_IN ;
+
+#  %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
+#  @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys 
%ext_cnt_mmax) ;
+
+#  $extcnt = 0 ;
+#  foreach $extndx (@ext_cnt_mmax)
+#  {
+#    # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax 
{$extndx}}\n" ;
+#    push @extndxs, $extndx ;
+#    if ($extcnt++ >= 9) { last ; }
+#  }
+#}
+
+sub FindLargestWikis
+{
+  print "FindLargestWikis\n" ;
+  print "Largest projects (most accumulated very active editors):\n";
+  @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 
{$a}} keys %total_edits_ge_100 ;
+  $rank = 0 ;
+  foreach $project_language (@total_edits_ge_100)
+  {
+    $largest_projects {$project_language} = $rank++ ;
+    print "$project_language," ;
+    last if $rank > 10 ;
+  }
+  print "\n\n" ;
+
+  foreach $yyyymm (sort keys %months)
+  {
+    next if $yyyymm lt '2011' ;
+    foreach $project_language (keys %largest_projects)
+    {
+      ($project,$language) = split (',', $project_language) ;
+          if ($data2 {"$project,$language,$yyyymm"} eq '')
+      {
+        print "No data yet for large wiki $project_language for $yyyymm-> skip 
month $yyyymm\n" ;
+        $months {$yyyymm} = 0 ;
+      }
+    }
+  }
+}
+
+sub WriteMonthlyData
+{
+  print "WriteMonthlyData\n" ;
+  my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
+  open CSV_OUT, '>', $file_csv_out ;
+  foreach $project_wiki_month (sort keys %data1)
+  {
+    ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;
+
+    # recent month misses one or more large wikis?
+    next if $months {$yyyymm} == 0 ;
+
+    $data1 = $data1 {$project_wiki_month} ;
+    $data2 = $data2 {$project_wiki_month} ;
+    if ($data2 eq '')
+    {
+      print "Editor data missing for $project_wiki_month\n" ;
+      $data2 = $data2_default ;
+    }
+    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor 
fields close together
+    print CSV_OUT "$project_wiki_month,$data1\n" ;
+  }
+  $total_edits_ge_5   {"$project,*,$yyyymm"} += $edits_ge_5 ;
+  $total_edits_ge_25  {"$project,*,$yyyymm"} += $edits_ge_25 ;
+  $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
+  close CSV_OUT ;
+}
+
+sub AcceptWiki
+{
+  my ($project,$language) = @_ ;
+
+  return $false if $language eq 'commons' and $project ne 'wx' ; # commons 
also in wikipedia csv files (bug, hard to cleanup, just skip)
+  return $false if $language eq 'sr'      and $project eq 'wn' ; # ignore 
insane bot spam on
+  return $false if $language =~ 
/mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i
 ;
+
+  return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) 
but site not, tlh=Klingon, ru-sib=Siberian
+  return $false if $project eq 'wk' and ($language eq "als" or $language eq 
"tlh") ;
+
+  return $true ;
+}
+
+sub Abort
+{
+  my $msg = shift ;
+  print "$msg\nExecution aborted." ;
+  # to do: log also to file
+  exit ;
+}
+
+

Copied: trunk/extensions/MetricsReporting/import/_readme.txt (from rev 105441, 
trunk/wikistats/analytics/_readme.txt)
===================================================================
--- trunk/extensions/MetricsReporting/import/_readme.txt                        
        (rev 0)
+++ trunk/extensions/MetricsReporting/import/_readme.txt        2011-12-07 
17:36:45 UTC (rev 105444)
@@ -0,0 +1,21 @@
+This folder contains bash and perl files to create and fill database 
'analytics', a.o. for the monthly dashboard.
+
+== analytics_new.sh ==
+Defines the database and tables and loads data from existing csv files.
+It executes SQL from analytics_create_and_load_from_csv.txt
+
+== analytics_upd.sh ==
+Prepares new csv files (delegated to analytics_generate_csv_files.sh),
+and empties/reloads all tables for which csv files are in this folder.   
+It executes SQL from analytics_refresh_from_csv.txt
+
+== CSV files ==
+CSV files and where they are generated:
+
+analytics_in_binaries.csv          <- AnalyticsPrepBinariesData.pl
+analytics_in_comscore.csv          <- AnalyticsPrepComscoreData.pl             
   
+analytics_in_comscore_regions.csv  <- manual
+analytics_in_language_names.csv   
+analytics_in_offline.csv           <- manual
+analytics_in_page_views.csv        <- 
/home/ezachte/wikistats/pageviews_monthly.sh (file copy after that)
+analytics_in_wikistats.csv         <- AnalyticsPrepWikiCountsOutput.pl

Copied: 
trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt 
(from rev 105441, 
trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt)
===================================================================
--- 
trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt 
                            (rev 0)
+++ 
trunk/extensions/MetricsReporting/import/analytics_create_and_load_from_csv.txt 
    2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,181 @@
+-- make sure to delete output files *test*.csv first if any exist (MySQL on 
purpose forbids overwrite)
+
+-- tables implemented:
+--   comscore
+--   comscore_regions
+--   wikistats 
+--   page_views     
+--   language names 
+--   binaries    
+
+-- more tables planned (O= optional, not needed for report card stage)
+--   project_names  
+-- O edits          per project_code, per language, per month, per 
normalization type (Y/N), editor type (manual, anonymous, bot), namespace group 
(articles, talk pages, other)  
+-- O editors        per project_code, per language, per month, per 
normalization type (Y/N), editor type (manual, anonymous, bot), namespace group 
(articles, talk pages, other)  
+
+
+-- open issues: 
+-- only store basic data in database and calculate all aggregates on the fly 
or do some aggregation before hand ? (e.g. count for mobile / non-mobile / ==> 
total of both ? <==)
+-- for binaries, store one extension type per row? (future proof, more work to 
query), or a selected few as columns? (one row only needed per month)
+
+-- Create database and two tables from scratch 
+DROP DATABASE IF EXISTS `analytics` ;
+
+CREATE DATABASE `analytics` ;
+
+USE `analytics` ;
+
+CREATE TABLE `comscore` (
+  `date`           date NOT NULL, 
+  `country_code`   varchar (3), 
+  `region_code`    varchar (3), 
+  `web_property`   varchar (20), 
+  `project_code`   varchar (10), 
+  `reach`          decimal (4,1) DEFAULT NULL,
+  `visitors`       decimal (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,country_code,region_code,project_code,web_property),
+  KEY (`country_code`)  
+) ;
+
+CREATE TABLE `comscore_regions` (
+  `region_code`      varchar (2), 
+  `report_language`  varchar (10),
+  `region_name`      varchar (18),
+  PRIMARY KEY (region_code,report_language)
+) ;  
+
+CREATE TABLE `wikistats` (
+  `date`                     date NOT NULL, 
+  `project_code`             varchar (10),
+  `language_code`            varchar (15),
+  `editors_all_time`         int (10)  DEFAULT NULL, 
+  `editors_new`              int (7)   DEFAULT NULL, 
+  `editors_ge_5`             int (7)   DEFAULT NULL, 
+  `editors_ge_25`            int (7)   DEFAULT NULL, 
+  `editors_ge_100`           int (7)   DEFAULT NULL, 
+  `articles`                 int (12)  DEFAULT NULL, 
+  `articles_new_per_day`     int (9)   DEFAULT NULL, 
+  `articles_over_bytes_500`  int (12)  DEFAULT NULL, 
+  `articles_over_bytes_2000` int (12)  DEFAULT NULL, 
+  `edits_per_article`        decimal (9,1) DEFAULT NULL, 
+  `bytes_per_article`        decimal (9,1) DEFAULT NULL, 
+  `edits`                    int (12)  DEFAULT NULL, 
+  `size_in_bytes`            int (15)  DEFAULT NULL, 
+  `size_in_words`            int (15)  DEFAULT NULL, 
+  `links_internal`           int (15)  DEFAULT NULL, 
+  `links_interwiki`          int (15)  DEFAULT NULL, 
+  `links_image`              int (15)  DEFAULT NULL, 
+  `links_external`           int (15)  DEFAULT NULL, 
+  `redirects`                int (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,project_code,language_code)
+) ;
+
+CREATE TABLE `page_views` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `views_non_mobile_raw`        bigint (15), 
+  `views_mobile_raw`            bigint (15), 
+  `views_non_mobile_normalized` bigint (15), 
+  `views_mobile_normalized`     bigint (15), 
+  `views_raw`                   bigint (15), 
+  `views_normalized`            bigint (15), 
+  PRIMARY KEY (date,project_code,language_code),
+  KEY (date,language_code,project_code)
+) ;
+
+CREATE TABLE `language_names` (
+  `report_language`             varchar (15),   
+  `language_code`               varchar (15),
+  `language_name`               varchar (50),
+  PRIMARY KEY (report_language,language_code)
+) ;
+
+CREATE TABLE `binaries` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `extension`                   varchar (10),
+  `binaries`                    bigint (15), 
+  PRIMARY KEY (date,project_code,language_code,extension)
+) ;
+
+CREATE TABLE `offline` (
+  `date`                        date NOT NULL,
+  `source`                      varchar (255),
+  `readers`                     bigint (12),
+  PRIMARY KEY (date,source)
+) ;
+
+-- SHOW TABLES ;
+-- DESCRIBE comscore ;
+-- DESCRIBE comscore_regions ;
+-- DESCRIBE wikistats ;
+-- DESCRIBE page_views ;
+-- DESCRIBE language_names ;
+-- DESCRIBE binaries ;
+-- DESCRIBE offline ;
+
+-- Database Manipulation
+-- Obviously in real world this is a separate script
+
+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
+     INTO TABLE comscore 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,country_code,region_code,web_property,project_code,reach,visitors)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
+     INTO TABLE comscore_regions 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,region_code,region_name) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
+     INTO TABLE wikistats 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     
(project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
+     INTO TABLE page_views 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     
(project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+
+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
+     INTO TABLE language_names 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,language_code,language_name) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
+     INTO TABLE binaries 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,extension,binaries) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' 
+     INTO TABLE offline 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,readers) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+  
+  
+-- show contents (debugging only) 
+-- SELECT * FROM comscore ;
+-- SELECT * FROM comscore_regions ;
+-- SELECT * FROM wikistats ;
+-- SELECT * FROM page_views ;
+-- SELECT * FROM language_names ;
+-- SELECT * FROM binaries 
+-- WHERE project_code = 'commons' ;
+   SELECT * FROM offline ; 
+
+

Copied: 
trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh (from 
rev 105441, trunk/wikistats/analytics/analytics_generate_csv_files.sh)
===================================================================
--- trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh    
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/analytics_generate_csv_files.sh    
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Prepare several csv files, ready for importing into analytics database
+# All generated files have _in_ in name signalling these contain data ready 
for importing into database
+# One input record corresponds to one database record
+
+ulimit -v 8000000
+
+clear
+cd /a/analytics
+
+
+# AnalyticsPrepBinariesData.pl reads counts for binaries which were generated 
by wikistats 
+# and which reside in /a/wikistats/csv_[project 
code]/StatisticsPerBinariesExtension.csv
+# It filters and reorganizes data and produces analytics_in_binaries.csv
+# Output csv contains: project code, language, month, extension name, count
+
+perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/
+
+# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore 
csv files (with data for last 14 months) 
+# parses those csv files, adds/replaces data from these csv files into master 
files (containing full history)
+# and generates input csv file analytics_in_comscore.csv ready for importing 
into database
+#
+# note : these csv files were manually downloaded from 
http://mymetrix.comscore.com/app/report.aspx 
+# and given more descriptive names, script finds newest files based on partial 
name search 
+#
+# -r replace (default is add only)
+# -i input folder, contains manually downloaded csv files from comScore (or 
xls files manually converted to csv) 
+# -m master files with full history
+# -o output csv file, with reach per region, UV's per region and UV's per top 
web property, ready for import into database
+
+perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics 
-o /a/analytics
+
+# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv 
files from wikistats process
+# It filters and reorganizes data and produces analytics_in_wikistats.csv, 
ready for import into analytics database 
+
+perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics 
+
+# analytics_in_page_views.csv is written daily as part of 
WikiCountsSummarizeProjectCounts.pl 
+# part of (/home/ezachte/pageviews_monthly.sh job) 
+# which processes hourly projectcounts files (per wiki page view totals for 
one hour) from http://dammit.lt/wikistats
+# and generates several files on different aggregation levels
+# only action here is to copy data to this folder to have everything in one 
place
+# note: unlike folder name suggests this file contains stats for all projects
+
+cp /a/wikistats/csv_wp/analytics_in_page_views.csv .
+

Copied: trunk/extensions/MetricsReporting/import/analytics_new.sh (from rev 
105441, trunk/wikistats/analytics/analytics_new.sh)
===================================================================
--- trunk/extensions/MetricsReporting/import/analytics_new.sh                   
        (rev 0)
+++ trunk/extensions/MetricsReporting/import/analytics_new.sh   2011-12-07 
17:36:45 UTC (rev 105444)
@@ -0,0 +1,5 @@
+clear
+cd /a/analytics
+# rm *test*.csv
+# mysql  --user=root --password=changerootwhenpriyankareturns < 
analytics_create_and_load_from_csv.txt > mysql_log.txt
+  mysql  -u analytics -h project2.wikimedia.org -preport < 
analytics_create_and_load_from_csv.txt > mysql_log.txt

Copied: trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt 
(from rev 105441, trunk/wikistats/analytics/analytics_refresh_from_csv.txt)
===================================================================
--- trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt     
                        (rev 0)
+++ trunk/extensions/MetricsReporting/import/analytics_refresh_from_csv.txt     
2011-12-07 17:36:45 UTC (rev 105444)
@@ -0,0 +1,55 @@
+USE `analytics` ;
+
+TRUNCATE TABLE comscore ;
+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
+     INTO TABLE comscore 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,country_code,region_code,web_property,project_code,reach,visitors)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE comscore_regions   ;
+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
+     INTO TABLE comscore_regions 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,region_code,region_name) ;
+
+TRUNCATE TABLE wikistats ; 
+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
+     INTO TABLE wikistats 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     
(project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE page_views ;
+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
+     INTO TABLE page_views 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     
(project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE language_names ;   
+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
+     INTO TABLE language_names 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,language_code,language_name) ;
+
+TRUNCATE TABLE binaries ;
+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
+     INTO TABLE binaries 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,extension,binaries) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE offline ;  
+LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' 
+     INTO TABLE offline 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date, readers) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;

Copied: trunk/extensions/MetricsReporting/import/analytics_upd.sh (from rev 
105441, trunk/wikistats/analytics/analytics_upd.sh)
===================================================================
--- trunk/extensions/MetricsReporting/import/analytics_upd.sh                   
        (rev 0)
+++ trunk/extensions/MetricsReporting/import/analytics_upd.sh   2011-12-07 
17:36:45 UTC (rev 105444)
@@ -0,0 +1,6 @@
+clear
+cd /a/analytics
+
+./analytics_generate_csv_files.sh
+
+mysql  -u analytics -h project2.wikimedia.org -preport < 
analytics_refresh_from_csv.txt


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to