http://www.mediawiki.org/wiki/Special:Code/MediaWiki/91509

Revision: 91509
Author:   ezachte
Date:     2011-07-06 00:37:49 +0000 (Wed, 06 Jul 2011)
Log Message:
-----------
new MySQL tables and fields, updated perl files, more bash scripts

Modified Paths:
--------------
    trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl
    trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
    trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl

Added Paths:
-----------
    trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt
    trunk/wikistats/analytics/analytics_generate_csv_files.sh
    trunk/wikistats/analytics/analytics_new.sh
    trunk/wikistats/analytics/analytics_refresh_from_csv.txt
    trunk/wikistats/analytics/analytics_upd.sh

Modified: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl      2011-07-06 00:24:59 UTC (rev 91508)
+++ trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl      2011-07-06 00:37:49 UTC (rev 91509)
@@ -17,21 +17,21 @@
 
 sub ParseArguments
 {
-#  my @options ;
-#  getopt ("io", \%options) ;
+  my @options ;
+  getopt ("io", \%options) ;
 
-#  die ("Specify input folder as: -i path")   if (! defined ($options {"i"})) ;
-#  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
+  die ("Specify input folder as: -i path")   if (! defined ($options {"i"})) ;
+  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
 
-#  $path_in  = $options {"i"} ;
-#  $path_out = $options {"o"} ;
+  $path_in  = $options {"i"} ;
+  $path_out = $options {"o"} ;
 
-#  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
-#  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
+  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
+  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 
   # tests only
-  $path_in  = "C:/@ Wikimedia/# Out Bayes" ;
-  $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
+  # $path_in  = "C:/@ Wikimedia/# Out Bayes" ;
+  # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
 
   print "Input  folder: $path_in\n" ;
   print "Output folder: $path_out\n" ;

Modified: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-07-06 00:24:59 UTC (rev 91508)
+++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-07-06 00:37:49 UTC (rev 91509)
@@ -33,13 +33,13 @@
   use Cwd;
 
   my $options ;
-  getopt ("mu", \%options) ;
+  getopt ("imo", \%options) ;
 
   $true  = 1 ;
   $false = 0 ;
 
   $script_name    = "AnalyticsPrepComscoreData.pl" ;
-  $script_version = "0.32" ;
+  $script_version = "0.31" ;
 
 # EZ test only
 # $source       = "comscore" ;
@@ -47,21 +47,34 @@
 # $generated    = "2011-05-06 00:00:00" ;
 # $user         = "ezachte" ;
 
-  $dir_analytics        = $options {"m"} ;
-  $dir_comscore_updates = $options {"u"} ;
+  $dir_in   = $options {"i"} ;
+  $dir_upd  = $options {"m"} ;
+  $dir_out  = $options {"o"} ;
+  $mode = 'add';
+  if (defined $options {'r'})
+  { $mode = 'replace'; }
 
-  $dir_analytics        = "C:/@ Wikimedia/! MySQL/analytics" ;  # EZ test only
-  $dir_comscore_updates = "C:/@ Wikimedia/@ Report Card/Data" ; # EZ test only
+  print "Mode is $mode (specify '-r' for replace)\n\n";
 
-  if (($dir_analytics eq '') || ($dir_comscore_updates eq ''))
-  { Abort ("Specify folder for 'master' csv files as '-m folder', folder for 
'update' csv files as -u folder'") ; }
+  if (! -d "/home/") # EZ test machine
+  {
+    $dir_in  = "C:/@ Wikimedia/@ Report Card/Data" ;
+    $dir_upd = "C:/MySQL/analytics" ;
+    $dir_out = "C:/MySQL/analytics" ;
+    $mode = 'replace' ;
+  }
 
-  $file_comscore_reach_master       = "history_comscore_reach_regions.csv" ;
-  $file_comscore_reach_update       = "*reach*by*region*csv" ;
+  if ($dir_in eq '')
+  { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
+  if ($dir_upd eq '')
+  { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
+  if ($dir_out eq '')
+  { Abort ("Specify folder for output file '-o folder'") ; }
 
-  $file_comscore_uv_region_master   = "history_comscore_UV_regions.csv" ;
-  $file_comscore_uv_region_update   = "*UVs*by*region*csv" ;
-
+  $file_comscore_reach_master     = "history_comscore_reach_regions.csv" ;
+  $file_comscore_reach_update     = "*reach*by*region*csv" ;
+  $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
+  $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
   $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
   $file_comscore_uv_property_update = "*UV*trend*csv" ;
 
@@ -69,7 +82,10 @@
   $layout_csv_regions    = 2 ;
   $layout_csv_properties = 3 ;
 
-  print "Directories:\nAnalytics '$dir_analytics'\nUpdates 
'$dir_comscore_updates'\n\n" ;
+  print "Directories:\n" .
+        "Input (new comScore data): '$dir_in'\n".
+        "Master files (full history): '$dir_upd'\n" .
+        "Output (database feed): '$dir_out'\n\n" ;
 
   %region_codes = (
     "Europe"=>"EU",
@@ -88,13 +104,13 @@
 
   @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
 
-  &ReadDataReachPerRegion ($file_comscore_reach_master, 
$file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
+  &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, 
$file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
   %reach_region_code = %data ;
 
-  &ReadDataVisitorsPerRegion ($file_comscore_uv_region_master, 
$file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
+  &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, 
$file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
   %visitors_region_code = %data ;
 
-  &ReadDataVisitorsPerProperty ($file_comscore_uv_property_master, 
$file_comscore_uv_property_update, "%.0f", 1000, $layout_csv_properties) ;
+  &ReadMasterComscoreDataVisitorsPerProperty 
($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 
1000, $layout_csv_properties) ;
   %visitors_web_property = %data ;
 
   &WriteDataAnalytics ;
@@ -102,153 +118,32 @@
   print "\nReady\n\n" ;
   exit ;
 
-sub UpdateFromLatestComscoreData
+sub ReadMasterComscoreDataReachPerRegion
 {
-  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @update_only) = @_ ;
+  my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ;
 
-  undef %update_only ;
-  undef %do_not_update ;
+  print "ReadMasterComscoreDataReachPerRegion\n\n" ;
 
-  foreach $id (@update_only)
-  { $update_only {$id} = $true ; }
-
-  if (! -e "$dir_analytics/$file_comscore_master")
-  { Abort ("File $file_comscore_master not found!") ; }
-
-  $age_master = -M "$dir_analytics/$file_comscore_master" ;
-  print "\nLatest comscore master file is " . sprintf ("%.0f", $age_master) . 
" days old: '$file_comscore_master'\n" ;
-
-  my $cwd = getcwd ;
-  chdir $dir_comscore_updates ;
-
-  @files = glob($file_comscore_updates) ;
-  $age_update = 999999 ;
-  $file_comscore_updates_latest = '' ;
-  foreach $file (@files)
-  {
-    $age = -M $file ;
-    if ($age < $age_update)
-    {
-      $age_update = $age ;
-      $file_comscore_updates_latest = $file ;
-    }
-  }
-  print "\nLatest comscore update file is " . sprintf ("%.0f", $age_update) . 
" days old: '$file_comscore_updates_latest'\n" ;
-
-  if ($age_update == 999999)
-  {
-    print "No valid update file found. Nothing to update." ;
-    return ;
-  }
-
-  if ($age_master < $age_update)
-  {
-    print "File with master data more recent than latest update csv from 
comScore. Nothing to update." ;
-    return ;
-  }
-
-  my $updates_found = $false ;
-
-  print "\nRead updates\n\n" ;
-  open CSV, '<', $file_comscore_updates_latest ;
-  while ($line = <CSV>)
-  {
-    chomp $line ;
-    $line = &GetNumberOnly ($line) ;
-
-    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
-    {
-      if ($layout_csv == $layout_csv_properties)
-      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
-      else
-      { ($dummy1,$dummy2,@months) = split (',', $line) ; }         # uv / reach csv files
-
-      @months = &mmm_yyyy2yyyy_mm (@months) ;
-    }
-
-    if ($line =~ /^\d+,/)
-    {
-      if ($layout_csv == $layout_csv_properties)
-      {
-        ($index,$dummy,$property,@data) = split (',', $line) ;
-        $property =~ s/^\s+// ;
-        $property =~ s/\s+$// ;
-
-        $property =~ s/.*Google.*/Google/i ;
-        $property =~ s/.*Microsoft.*/Microsoft/i ;
-        $property =~ s/.*FACEBOOK.*/Facebook/i ;
-        $property =~ s/.*Yahoo.*/Yahoo/i ;
-        $property =~ s/.*Amazon.*/Amazon/i ;
-        $property =~ s/.*Apple.*/Apple/i ;
-        $property =~ s/.*AOL.*/AOL/i ;
-        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
-        $property =~ s/.*Tencent.*/Tencent/i ;
-        $property =~ s/.*Baidu.*/Baidu/i ;
-        $property =~ s/.*CBS.*/CBS/i ;
-
-        $id = $property ;
-      }
-      else
-      {
-        ($index,$region,@data) = split (',', $line) ;
-        $region =~ s/^\s+// ;
-        $region =~ s/\s+$// ;
-        $id = $region_codes {$region} ;
-      }
-
-      if ($update_only {$id} == 0)
-      {
-        $do_not_update {$id}++ ;
-        next ;
-      }
-
-      for ($m = 0 ; $m <= $#months ; $m++)
-      {
-        $yyyymm = $months [$m] ;
-        $months {$yyyymm} ++ ;
-        $yyyymm_id = "$yyyymm,$id" ;
-        $data = $data [$m] * $multiplier ;
-
-        if (! defined $data {$yyyymm_id})
-        {
-          $updates_found = $true ;
-          print "New data found: $yyyymm_id = $data\n" ;
-          $data {$yyyymm_id} = $data ;
-        }
-      }
-    }
-  }
-
-  $ignored = join ', ', sort keys %do_not_update ;
-  print "\nEntities ignored:\n$ignored\n\n" ;
-
-  if (! $updates_found)
-  { print "No new updates found\n" ; }
-  else
-  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; 
}
-
-  return ($updates_found) ;
-}
-
-sub ReadDataReachPerRegion
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) = @_ ;
-
   undef %months ;
   undef %data ;
   undef @regions ;
 
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
+  open IN,  '<', "$dir_upd/$file_comscore_master" ;
 
   $lines = 0 ;
   while ($line = <IN>)
   {
     chomp $line ;
+    $line =~ s/\r//g ;
 
     ($yyyymm,@data) = split (',', $line) ;
 
     if ($lines++ == 0)
-    { @regions = @data ; next ; }
+    {
+      @regions = @data ;
+      print "Regions found: " . (join ',', @regions) . "\n";
+      next ;
+    }
 
     $field_ndx = 0 ;
     foreach (@data)
@@ -267,11 +162,11 @@
   }
   close IN ;
 
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1, $layout_csv, @regions) ;
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ;
   return if ! $updates_found ;
 
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
 
   $line_out = "yyyymm" ;
   foreach $region_name (@regions)
@@ -292,27 +187,36 @@
   close OUT ;
 }
 
-sub ReadDataVisitorsPerRegion
+sub ReadMasterComscoreDataVisitorsPerRegion
 {
   my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
 
+  print "ReadMasterComscoreDataVisitorsPerRegion\n\n";
+
   undef %months ;
   undef %data ;
   undef @regions ;
 
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
+  open IN,  '<', "$dir_upd/$file_comscore_master" ;
 
   $lines  = 0 ;
   $metric = 'unique_visitors' ;
   while ($line = <IN>)
   {
     chomp $line ;
+    $line =~ s/\r//g ;
     $line = &GetNumberOnly ($line) ;
 
+    next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;
+
     ($yyyymm,@data) = split (',', $line) ;
 
     if ($lines++ == 0)
-    { @regions = @data ; next ; }
+    {
+      @regions = @data ;
+      print "Regions found: " . (join ',', @regions) . "\n";
+      next ;
+    }
 
     $field_ndx = 0 ;
     foreach (@data)
@@ -334,11 +238,11 @@
   }
   close IN ;
 
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1000, $layout_csv, @regions) ;
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ;
   return if ! $updates_found ;
 
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
 
   $line_out = "yyyymm" ;
   foreach $region_name (@regions)
@@ -359,21 +263,24 @@
   close OUT ;
 }
 
-sub ReadDataVisitorsPerProperty
+sub ReadMasterComscoreDataVisitorsPerProperty
 {
   my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, $layout_csv) = @_ ;
 
+  print "ReadMasterComscoreDataVisitorsPerProperty\n\n";
+
   undef %months ;
   undef %data ;
   undef @properties ;
 
-  open IN,  '<', "$dir_analytics/$file_comscore_master" ;
+  open IN,  '<', "$dir_upd/$file_comscore_master" ;
 
   $lines = 0 ;
   $metric       = 'unique_visitors' ;
   while ($line = <IN>)
   {
     chomp $line ;
+    $line =~ s/\r//g ;
 
     ($yyyymm,@data) = split (',', $line) ;
     if ($lines++ == 0)
@@ -398,11 +305,11 @@
   }
   close IN ;
 
-  my $updates_found = &UpdateFromLatestComscoreData ($file_comscore_master, 
$file_comscore_updates, 1000, $layout_csv, @properties) ;
+  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) 
;
   return if ! $updates_found ;
 
-  rename "$dir_analytics/$file_comscore_master", 
"$dir_analytics/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_analytics/$file_comscore_master" ;
+  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
+  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
 
   $line_out = "yyyymm" ;
   foreach $property (@properties)
@@ -423,10 +330,163 @@
   close OUT ;
 }
 
+sub UpdateMasterFileFromRecentComscoreData
+{
+  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, @white_list) = @_ ;
+
+  print "UpdateMasterFileFromRecentComscoreData\n\n";
+
+  undef %white_list ;
+  undef %not_white_listed ;
+
+  print "White list: ". (join (',', @white_list)) . "\n\n";
+
+  foreach $id (@white_list)
+  { $white_list {$id} = $true ; }
+
+  if (! -e "$dir_upd/$file_comscore_master")
+  { Abort ("File $file_comscore_master not found!") ; }
+
+  $age_all = -M "$dir_upd/$file_comscore_master" ;
+  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " 
days old: '$file_comscore_master'\n" ;
+
+  my $cwd = getcwd ;
+  chdir $dir_in ;
+
+  @files = glob($file_comscore_updates) ;
+  $min_age_upd = 999999 ;
+  $file_comscore_updates_latest = '' ;
+  foreach $file (@files)
+  {
+    $age = -M $file ;
+    if ($age < $min_age_upd)
+    {
+      $min_age_upd = $age ;
+      $file_comscore_updates_latest = $file ;
+    }
+  }
+  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " 
days old: '$file_comscore_updates_latest'\n" ;
+
+  if ($min_age_upd == 999999)
+  {
+    print "No valid update file found. Nothing to update." ;
+    return ;
+  }
+
+  #if ($age_all > $min_age_upd)
+  #{
+  #  print "File with master data more recent than latest update csv from 
comScore. Nothing to update." ;
+  #  return ;
+  #}
+
+  my $updates_found = $false ;
+
+  open CSV, '<', $file_comscore_updates_latest ;
+  binmode CSV ;
+  while ($line = <CSV>)
+  {
+    chomp $line ;
+    $line =~ s/\r//g ;
+    $line = &GetNumberOnly ($line) ;
+
+    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
+    {
+      if ($layout_csv == $layout_csv_properties)
+      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web properties csv file
+      else
+      { ($dummy1,$dummy2,@months) = split (',', $line) ; }         # uv / reach csv files
+
+      @months = &mmm_yyyy2yyyy_mm (@months) ;
+    }
+
+    if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
+    {
+      if ($layout_csv == $layout_csv_properties)
+      {
+        ($index,$dummy,$property,@data) = split (',', $line) ;
+        $property =~ s/^\s+// ;
+        $property =~ s/\s+$// ;
+
+        $property =~ s/.*Total Internet.*/Total Internet/i ;
+        $property =~ s/.*Google.*/Google/i ;
+        $property =~ s/.*Microsoft.*/Microsoft/i ;
+        $property =~ s/.*FACEBOOK.*/Facebook/i ;
+        $property =~ s/.*Yahoo.*/Yahoo/i ;
+        $property =~ s/.*Amazon.*/Amazon/i ;
+        $property =~ s/.*Apple.*/Apple/i ;
+        $property =~ s/.*AOL.*/AOL/i ;
+        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
+        $property =~ s/.*Tencent.*/Tencent/i ;
+        $property =~ s/.*Baidu.*/Baidu/i ;
+        $property =~ s/.*CBS.*/CBS/i ;
+
+        if (! $white_list {$property})
+        {
+          $not_white_listed {$property}++ ;
+          next ;
+        }
+
+        $id = $property ;
+      }
+      else
+      {
+        ($index,$region,@data) = split (',', $line) ;
+        $region =~ s/^\s+// ;
+        $region =~ s/\s+$// ;
+
+        if (! $white_list {$region})
+        {
+          $not_white_listed {$region}++ ;
+          next ;
+        }
+
+        $id = $region_codes {$region} ;
+      }
+
+      for ($m = 0 ; $m <= $#months ; $m++)
+      {
+        $yyyymm = $months [$m] ;
+        $months {$yyyymm} ++ ;
+        $yyyymm_id = "$yyyymm,$id" ;
+        $data = $data [$m] * $multiplier ;
+
+        if ($mode eq 'add')
+        {
+          if (! defined $data {$yyyymm_id})
+          {
+            $updates_found = $true ;
+            print "New data found: $yyyymm_id = $data\n" ;
+            $data {$yyyymm_id} = $data ;
+          }
+        }
+        else
+        {
+          $updates_found = $true ;
+          print "Data found: $yyyymm_id = $data\n" ;
+          $data {$yyyymm_id} = $data ;
+        }
+      }
+    }
+  }
+
+  $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
+  if ($entities_not_white_listed ne '')
+  { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }
+
+  if (! $updates_found)
+  { print "No new updates found\n" ; }
+  else
+  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; 
}
+
+  return ($updates_found) ;
+}
+
 sub WriteDataAnalytics
 {
-  open OUT, '>', "$dir_analytics/analytics_in_comscore.csv" ;
+  print "WriteDataAnalytics\n\n";
 
+  open OUT, '>', "$dir_out/analytics_in_comscore.csv" ;
+
   $metric = 'unique_visitors' ;
   foreach $yyyymm (sort keys %months)
   {
@@ -447,7 +507,7 @@
 
      $line = "$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
       print OUT $line ;
-      # print     $line ;
+      print     $line ;
     }
 
     foreach $property (sort @properties)
@@ -478,14 +538,18 @@
 sub mmm_yyyy2yyyy_mm
 {
   my @months = @_ ;
+  my ($m) ;
   # Jan -> 01, etc
-  foreach my $month (@months)
+  foreach $month (@months)
   {
     my ($mmm,$yyyy) = split ('-', $month) ;
     for ($m = 0 ; $m <= $#months_short ; $m++)
     {
       if ($mmm eq $months_short [$m])
-      { $month = "$yyyy-" . sprintf ("%02d", $m+1) ; }
+      {
+        $month = "$yyyy-" . sprintf ("%02d", $m+1) ;
+        last ;
+      }
     }
   }
   return @months ;

Modified: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl  2011-07-06 00:24:59 UTC (rev 91508)
+++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl  2011-07-06 00:37:49 UTC (rev 91509)
@@ -39,6 +39,7 @@
 
   &ParseArguments ;
   &ReadStatisticsMonthly ;
+  &FindLargestWikis ;
   &WriteMonthlyData ;
 
   print "\nReady\n\n" ;
@@ -54,15 +55,20 @@
   { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
   print ("\nArguments\n$arguments\n") ;
 
-# $options {"i"} = "w:/# out bayes" ;     # EZ test
-# $options {"o"} = "c:/MySQL/analytics" ; # EZ test
+  if (! -d '/mnt/') # EZ test
+  {
+    $path_in  = "c:/\@ wikimedia/# out bayes" ;
+    $path_out = "c:/MySQL/analytics" ;
+  }
+  else
+  {
+    die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
+    die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
 
-  die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
-  die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
+    $path_in  = $options {"i"} ;
+    $path_out = $options {"o"} ;
+  }
 
-  $path_in  = $options {"i"} ;
-  $path_out = $options {"o"} ;
-
   die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
   die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
 
@@ -158,6 +164,8 @@
 
     ($month,$day,$year) = split ('\/', $date) ;
     $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
+    $months {$yyyymm} ++ ;
+#    print "YYYYMM $yyyymm\n" ;
 
     # data have been collected in WikiCountsProcess.pm and been written in WikiCountsOutput.pm
     # count user with over x edits
@@ -167,6 +175,11 @@
     $edits_ge_25  = @counts [4] > 0 ? @counts [4] : 0 ;
     $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
     $data2 {"$project,$language,$yyyymm"} = 
"$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
+
+    $total_edits_ge_5   {"$project,$language"} += $edits_ge_5 ;
+    $total_edits_ge_25  {"$project,$language"} += $edits_ge_25 ;
+    $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
+
     # prep string with right amount of comma's
     if ($data2_default eq '')
     {
@@ -237,12 +250,46 @@
 #  }
 #}
 
+sub FindLargestWikis
+{
+  print "Largest projects (most accumulated very active editors):\n";
+  @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 {$a}} keys %total_edits_ge_100 ;
+  $rank = 0 ;
+  foreach $project_language (@total_edits_ge_100)
+  {
+    $largest_projects {$project_language} = $rank++ ;
+    print "$project_language," ;
+    last if $rank > 10 ;
+  }
+  print "\n\n" ;
+
+  foreach $yyyymm (sort keys %months)
+  {
+    next if $yyyymm lt '2011' ;
+    foreach $project_language (keys %largest_projects)
+    {
+      ($project,$language) = split (',', $project_language) ;
+         if ($data2 {"$project,$language,$yyyymm"} eq '')
+      {
+        print "No data yet for large wiki $project_language for $yyyymm-> skip 
month $yyyymm\n" ;
+        $months {$yyyymm} = 0 ;
+      }
+    }
+  }
+  exit ;
+}
+
 sub WriteMonthlyData
 {
   my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
   open CSV_OUT, '>', $file_csv_out ;
   foreach $project_wiki_month (sort keys %data1)
   {
+    ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;
+
+    # recent month misses one or more large wikis?
+    next if $months {$yyyymm} == 0 ;
+
     $data1 = $data1 {$project_wiki_month} ;
     $data2 = $data2 {$project_wiki_month} ;
     if ($data2 eq '')
@@ -250,9 +297,12 @@
       print "Editor data missing for $project_wiki_month\n" ;
       $data2 = $data2_default ;
     }
-    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields follow each other
+    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor fields close together
     print CSV_OUT "$project_wiki_month,$data1\n" ;
   }
+  $total_edits_ge_5   {"$project,*,$yyyymm"} += $edits_ge_5 ;
+  $total_edits_ge_25  {"$project,*,$yyyymm"} += $edits_ge_25 ;
+  $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
   close CSV_OUT ;
 }
 

Added: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt
===================================================================
--- trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt                            (rev 0)
+++ trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt    2011-07-06 00:37:49 UTC (rev 91509)
@@ -0,0 +1,179 @@
+-- make sure to delete output files *test*.csv first if any exist (MySQL deliberately forbids overwriting them)
+
+-- tables implemented:
+--   comscore
+--   comscore_regions
+--   wikistats 
+--   page_views     
+--   language names 
+--   binaries    
+
+-- more tables planned (O= optional, not needed for report card stage)
+--   project_names  
+-- O edits          per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
+-- O editors        per project_code, per language, per month, per normalization type (Y/N), editor type (manual, anonymous, bot), namespace group (articles, talk pages, other)
+
+
+-- open issues: 
+-- only store basic data in the database and calculate all aggregates on the fly, or do some aggregation beforehand? (e.g. count for mobile / non-mobile / ==> total of both? <==) (see the sketch below)
+-- for binaries, store one extension type per row? (future proof, more work to query), or a selected few as columns? (one row only needed per month)
+
+-- Create database and two tables from scratch 
+DROP DATABASE IF EXISTS `analytics` ;
+
+CREATE DATABASE `analytics` ;
+
+USE `analytics` ;
+
+CREATE TABLE `comscore` (
+  `date`           date NOT NULL, 
+  `country_code`   varchar (3), 
+  `region_code`    varchar (3), 
+  `web_property`   varchar (20), 
+  `project_code`   varchar (10), 
+  `reach`          decimal (4,1) DEFAULT NULL,
+  `visitors`       decimal (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,country_code,region_code,project_code,web_property),
+  KEY (`country_code`)  
+) ;
+
+CREATE TABLE `comscore_regions` (
+  `region_code`      varchar (2), 
+  `report_language`  varchar (10),
+  `region_name`      varchar (18),
+  PRIMARY KEY (report_language,region_code)
+) ;  
+
+CREATE TABLE `wikistats` (
+  `date`                     date NOT NULL, 
+  `project_code`             varchar (10),
+  `language_code`            varchar (15),
+  `editors_all_time`         int (10)  DEFAULT NULL, 
+  `editors_new`              int (7)   DEFAULT NULL, 
+  `editors_ge_5`             int (7)   DEFAULT NULL, 
+  `editors_ge_25`            int (7)   DEFAULT NULL, 
+  `editors_ge_100`           int (7)   DEFAULT NULL, 
+  `articles`                 int (12)  DEFAULT NULL, 
+  `articles_new_per_day`     int (9)   DEFAULT NULL, 
+  `articles_over_bytes_500`  int (12)  DEFAULT NULL, 
+  `articles_over_bytes_2000` int (12)  DEFAULT NULL, 
+  `edits_per_article`        decimal (9,1) DEFAULT NULL, 
+  `bytes_per_article`        decimal (9,1) DEFAULT NULL, 
+  `edits`                    int (12)  DEFAULT NULL, 
+  `size_in_bytes`            int (15)  DEFAULT NULL, 
+  `size_in_words`            int (15)  DEFAULT NULL, 
+  `links_internal`           int (15)  DEFAULT NULL, 
+  `links_interwiki`          int (15)  DEFAULT NULL, 
+  `links_image`              int (15)  DEFAULT NULL, 
+  `links_external`           int (15)  DEFAULT NULL, 
+  `redirects`                int (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,project_code,language_code)
+) ;
+
+CREATE TABLE `page_views` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `views_non_mobile_raw`        bigint (15), 
+  `views_mobile_raw`            bigint (15), 
+  `views_non_mobile_normalized` bigint (15), 
+  `views_mobile_normalized`     bigint (15), 
+  `views_raw`                   bigint (15), 
+  `views_normalized`            bigint (15), 
+  PRIMARY KEY (date,project_code,language_code)
+) ;
+
+CREATE TABLE `language_names` (
+  `report_language`             varchar (15),   
+  `language_code`               varchar (15),
+  `language_name`               varchar (50),
+  PRIMARY KEY (report_language,language_code)
+) ;
+
+CREATE TABLE `binaries` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `extension`                   varchar (10),
+  `binaries`                    bigint (15), 
+  PRIMARY KEY (date,project_code,language_code,extension)
+) ;
+
+CREATE TABLE `offline` (
+  `date`                        date NOT NULL, 
+  `readers`                     bigint (12),
+  PRIMARY KEY (date,readers)
+) ;
+
+-- SHOW TABLES ;
+-- DESCRIBE comscore ;
+-- DESCRIBE comscore_regions ;
+-- DESCRIBE wikistats ;
+-- DESCRIBE page_views ;
+-- DESCRIBE language_names ;
+-- DESCRIBE binaries ;
+-- DESCRIBE offline ;
+
+-- Database Manipulation
+-- Obviously, in the real world this would be a separate script
+
+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
+     INTO TABLE comscore 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,country_code,region_code,web_property,project_code,reach,visitors)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
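+-- (the SET clause above converts the 'yyyy-mm' value from the csv into the last day of that month,
+--  e.g. a @date of '2011-06' is stored as 2011-06-30; the same pattern is used for the other loads below)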
+
+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
+     INTO TABLE comscore_regions 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,region_code,region_name) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
+     INTO TABLE wikistats 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
+     INTO TABLE page_views 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+
+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
+     INTO TABLE language_names 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,language_code,language_name) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
+     INTO TABLE binaries 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,extension,binaries) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' 
+     INTO TABLE offline 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,readers) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+  
+  
+-- show contents (debugging only) 
+-- SELECT * FROM comscore ;
+-- SELECT * FROM comscore_regions ;
+-- SELECT * FROM wikistats ;
+-- SELECT * FROM page_views ;
+-- SELECT * FROM language_names ;
+-- SELECT * FROM binaries 
+-- WHERE project_code = 'commons' ;
+-- SELECT * FROM offline ;
+
+

Added: trunk/wikistats/analytics/analytics_generate_csv_files.sh
===================================================================
--- trunk/wikistats/analytics/analytics_generate_csv_files.sh                           (rev 0)
+++ trunk/wikistats/analytics/analytics_generate_csv_files.sh   2011-07-06 00:37:49 UTC (rev 91509)
@@ -0,0 +1,21 @@
+#!/bin/sh
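+# cap this shell's virtual memory (ulimit -v takes KiB, so the line below allows roughly 8 GB)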
+ulimit -v 8000000
+
+clear
+cd /a/analytics
+
+perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/test/
+
+# add or replace data from newest comScore csv files (last 14 months) into master files (full history)
+# and generate database input csv file from it 
+
+# -r replace (default is add only)
+# -i input folder, contains manually downloaded csv files from comScore (or xls, converted to csv)
+# -m master files with full history
+# -o output csv file, with reach and UV's per region and UV's per top web property, ready for import into database
+perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics -o /a/analytics
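+# (without -r the same invocation only adds months that are missing from the master files, e.g.:
+#  perl AnalyticsPrepComscoreData.pl -i /a/analytics/comscore -m /a/analytics -o /a/analytics )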
+
+perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics 
+
+cp /a/wikistats/csv_wp/analytics_in_page_views.csv .
+

Added: trunk/wikistats/analytics/analytics_new.sh
===================================================================
--- trunk/wikistats/analytics/analytics_new.sh                          (rev 0)
+++ trunk/wikistats/analytics/analytics_new.sh  2011-07-06 00:37:49 UTC (rev 91509)
@@ -0,0 +1,5 @@
+clear
+cd /a/analytics
+# rm *test*.csv
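+# (analytics_create_and_load_from_csv.txt drops and recreates the analytics database, then reloads every table from its csv file)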
+# mysql  --user=root --password=changerootwhenpriyankareturns < analytics_create_and_load_from_csv.txt > mysql_log.txt
+  mysql  -u analytics -h project2.wikimedia.org -preport < analytics_create_and_load_from_csv.txt > mysql_log.txt

Added: trunk/wikistats/analytics/analytics_refresh_from_csv.txt
===================================================================
--- trunk/wikistats/analytics/analytics_refresh_from_csv.txt                            (rev 0)
+++ trunk/wikistats/analytics/analytics_refresh_from_csv.txt    2011-07-06 00:37:49 UTC (rev 91509)
@@ -0,0 +1,145 @@
+-- make sure to delete output files *test*.csv first if any exist (MySQL deliberately forbids overwriting them)
+
+USE `analytics` ;
+
+CREATE TABLE `comscore` (
+  `date`           date NOT NULL, 
+  `country_code`   varchar (3), 
+  `region_code`    varchar (3), 
+  `web_property`   varchar (20), 
+  `project_code`   varchar (10), 
+  `reach`          decimal (4,1) DEFAULT NULL,
+  `visitors`       decimal (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,country_code,region_code,project_code,web_property),
+  KEY (`country_code`)  
+) ;
+
+CREATE TABLE `comscore_regions` (
+  `region_code`      varchar (2), 
+  `report_language`  varchar (10),
+  `region_name`      varchar (18),
+  PRIMARY KEY (report_language,region_code)
+) ;  
+
+CREATE TABLE `wikistats` (
+  `date`                     date NOT NULL, 
+  `project_code`             varchar (10),
+  `language_code`            varchar (15),
+  `editors_all_time`         int (10)  DEFAULT NULL, 
+  `editors_new`              int (7)   DEFAULT NULL, 
+  `editors_ge_5`             int (7)   DEFAULT NULL, 
+  `editors_ge_25`            int (7)   DEFAULT NULL, 
+  `editors_ge_100`           int (7)   DEFAULT NULL, 
+  `articles`                 int (12)  DEFAULT NULL, 
+  `articles_new_per_day`     int (9)   DEFAULT NULL, 
+  `articles_over_bytes_500`  int (12)  DEFAULT NULL, 
+  `articles_over_bytes_2000` int (12)  DEFAULT NULL, 
+  `edits_per_article`        decimal (9,1) DEFAULT NULL, 
+  `bytes_per_article`        decimal (9,1) DEFAULT NULL, 
+  `edits`                    int (12)  DEFAULT NULL, 
+  `size_in_bytes`            int (15)  DEFAULT NULL, 
+  `size_in_words`            int (15)  DEFAULT NULL, 
+  `links_internal`           int (15)  DEFAULT NULL, 
+  `links_interwiki`          int (15)  DEFAULT NULL, 
+  `links_image`              int (15)  DEFAULT NULL, 
+  `links_external`           int (15)  DEFAULT NULL, 
+  `redirects`                int (15)  DEFAULT NULL, 
+  PRIMARY KEY (date,project_code,language_code)
+) ;
+
+CREATE TABLE `page_views` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `views_non_mobile_raw`        bigint (15), 
+  `views_mobile_raw`            bigint (15), 
+  `views_non_mobile_normalized` bigint (15), 
+  `views_mobile_normalized`     bigint (15), 
+  `views_raw`                   bigint (15), 
+  `views_normalized`            bigint (15), 
+  PRIMARY KEY (date,project_code,language_code)
+) ;
+
+CREATE TABLE `language_names` (
+  `report_language`             varchar (15),   
+  `language_code`               varchar (15),
+  `language_name`               varchar (50),
+  PRIMARY KEY (report_language,language_code)
+) ;
+
+CREATE TABLE `binaries` (
+  `date`                        date NOT NULL, 
+  `project_code`                char (2),
+  `language_code`               char (15),
+  `extension`                   varchar (10),
+  `binaries`                    bigint (15), 
+  PRIMARY KEY (date,project_code,language_code,extension)
+) ;
+
+CREATE TABLE `offline` (
+  `date`                        date NOT NULL, 
+  `readers`                     bigint (12),
+  PRIMARY KEY (date,readers)
+) ;
+
+
+-- SHOW TABLES ;
+-- DESCRIBE comscore ;
+-- DESCRIBE comscore_regions ;
+-- DESCRIBE wikistats ;
+-- DESCRIBE page_views ;
+-- DESCRIBE language_names ;
+-- DESCRIBE binaries ;
+-- DESCRIBE offline ;
+
+-- Database Manipulation
+-- Obviously, in the real world this would be a separate script
+
+TRUNCATE TABLE comscore ;
+LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
+     INTO TABLE comscore 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (@date,country_code,region_code,web_property,project_code,reach,visitors)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE comscore_regions ;
+LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
+     INTO TABLE comscore_regions 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,region_code,region_name) ;
+
+TRUNCATE TABLE wikistats ;
+LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
+     INTO TABLE wikistats 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE page_views ;
+LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
+     INTO TABLE page_views 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+
+TRUNCATE TABLE language_names ;
+LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
+     INTO TABLE language_names 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (report_language,language_code,language_name) ;
+
+TRUNCATE TABLE binaries ;
+LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
+     INTO TABLE binaries 
+     FIELDS TERMINATED BY ',' 
+     OPTIONALLY ENCLOSED BY '"' 
+     (project_code,language_code,@date,extension,binaries) 
+     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
+
+TRUNCATE TABLE offline ;

Added: trunk/wikistats/analytics/analytics_upd.sh
===================================================================
--- trunk/wikistats/analytics/analytics_upd.sh                          (rev 0)
+++ trunk/wikistats/analytics/analytics_upd.sh  2011-07-06 00:37:49 UTC (rev 91509)
@@ -0,0 +1,6 @@
+clear
+cd /a/analytics
+
+./analytics_generate_csv_files.sh
+
+mysql  -u analytics -h project2.wikimedia.org -preport < analytics_refresh_from_csv.txt
\ No newline at end of file


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
