analytics

diederik Wed, 07 Dec 2011 09:40:07 -0800

https://www.mediawiki.org/wiki/Special:Code/MediaWiki/105445


Revision: 105445
Author:   diederik
Date:     2011-12-07 17:39:50 +0000 (Wed, 07 Dec 2011)
Log Message:
-----------
Moved files to trunk/extensions/MetricsReporting/import

Added Paths:
-----------
    trunk/wikistats/analytics/README

Removed Paths:
-------------
    trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl
    trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
    trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl
    trunk/wikistats/analytics/AnalyticsPrepPageViews.pl
    trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
    trunk/wikistats/analytics/_readme.txt
    trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt
    trunk/wikistats/analytics/analytics_generate_csv_files.sh
    trunk/wikistats/analytics/analytics_new.sh
    trunk/wikistats/analytics/analytics_refresh_from_csv.txt
    trunk/wikistats/analytics/analytics_upd.sh

Deleted: trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl      2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/AnalyticsPrepBinariesData.pl      2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,124 +0,0 @@
-#!/usr/local/bin/perl
-
-  use Getopt::Std ;
-
-  &ParseArguments ;
-
-  print "Write file '$file_csv_out'\n" ;
-  open CSV_OUT, '>', $file_csv_out ;
-
-  foreach $project (qw (wb wk wn wp wq ws wv wx))
-  { &ReadStatisticsPerBinariesExtension ($project) ; }
-
-  close CSV_OUT  ;
-
-  print "\n\nReady\n\n" ;
-  exit ;
-
-sub ParseArguments
-{
-  my @options ;
-  getopt ("io", \%options) ;
-
-  die ("Specify input folder as: -i path")   if (! defined ($options {"i"})) ;
-  die ("Specify output folder as: -o path'") if (! defined ($options {"o"})) ;
-
-  $path_in  = $options {"i"} ;
-  $path_out = $options {"o"} ;
-
-  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
-  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
-
-  # tests only
-  # $path_in  = "C:/@ Wikimedia/# Out Bayes" ;
-  # $path_out = "C:/analytics" ; # "w:/@ report card/data" ;
-
-  print "Input  folder: $path_in\n" ;
-  print "Output folder: $path_out\n" ;
-  print "\n" ;
-
-  $file_csv_out = "$path_out/analytics_in_binaries.csv" ;
-}
-
-
-sub ReadStatisticsPerBinariesExtension
-{
-  my $project = shift ;
-  my $file_csv_in = "$path_in/csv_$project/StatisticsPerBinariesExtension.csv" 
;
-  $yyyymm_hi = -1 ;
-
-  if (! -e $file_csv_in)
-  { die "Input file '$file_csv_in' not found" ; }
-
-
-  print "Read '$file_csv_in'\n" ;
-  open CSV_IN, '<', $file_csv_in ;
-
-  $language_prev = '' ;
-  while ($line = <CSV_IN>)
-  {
-    chomp $line ;
-    next if $line !~ /,.*?,/ ;
-
-    ($language,$date,$data) = split (',', $line, 3) ;
-
-    # for each wiki first line shows ext names, no tcounts
-    if ($date eq "00/0000")
-    {
-      if ($language_prev ne '')
-      { &WriteMonthlyData ($project, $language_prev) ; }
-      $language_prev = $language ;
-
-      undef %ext_name ;
-      undef %ext_ndx ;
-      undef %ext_cnt ;
-      undef %months ;
-
-      @exts = split (',', $data) ;
-      $ndx = 0 ;
-      foreach $ext (@exts)
-      {
-        $ext_name {$ndx} = $ext ;
-        $ext_ndx  {$ext} = $ndx ;
-        $ndx ++ ;
-      }
-      next ;
-    }
-
-    ($month,$year) = split ('\/', $date) ;
-    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
-    if ($yyyymm gt $yyyymm_hi)
-    { $yyyymm_hi = $yyyymm ; }
-    $months {$yyyymm}++ ;
-
-    @counts = split (',', $data) ;
-    $ndx = 0 ;
-    foreach $count (@counts)
-    {
-      $ext_cnt {$yyyymm}{$ext_name {$ndx}}  = $count ;
-      $ndx ++ ;
-    }
-  }
-  &WriteMonthlyData ($project, $language_prev) ;
-
-  close CSV_IN ;
-}
-
-sub WriteMonthlyData
-{
-  my ($project,$language) = @_ ;
-  # get sorted array of extensions, order by count for most recent month
-  %ext_cnt_yyyymm_hi = %{$ext_cnt {$yyyymm_hi}} ;
-  @ext_cnt_yyyymm_hi = (sort {$ext_cnt_yyyymm_hi {$b} <=> $ext_cnt_yyyymm_hi 
{$a}} keys %ext_cnt_yyyymm_hi) ;
-
-  foreach $month (sort keys %months)
-  {
-    $ndx = 0 ;
-    foreach $ext (@ext_cnt_yyyymm_hi)
-    {
-      print CSV_OUT 
"$project,$language,$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
-    # print         "$month,$ext,${ext_cnt{$yyyymm}{$ext_name {$ndx}}}\n" ;
-      last if (++ $ndx > 25) ;
-    }
-  }
-}

Deleted: trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/AnalyticsPrepComscoreData.pl      2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,564 +0,0 @@
-#!/usr/bin/perl
-
-# Copyright (C) 2011 Wikimedia Foundation
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License version 2
-# as published by the Free Software Foundation.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-# See the GNU General Public License for more details, at
-# http://www.fsf.org/licenses/gpl.html
-
-# Author:
-# Erik Zachte, email ezac...@wikimedia.org
-
-# Functionality:
-# comScore data can be downloaded as csv file, which each contain 14 months 
history
-# This script uses these files to update 'master' csv files which contain all 
known history
-# Note: only entities which are already in master file will be updated!
-# Then it merges these master files into one csv file which can be loaded into 
analytics database
-# Data are: reach by region, unique visitors by region, unique visitors by web 
property
-
-# Parameters:
-# -m (required) folder with 'master' csv files (files with all known history)
-# -u (required) folder with 'update' csv files (files with lastest 14 months 
history, produced by comScore)
-
-# Output:
-# updated master csv files + merged and formatted csv for import in MySQL
-
-# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
-
-  use Getopt::Std ;
-  use Cwd;
-
-  my $options ;
-  getopt ("imo", \%options) ;
-
-  $true  = 1 ;
-  $false = 0 ;
-
-  $script_name    = "AnalyticsPrepComscoreData.pl" ;
-  $script_version = "0.31" ;
-
-# EZ test only
-# $source       = "comscore" ;
-# $server       = "ez_test" ;
-# $generated    = "2011-05-06 00:00:00" ;
-# $user         = "ezachte" ;
-
-  $dir_in   = $options {"i"} ;
-  $dir_upd  = $options {"m"} ;
-  $dir_out  = $options {"o"} ;
-  $mode = 'add';
-  if (defined $options {'r'})
-  { $mode = 'replace'; }
-
-  print "Mode is $mode (specify '-r' for replace)\n\n";
-
-  if (! -d "/home/") # EZ test machine
-  {
-    $dir_in  = "C:/@ Wikimedia/@ Report Card/Data" ;
-    $dir_upd = "C:/MySQL/analytics" ;
-    $dir_out = "C:/MySQL/analytics" ;
-    $mode = 'replace' ;
-  }
-
-  if ($dir_in eq '')
-  { Abort ("Specify folder for input file (new comScore data) '-i folder'") ; }
-  if ($dir_upd eq '')
-  { Abort ("Specify folder for master files (full history) as '-m folder'") ; }
-  if ($dir_out eq '')
-  { Abort ("Specify folder for output file '-o folder'") ; }
-
-  $file_comscore_reach_master     = "history_comscore_reach_regions.csv" ;
-  $file_comscore_reach_update     = "*reach*by*region*csv" ;
-  $file_comscore_uv_region_master = "history_comscore_UV_regions.csv" ;
-  $file_comscore_uv_region_update = "*UVs*by*region*csv" ;
-  $file_comscore_uv_property_master = "history_comscore_UV_properties.csv" ;
-  $file_comscore_uv_property_update = "*UV*trend*csv" ;
-
-  $layout_csv_reach      = 1 ;
-  $layout_csv_regions    = 2 ;
-  $layout_csv_properties = 3 ;
-
-  print "Directories:\n" .
-        "Input (new comScore data): '$dir_in'\n".
-        "Master files (full history): '$dir_upd'\n" .
-        "Output (database feed): '$dir_out'\n\n" ;
-
-  %region_codes = (
-    "Europe"=>"EU",
-    "North America"=>"NA",
-    "Latin America"=>"LA",
-    "World-Wide" => "W",
-    "Middle East - Africa" => "MA",
-    "Asia Pacific"=> "AS",
-    "United States" => "US",
-    "India" => "I",
-    "China" => "C"
-  ) ;
-
-  foreach $region_name (keys %region_codes)
-  { $region_names {$region_codes {$region_name}} = $region_name ; }
-
-  @months_short = qw "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec" ;
-
-  &ReadMasterComscoreDataReachPerRegion ($file_comscore_reach_master, 
$file_comscore_reach_update, "%.1f", 1, $layout_csv_reach) ;
-  %reach_region_code = %data ;
-
-  &ReadMasterComscoreDataVisitorsPerRegion ($file_comscore_uv_region_master, 
$file_comscore_uv_region_update, "%.0f", 1000, $layout_csv_regions) ;
-  %visitors_region_code = %data ;
-
-  &ReadMasterComscoreDataVisitorsPerProperty 
($file_comscore_uv_property_master, $file_comscore_uv_property_update, "%.0f", 
1000, $layout_csv_properties) ;
-  %visitors_web_property = %data ;
-
-  &WriteDataAnalytics ;
-
-  print "\nReady\n\n" ;
-  exit ;
-
-sub ReadMasterComscoreDataReachPerRegion
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $layout_csv) 
= @_ ;
-
-  print "ReadMasterComscoreDataReachPerRegion\n\n" ;
-
-  undef %months ;
-  undef %data ;
-  undef @regions ;
-
-  open IN,  '<', "$dir_upd/$file_comscore_master" ;
-
-  $lines = 0 ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-    $line =~ s/\r//g ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-
-    if ($lines++ == 0)
-    {
-      @regions = @data ;
-      print "Regions found: " . (join ',', @regions) . "\n";
-      next ;
-    }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $region      = $regions [$field_ndx] ;
-      $region_code = $region_codes {$region} ;
-
-      $data      = $data [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$region_code"} = $data ;
-      # print "Old data $yyyymm,$region_code = $data\n" ;
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1, $layout_csv, @regions) ;
-  return if ! $updates_found ;
-
-  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $region_name (@regions)
-  { $line_out .= ",$region_name" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $region_name (@regions)
-    {
-      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub ReadMasterComscoreDataVisitorsPerRegion
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, 
$layout_csv) = @_ ;
-
-  print "ReadMasterComscoreDataVisitorsPerRegion\n\n";
-
-  undef %months ;
-  undef %data ;
-  undef @regions ;
-
-  open IN,  '<', "$dir_upd/$file_comscore_master" ;
-
-  $lines  = 0 ;
-  $metric = 'unique_visitors' ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-    $line =~ s/\r//g ;
-    $line = &GetNumberOnly ($line) ;
-
-    next if $line !~ /(?:yyyymm|\d\d\d\d-\d\d)/ ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-
-    if ($lines++ == 0)
-    {
-      @regions = @data ;
-      print "Regions found: " . (join ',', @regions) . "\n";
-      next ;
-    }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $region      = $regions [$field_ndx] ;
-      $region_code = $region_codes {$region} ;
-
-      $data      = $data    [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-
-      # print "Old data $yyyymm,$region = $data\n" ;
-
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$region_code"} = $data ;
-
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @regions) ;
-  return if ! $updates_found ;
-
-  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $region_name (@regions)
-  { $line_out .= ",$region_name" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $region_name (@regions)
-    {
-      $yyyymm_region_code = $yyyymm . ',' . $region_codes {$region_name} ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_region_code}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub ReadMasterComscoreDataVisitorsPerProperty
-{
-  my ($file_comscore_master, $file_comscore_updates, $precision, $multiplier, 
$layout_csv) = @_ ;
-
-  print "ReadMasterComscoreDataVisitorsPerProperty\n\n";
-
-  undef %months ;
-  undef %data ;
-  undef @properties ;
-
-  open IN,  '<', "$dir_upd/$file_comscore_master" ;
-
-  $lines = 0 ;
-  $metric       = 'unique_visitors' ;
-  while ($line = <IN>)
-  {
-    chomp $line ;
-    $line =~ s/\r//g ;
-
-    ($yyyymm,@data) = split (',', $line) ;
-    if ($lines++ == 0)
-    { @properties = @data ; next ; }
-
-    $field_ndx = 0 ;
-    foreach (@data)
-    {
-      $property = $properties [$field_ndx] ;
-      $property =~ s/.*Yahoo.*/Yahoo/ ;
-      $data      = $data    [$field_ndx] ;
-      if ($data eq '')
-      { $data = '0' ; }
-
-      # print "Old data $yyyymm,$property = $data\n" ;
-
-      $months {$yyyymm} ++ ;
-      $data {"$yyyymm,$property"} = $data ;
-
-      $field_ndx++ ;
-    }
-  }
-  close IN ;
-
-  my $updates_found = &UpdateMasterFileFromRecentComscoreData 
($file_comscore_master, $file_comscore_updates, 1000, $layout_csv, @properties) 
;
-  return if ! $updates_found ;
-
-  rename "$dir_upd/$file_comscore_master", "$dir_upd/$file_comscore_master.~" ;
-  open OUT,  '>', "$dir_upd/$file_comscore_master" ;
-
-  $line_out = "yyyymm" ;
-  foreach $property (@properties)
-  { $line_out .= ",$property" ; }
-  print OUT "$line_out" ;
-
-  foreach $yyyymm (sort {$b cmp $a} keys %months)
-  {
-    $line_out = "\n$yyyymm" ;
-    foreach $property (@properties)
-    {
-      $yyyymm_property = "$yyyymm,$property" ;
-      $line_out .= "," . sprintf ($precision, $data {$yyyymm_property}) ;
-    }
-    print OUT "$line_out" ;
-  }
-
-  close OUT ;
-}
-
-sub UpdateMasterFileFromRecentComscoreData
-{
-  my ($file_comscore_master, $file_comscore_updates, $multiplier, $layout_csv, 
@white_list) = @_ ;
-
-  print "UpdateMasterFileFromRecentComscoreData\n\n";
-
-  undef %white_list ;
-  undef %not_white_listed ;
-
-  print "White list: ". (join (',', @white_list)) . "\n\n";
-
-  foreach $id (@white_list)
-  { $white_list {$id} = $true ; }
-
-  if (! -e "$dir_upd/$file_comscore_master")
-  { Abort ("File $file_comscore_master not found!") ; }
-
-  $age_all = -M "$dir_upd/$file_comscore_master" ;
-  print "Latest comscore master file is " . sprintf ("%.0f", $age_all) . " 
days old: '$file_comscore_master'\n" ;
-
-  my $cwd = getcwd ;
-  chdir $dir_in ;
-
-  @files = glob($file_comscore_updates) ;
-  $min_age_upd = 999999 ;
-  $file_comscore_updates_latest = '' ;
-  foreach $file (@files)
-  {
-    $age = -M $file ;
-    if ($age < $min_age_upd)
-    {
-      $min_age_upd = $age ;
-      $file_comscore_updates_latest = $file ;
-    }
-  }
-  print "Latest comscore update file is " . sprintf ("%.0f", $min_age_upd) . " 
days old: '$file_comscore_updates_latest'\n" ;
-
-  if ($min_age_upd == 999999)
-  {
-    print "No valid update file found. Nothing to update." ;
-    return ;
-  }
-
-  #if ($age_all > $min_age_upd)
-  #{
-  #  print "File with master data more recent than latest update csv from 
comScore. Nothing to update." ;
-  #  return ;
-  #}
-
-  my $updates_found = $false ;
-
-  open CSV, '<', $file_comscore_updates_latest ;
-  binmode CSV ;
-  while ($line = <CSV>)
-  {
-    chomp $line ;
-    $line =~ s/\r//g ;
-    $line = &GetNumberOnly ($line) ;
-
-    if ($line =~ /Jan-\d\d\d\d.*?Feb-\d\d\d\d/) # e.g. 
'Location,Location,Jan-2010,Feb-2010,Mar-2010,Apr-2010,...'
-    {
-      if ($layout_csv == $layout_csv_properties)
-      { ($dummy1,$dummy2,$dummy3,@months) = split (',', $line) ; } # web 
properties csv file
-      else
-      { ($dummy1,$dummy2,@months) = split (',', $line) ; }         # uv / 
reach csv files
-
-      @months = &mmm_yyyy2yyyy_mm (@months) ;
-    }
-
-    if (($line =~ /^\d+,/) || ($line =~ /,,.*?Total Internet/))
-    {
-      if ($layout_csv == $layout_csv_properties)
-      {
-        ($index,$dummy,$property,@data) = split (',', $line) ;
-        $property =~ s/^\s+// ;
-        $property =~ s/\s+$// ;
-
-        $property =~ s/.*Total Internet.*/Total Internet/i ;
-        $property =~ s/.*Google.*/Google/i ;
-        $property =~ s/.*Microsoft.*/Microsoft/i ;
-        $property =~ s/.*FACEBOOK.*/Facebook/i ;
-        $property =~ s/.*Yahoo.*/Yahoo/i ;
-        $property =~ s/.*Amazon.*/Amazon/i ;
-        $property =~ s/.*Apple.*/Apple/i ;
-        $property =~ s/.*AOL.*/AOL/i ;
-        $property =~ s/.*Wikimedia.*/Wikimedia/i ;
-        $property =~ s/.*Tencent.*/Tencent/i ;
-        $property =~ s/.*Baidu.*/Baidu/i ;
-        $property =~ s/.*CBS.*/CBS/i ;
-
-        if (! $white_list {$property})
-        {
-          $not_white_listed {$property}++ ;
-          next ;
-        }
-
-        $id = $property ;
-      }
-      else
-      {
-        ($index,$region,@data) = split (',', $line) ;
-        $region =~ s/^\s+// ;
-        $region =~ s/\s+$// ;
-
-        if (! $white_list {$region})
-        {
-          $not_white_listed {$region}++ ;
-          next ;
-        }
-
-        $id = $region_codes {$region} ;
-      }
-
-      for ($m = 0 ; $m <= $#months ; $m++)
-      {
-        $yyyymm = $months [$m] ;
-        $months {$yyyymm} ++ ;
-        $yyyymm_id = "$yyyymm,$id" ;
-        $data = $data [$m] * $multiplier ;
-
-        if ($mode eq 'add')
-        {
-          if (! defined $data {$yyyymm_id})
-          {
-            $updates_found = $true ;
-            print "New data found: $yyyymm_id = $data\n" ;
-            $data {$yyyymm_id} = $data ;
-          }
-        }
-        else
-        {
-          $updates_found = $true ;
-          print "Data found: $yyyymm_id = $data\n" ;
-          $data {$yyyymm_id} = $data ;
-        }
-      }
-    }
-  }
-
-  $entities_not_white_listed = join (', ', sort keys %not_white_listed) ;
-  if ($entities_not_white_listed ne '')
-  { print "\nEntities ignored:\n$entities_not_white_listed\n\n" ; }
-
-  if (! $updates_found)
-  { print "No new updates found\n" ; }
-  else
-  { print "\nUpdates found, rewrite master file '$file_comscore_master'\n\n" ; 
}
-
-  return ($updates_found) ;
-}
-
-sub WriteDataAnalytics
-{
-  print "WriteDataAnalytics\n\n";
-
-  open OUT, '>', "$dir_out/analytics_in_comscore.csv" ;
-
-  $metric = 'unique_visitors' ;
-  foreach $yyyymm (sort keys %months)
-  {
-    # store meta data elsewhere
-    # $line = 
"$generated,$source,$server,$script_name,$script_version,$user,$yyyymm,$country_code,$region_code,$property,$project,$normalized,$metric,$data\n"
 ;
-    foreach $region_code (sort values %region_codes)
-    {
-      $country_code = '-' ;
-      $property     = '-' ;
-      $project      = '-' ;
-      $reach        = $reach_region_code     {"$yyyymm,$region_code"} ;
-      $visitors     = $visitors_region_code  {"$yyyymm,$region_code"} ;
-
-      if (! defined $reach)    { $reach    = -1 ; }
-      if (! defined $visitors) { $visitors = -1 ; }
-
-      next if $reach == -1 and $visitors == -1 ;
-
-      $line = 
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
-      print OUT $line ;
-      print     $line ;
-    }
-
-    foreach $property (sort @properties)
-    {
-      $country_code = '-' ;
-      $region_code  = '-' ;
-      $project      = '-' ;
-      $reach        = '-1' ;
-      $visitors     = $visitors_web_property {"$yyyymm,$property"} ;
-
-      next if ! defined $visitors ;
-
-      $line = 
"$yyyymm,$country_code,$region_code,$property,$project,$reach,$visitors\n" ;
-      print OUT $line ;
-      # print     $line ;
-    }
-  }
-}
-
-sub GetNumberOnly
-{
-  my $line = shift ;
-  $line =~ s/("[^\"]+")/($a=$1,$a=~s#,##g,$a)/ge ; # nested regexp: remove 
comma's inside double quotes
-  $line =~ s/"//g ;
-  return $line ;
-}
-
-sub mmm_yyyy2yyyy_mm
-{
-  my @months = @_ ;
-  my ($m) ;
-  # Jan -> 01, etc
-  foreach $month (@months)
-  {
-    my ($mmm,$yyyy) = split ('-', $month) ;
-    for ($m = 0 ; $m <= $#months_short ; $m++)
-    {
-      if ($mmm eq $months_short [$m])
-      {
-        $month = "$yyyy-" . sprintf ("%02d", $m+1) ;
-        last ;
-      }
-    }
-  }
-  return @months ;
-}
-
-sub Abort
-{
-  $msg = shift ;
-
-  print  "\nAbort, reason: $msg\n\n" ;
-  exit ;
-}

Deleted: trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl     2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/AnalyticsPrepLanguageNames.pl     2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,162 +0,0 @@
-#!/usr/bin/perl
-
-# This module prepares a csv file with language names for feed into the 
analytics database
-# The main work of collecting these names from different sources is done by
-# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiReportsLocalizations.pm
-# which is part of wikistats reporting phase and once each month updates local 
master csv files from all sources
-
-# The following code to merge and filter these master csv files is based on 
parts of the code in WikiReportsLocalizations.pm, function Localization
-# To do some day: make a completely independent script out of this code and 
code from WikiReportsLocalizations.pm which covers the whole production cycle
-
-# Sources for language names:
-# - php files
-# - translatewiki
-# - English Wikipedia API (interwikilinks)
-
-# Right now multilingual support for the analytics database is just a nice 
idea, so to speed up data feeds, just keep English and German translations
-
-  use Getopt::Std ;
-
-  $true  = 1 ;
-  $false = 0 ;
-
-  $max_language_name = 50 ; # truncate if longer
-
-  $file_csv_language_names_php   = "LanguageNamesViaPhp.csv" ;
-  $file_csv_language_names_wp    = "LanguageNamesViaWpEnEdited.csv" ;
-  $file_csv_analytics_in         = "analytics_in_language_names.csv" ;
-
-  $languages_force_case_uc = "ast|br|de|en|id|nl|wa" ; # incomplete list, 
based on languages supported by wikistats reporting
-  $languages_filter        = "de|en" ;
-  foreach $language (split '\|', $languages_filter)
-  {  $include_target_language {$language} = $true ; }
-
-  &ParseArguments ;
-  &ReadCsvFiles ;
-  &WriteCsvFile ;
-
-
-#  if ($language eq "id") # swap which file takes precedence
-
-  print "\nReady\n\n" ;
-  exit ;
-
-sub ParseArguments
-{
-  my (@options, $arguments) ;
-
-  getopt ("io", \%options) ;
-
-  foreach $arg (sort keys %options)
-  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
-  print ("\nArguments\n$arguments\n") ;
-
-  $options {"i"} = "w:/# out bayes/csv_wp" ; # EZ test
-  $options {"o"} = "c:/MySQL/analytics" ;    # EZ test
-
-  die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
-  die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
-
-  ($path_in  = $options {"i"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash 
if any
-  ($path_out = $options {"o"}) =~ s/[\/\\]+$// ; # remove trailing (back)slash 
if any
-
-  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
-  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
-
-  print "Input  folder: $path_in\n" ;
-  print "Output folder: $path_out\n\n" ;
-
-  $file_csv_language_names_php = "$path_in/$file_csv_language_names_php" ;
-  $file_csv_language_names_wp  = "$path_in/$file_csv_language_names_wp" ;
-  $file_csv_analytics_in       = "$path_out/$file_csv_analytics_in" ;
-
-  die "Input file '$file_csv_language_names_php' not found"   if (! -e 
$file_csv_language_names_php) ;
-  die "Input file '$file_csv_language_names_wp' not found"    if (! -e 
$file_csv_language_names_wp) ;
-}
-
-sub ReadCsvFiles
-{
-  #first read definitions from php message files, then overwrite with 
definitions from interwiki links when available
-  # except for target language 'id' (Indonesian) where quality of php file has 
been deemed more reliable
-
-  open CSV_IN, "<", $file_csv_language_names_php ;
-  while ($line = <CSV_IN>)
-  {
-    chomp ($line) ;
-    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
-
-    next if ! $include_target_language {$target_language} ;
-
-    $out_languages {$target_language} {$code} = &FormatName ($target_language, 
$name_unicode) ; # forget about html for analytics database
-  }
-  close CSV_IN ;
-
-  open CSV_IN, "<", $file_csv_language_names_wp ;
-  while ($line = <CSV_IN>)
-  {
-    chomp ($line) ;
-    ($target_language, $code, $name_unicode, $name_html) = split (',', $line) ;
-
-    next if ! $include_target_language {$target_language} ;
-
-    next if $target_language eq 'id' and $out_languages {$target_language} 
{$code} ne '' ;
-
-  # $name_unicode_php = $out_languages {$target_language} {$code} ;           
# test only
-  # $name_unicode_wp  = &FormatName ($target_language, $name_unicode) ;       
# test only
-  # if (($name_unicode_php ne '') && ($name_unicode_php ne $name_unicode_wp)) 
# test only
-  # { print "$name_unicode_php => $name_unicode_wp\n" ; }                     
# test only
-
-    $out_languages {$target_language} {$code} = &FormatName ($target_language, 
$name_unicode) ; # forget about html for analytics database
-  }
-  close CSV_IN ;
-}
-
-sub FormatName
-{
-  my ($target_language, $name_unicode) = @_ ;
-
-  $name_unicode2 = $name_unicode ;
-
-  if ($target_language eq "de")
-  { $name_unicode =~ s/e?\s*\-?sprache//i ; }
-
-  if ($target_language =~ /^(?:$languages_force_case_uc)/)
-  { $name_unicode = ucfirst $name_unicode ; }
-  else
-  { $name_unicode = lc $name_unicode ; }
-
-# Test only
-# if (($target_language eq 'de') && ($name_unicode ne $name_unicode2))
-# { print "$name_unicode2 => $name_unicode\n" ; }
-
-  return ($name_unicode) ;
-}
-
-sub WriteCsvFile
-{
-  open CSV_OUT, ">", $file_csv_analytics_in || die ("File 
'$file_csv_analytics_in' could not be opened") ;
-  binmode CSV_OUT ; # force Unix style linebreak \012 
-
-  foreach $target_language (sort keys %out_languages)
-  {
-    print "\nTarget language '$target_language'\n" ;
-    %translations = %{$out_languages {$target_language}} ;
-
-    foreach $code (sort keys %translations)
-    {
-      $language_name  = $translations{$code} ;
-      $language_name2 = substr ($language_name,0,$max_language_name) ;
-
-      if ($language_name ne $language_name2)
-      { print "Language name truncated to $max_language_name chars: 
'$language_name' -> '$language_name2'\n" ; }
-
-      if ($language_name2 =~ /,/)
-      { $language_name2 = "\"$language_name2\"" ; }
-      # test only
-      print CSV_OUT "$target_language,$code,$language_name2\n" ;
-    }
-  }
-
-  close CSV_OUT ;
-}
-

Deleted: trunk/wikistats/analytics/AnalyticsPrepPageViews.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepPageViews.pl 2011-12-07 17:36:45 UTC 
(rev 105444)
+++ trunk/wikistats/analytics/AnalyticsPrepPageViews.pl 2011-12-07 17:39:50 UTC 
(rev 105445)
@@ -1,11 +0,0 @@
-#!/usr/bin/perl
-
-# This file is just a place holder (alias)
-# Page views per day/week/month/etc, per project, per language, nrmalized and 
not normalized are collected by 
-# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/dumps/WikiCountsSummarizeProjectCounts.pl
-#
-# Added May 2001: 
-# For analytics database one file is written for all projects and languages 
combined, 
-# with per month, not normalized and normalized page view counts in one row.
-# This file is written to same output folder as other csv files generated by 
WikiCountsSummarizeProjectCounts.pl
-# and needs to be copied to analytics folder later
\ No newline at end of file

Deleted: trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl
===================================================================
--- trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl  2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/AnalyticsPrepWikiCountsOutput.pl  2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,334 +0,0 @@
-#!/usr/local/bin/perl
-
-# Copyright (C) 2011 Wikimedia Foundation
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License version 2
-# as published by the Free Software Foundation.
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-# See the GNU General Public License for more details, at
-# http://www.fsf.org/licenses/gpl.html
-
-# Author:
-# Erik Zachte, email ezac...@wikimedia.org
-# loosely based on predecessor
-# 
http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/reportcard/ReportCardExtractWikiCountsOutput.pl
-
-# Functionality:
-# tba
-
-# Parameters:
-# tba
-
-# Output:
-# updated csv file for import in MySQL
-
-# http://svn.wikimedia.org/viewvc/mediawiki/trunk/wikistats/analytics/
-
-  use Getopt::Std ;
-
-  $true  = 1 ;
-  $false = 0 ;
-
-  @projects = ('wb','wk','wn','wp','wq','ws','wv','wx','commons','*') ;
-
-  $file_csv_monthly_data         = "StatisticsMonthly.csv" ;
-  $file_csv_user_activity_spread = "StatisticsUserActivitySpread.csv" ;
-  $file_csv_analytics_in         = "analytics_in_wikistats.csv" ;
-
-  &ParseArguments ;
-  &ReadStatisticsMonthly ;
-  &FindLargestWikis ;
-  &WriteMonthlyData ;
-
-  print "\nReady\n\n" ;
-  exit ;
-
-sub ParseArguments
-{
-  print "ParseArguments\n" ;
-  my (@options, $arguments) ;
-
-  getopt ("io", \%options) ;
-
-  foreach $arg (sort keys %options)
-  { $arguments .= " -$arg " . $options {$arg} . "\n" ; }
-  print ("\nArguments\n$arguments\n") ;
-
-  if (! -d '/mnt/') # EZ test
-  {
-    $path_in  = "c:/\@ wikimedia/# out bayes" ;
-    $path_out = "c:/MySQL/analytics" ;
-  }
-  else
-  {
-    die ("Specify input folder for projectcounts files as: -i path") if (! 
defined ($options {"i"})) ;
-    die ("Specify output folder as: -o path'")                       if (! 
defined ($options {"o"})) ;
-
-    $path_in  = $options {"i"} ;
-    $path_out = $options {"o"} ;
-  }
-
-  die "Input folder '$path_in' does not exist"   if (! -d $path_in) ;
-  die "Output folder '$path_out' does not exist" if (! -d $path_out) ;
-
-  print "Input  folder: $path_in\n" ;
-  print "Output folder: $path_out\n\n" ;
-
-  $file_csv_out = "$path_out/analytics_in_wikistats.csv" ;
-}
-
-sub ReadStatisticsMonthly
-{
-  print "ReadStatisticsMonthly\n" ;
-  &ReadStatisticsMonthlyForProject ("wb") ;
-  &ReadStatisticsMonthlyForProject ("wk") ;
-  &ReadStatisticsMonthlyForProject ("wn") ;
-  &ReadStatisticsMonthlyForProject ("wp") ;
-  &ReadStatisticsMonthlyForProject ("wq") ;
-  &ReadStatisticsMonthlyForProject ("ws") ;
-  &ReadStatisticsMonthlyForProject ("wv") ;
-  &ReadStatisticsMonthlyForProject ("wx") ;
-
-# &ReadStatisticsPerBinariesExtensionCommons ;
-}
-
-sub ReadStatisticsMonthlyForProject
-{
-  my $project = shift;
-  $all_projects = "*" ;
-
-  my $file_csv_in_1 = "$path_in/csv_$project/$file_csv_monthly_data" ;
-  my $file_csv_in_2 = "$path_in/csv_$project/$file_csv_user_activity_spread" ;
-
-  if (! -e $file_csv_in_1)
-  { &Abort ("Input file '$file_csv_in_1' not found") ; }
-  if (! -e $file_csv_in_2)
-  { &Abort ("Input file '$file_csv_in_2' not found") ; }
-
-  my $yyyymm ;
-
-  print "Read '$file_csv_in_1'\n" ;
-  open CSV_IN, '<', $file_csv_in_1 ;
-  while ($line = <CSV_IN>)
-  {
-    chomp $line ;
-    ($language,$date,$counts) = split (',', $line, 3) ;
-    @fields = split (',', $counts) ;
-
-    next if ! &AcceptWiki ($project,$language) ;
-
-    ($month,$day,$year) = split ('\/', $date) ;
-    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
-
-    foreach $field (@fields)
-    {
-      if ($field eq '-')
-      { $field = 0 ; }
-    }
-
-    $data = $fields  [0] . ',' . # contributors all time
-            $fields  [1] . ',' . # new contributors
-            'data2,'           . # place holder for more data, to be inserted 
later
-            $fields  [4] . ',' . # articles
-            $fields  [6] . ',' . # articles new per day
-            $fields  [9] . ',' . # larger than 0.5 kB
-            $fields [10] . ',' . # larger than 2.0 kB
-            $fields  [7] . ',' . # mean edits per article
-            $fields  [8] . ',' . # mean bytes per article
-            $fields [11] . ',' . # edits
-            $fields [12] . ',' . # size in bytes
-            $fields [13] . ',' . # size in words
-            $fields [14] . ',' . # links internal
-            $fields [15] . ',' . # links interwiki
-            $fields [16] . ',' . # links images
-            $fields [17] . ',' . # links external
-            $fields [18] ;       # redirects
-
-    $data1 {"$project,$language,$yyyymm"} = $data ;
-  }
-  close CSV_IN ;
-
-  # now read (very) active editors from newer more accurate file (split data 
for reg users and bots, unlike StatisticsMonthly.csv)
-
-  print "Read '$file_csv_in_2'\n" ;
-  open CSV_IN, '<', $file_csv_in_2 ;
-  while ($line = <CSV_IN>)
-  {
-    chomp $line ;
-    ($language,$date,$reguser_bot,$group,@counts) = split (',', $line) ;
-
-    next if ! &AcceptWiki ($project,$language) ;
-
-    if ($reguser_bot ne "R") { next ; }  # R: reg user, B: bot
-    if ($group       ne "A") { next ; }  # A: articles, T: talk pages, O: 
other namespaces
-
-    ($month,$day,$year) = split ('\/', $date) ;
-    $yyyymm = sprintf ("%04d-%02d", $year, $month) ;
-    $months {$yyyymm} ++ ;
-#    print "YYYYMM $yyyymm\n" ;
-
-    # data have been collected in WikiCountsProcess.pm and been written in 
WikiCountsOutput.pm
-    # count user with over x edits
-    # threshold starting with a 3 are 10xSQRT(10), 100xSQRT(10), 
1000xSQRT(10), etc
-    # @thresholds = 
(1,3,5,10,25,32,50,100,250,316,500,1000,2500,3162,5000,10000,25000,31623,50000,100000,250000,316228,500000,1000000,2500000,3162278,500000,10000000,25000000,31622777,5000000,100000000)
 ;
-    $edits_ge_5   = @counts [2] > 0 ? @counts [2] : 0 ;
-    $edits_ge_25  = @counts [4] > 0 ? @counts [4] : 0 ;
-    $edits_ge_100 = @counts [7] > 0 ? @counts [7] : 0 ;
-    $data2 {"$project,$language,$yyyymm"} = 
"$edits_ge_5,$edits_ge_25,$edits_ge_100" ;
-
-    $total_edits_ge_5   {"$project,$language"} += $edits_ge_5 ;
-    $total_edits_ge_25  {"$project,$language"} += $edits_ge_25 ;
-    $total_edits_ge_100 {"$project,$language"} += $edits_ge_100 ;
-
-    # prep string with right amount of comma's
-    if ($data2_default eq '')
-    {
-      $data2_default = $data2 {"$project,$language,$yyyymm"} ;
-      $data2_default =~ s/[^,]+/0/g ;
-    }
-  }
-  close CSV_IN ;
-}
-
-#sub ReadStatisticsPerBinariesExtensionCommons
-#{
-#  my $file_csv_in = "$path_in/csv_wx/StatisticsPerBinariesExtension.csv" ;
-#  my $mmax = -1 ;
-
-#  if (! -e $file_csv_in)
-#  { &Abort ("Input file '$file_csv_in' not found") ; }
-
-#  print "Read '$file_csv_in'\n" ;
-#  open CSV_IN, '<', $file_csv_in ;
-#  while ($line = <CSV_IN>)
-#  {
-#    chomp $line ;
-#    ($language,$date,$counts) = split (',', $line, 3) ;
-
-#    if ($language ne "commons") { next ; }
-
-#    if ($date eq "00/0000")
-#    {
-#      @fields = split (',', $counts) ;
-#      $field_ndx = 0 ;
-#      foreach $field (@fields)
-#      {
-#        $ext_cnt {-1}{$field_ndx} = $field ;
-#      # print "EXT_CNT $field_ndx : $field\n" ;
-#        $field_ndx ++ ;
-#      }
-#      next ;
-#    }
-
-#    ($month,$year) = split ('\/', $date) ;
-#    my $m = &months_since_2000_01 ($year,$month) ;
-#    next if $m < $m_start ;
-
-#    if ($m > $mmax)
-#    { $mmax = $m ; }
-
-#    @fields = split (',', $counts) ;
-#    $field_ndx = 0 ;
-#    foreach $field (@fields)
-#    {
-#      $ext_cnt {$m}{$field_ndx}  = $field ;
-#      $ext_tot {$m}             += $field ;
-#      $field_ndx ++ ;
-#    }
-#  }
-#  close CSV_IN ;
-
-#  %ext_cnt_mmax = %{$ext_cnt {$mmax}} ;
-#  @ext_cnt_mmax = (sort {$ext_cnt_mmax {$b} <=> $ext_cnt_mmax {$a}} keys 
%ext_cnt_mmax) ;
-
-#  $extcnt = 0 ;
-#  foreach $extndx (@ext_cnt_mmax)
-#  {
-#    # print "$extndx < ${ext_cnt {-1}{$extndx}} > : ${ext_cnt_mmax 
{$extndx}}\n" ;
-#    push @extndxs, $extndx ;
-#    if ($extcnt++ >= 9) { last ; }
-#  }
-#}
-
-sub FindLargestWikis
-{
-  print "FindLargestWikis\n" ;
-  print "Largest projects (most accumulated very active editors):\n";
-  @total_edits_ge_100 = sort {$total_edits_ge_100 {$b} <=> $total_edits_ge_100 
{$a}} keys %total_edits_ge_100 ;
-  $rank = 0 ;
-  foreach $project_language (@total_edits_ge_100)
-  {
-    $largest_projects {$project_language} = $rank++ ;
-    print "$project_language," ;
-    last if $rank > 10 ;
-  }
-  print "\n\n" ;
-
-  foreach $yyyymm (sort keys %months)
-  {
-    next if $yyyymm lt '2011' ;
-    foreach $project_language (keys %largest_projects)
-    {
-      ($project,$language) = split (',', $project_language) ;
-          if ($data2 {"$project,$language,$yyyymm"} eq '')
-      {
-        print "No data yet for large wiki $project_language for $yyyymm-> skip 
month $yyyymm\n" ;
-        $months {$yyyymm} = 0 ;
-      }
-    }
-  }
-}
-
-sub WriteMonthlyData
-{
-  print "WriteMonthlyData\n" ;
-  my $file_csv_out = "$path_out/$file_csv_analytics_in" ;
-  open CSV_OUT, '>', $file_csv_out ;
-  foreach $project_wiki_month (sort keys %data1)
-  {
-    ($project,$wiki,$yyyymm) = split (',', $project_wiki_month) ;
-
-    # recent month misses on eor more large wikis?
-    next if $months {$yyyymm} == 0 ;
-
-    $data1 = $data1 {$project_wiki_month} ;
-    $data2 = $data2 {$project_wiki_month} ;
-    if ($data2 eq '')
-    {
-      print "Editor data missing for $project_wiki_month\n" ;
-      $data2 = $data2_default ;
-    }
-    $data1 =~ s/data2/$data2/ ; # insert rather than append to have all editor 
fields close together
-    print CSV_OUT "$project_wiki_month,$data1\n" ;
-  }
-  $total_edits_ge_5   {"$project,*,$yyyymm"} += $edits_ge_5 ;
-  $total_edits_ge_25  {"$project,*,$yyyymm"} += $edits_ge_25 ;
-  $total_edits_ge_100 {"$project,*,$yyyymm"} += $edits_ge_100 ;
-  close CSV_OUT ;
-}
-
-sub AcceptWiki
-{
-  my ($project,$language) = @_ ;
-
-  return $false if $language eq 'commons' and $project ne 'wx' ; # commons 
also in wikipedia csv files (bug, hard to cleanup, just skip)
-  return $false if $language eq 'sr'      and $project eq 'wn' ; # ignore 
insane bot spam on
-  return $false if $language =~ 
/mania|team|comcom|closed|chair|langcom|office|searchcom|sep11|nostalgia|stats|test/i
 ;
-
-  return $false if $language =~ /^(?:dk|tlh|ru_sib)$/ ; # dk=dumps exist(ed?) 
but site not, tlh=Klignon, ru-sib=Siberian
-  return $false if $project eq 'wk' and ($language eq "als" or $language eq 
"tlh") ;
-
-  return $true ;
-}
-
-sub Abort
-{
-  my $msg = shift ;
-  print "$msg\nExecution aborted." ;
-  # to do: log also to file
-  exit ;
-}
-
-

Added: trunk/wikistats/analytics/README
===================================================================
--- trunk/wikistats/analytics/README                            (rev 0)
+++ trunk/wikistats/analytics/README    2011-12-07 17:39:50 UTC (rev 105445)
@@ -0,0 +1 @@
+The files in this folder have been moved to 
trunk/extensions/MetricReporting/import.


Property changes on: trunk/wikistats/analytics/README
___________________________________________________________________
Added: svn:eol-style
   + native

Deleted: trunk/wikistats/analytics/_readme.txt
===================================================================
--- trunk/wikistats/analytics/_readme.txt       2011-12-07 17:36:45 UTC (rev 
105444)
+++ trunk/wikistats/analytics/_readme.txt       2011-12-07 17:39:50 UTC (rev 
105445)
@@ -1,21 +0,0 @@
-This folder contain bash and perl files to create and fill database 
'analytics', a.o. for monthly dashboard.
-
-== analytics_new.sh ==
-Defines the database and tables and loads data from existing csv files.
-It executes SQL from analytics_create_and_load_from_csv.txt
-
-== analytics_upd.sh ==
-Prepares new csv files (delegated to analytics_generate_csv_files.sh),
-and empties/reloads all tables for which csv files are in this folder.   
-It executes SQL from analytics_refresh_from_csv.txt
-
-== CSV files ==
-CSV files and where they are generated:
-
-analytics_in_binaries.csv          <- AnalyticsPrepBinariesData.pl
-analytics_in_comscore.csv          <- AnalyticsPrepComscoreData.pl             
   
-analytics_in_comscore_regions.csv  <- manual
-analytics_in_language_names.csv   
-analytics_in_offline.csv           <- manual
-analytics_in_page_views.csv        <- 
/home/ezachte/wikistats/pageviews_monthly.sh (file copy after that)
-analytics_in_wikistats.csv         <- AnalyticsPrepWikiCountsOutput.pl

Deleted: trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt
===================================================================
--- trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt    
2011-12-07 17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/analytics_create_and_load_from_csv.txt    
2011-12-07 17:39:50 UTC (rev 105445)
@@ -1,181 +0,0 @@
--- make sure to delete output files *test*.csv first if any exist (MySQL on 
purpose forbids overwrite)
-
--- tables implemented:
---   comscore
---   comscore_regions
---   wikistats 
---   page_views     
---   language names 
---   binaries    
-
--- more tables planned (O= optional, not needed for report card stage)
---   project_names  
--- O edits          per project_code, per language, per month, per 
normalization type (Y/N), editor type (manual, anonymous, bot), namespace group 
(articles, talk pages, other)  
--- O editors        per project_code, per language, per month, per 
normalization type (Y/N), editor type (manual, anonymous, bot), namespace group 
(articles, talk pages, other)  
-
-
--- open issues: 
--- only store basic data in database and calculate all aggregates on the fly 
or do some aggregation before hand ? (e.g. count for mobile / non-mobile / ==> 
total of both ? <==)
--- for binaries, store one extension type per row? (future proof, more work to 
query), or a selected few as columns? (one row only needed per month)
-
--- Create database and two tables from scratch 
-DROP DATABASE IF EXISTS `analytics` ;
-
-CREATE DATABASE `analytics` ;
-
-USE `analytics` ;
-
-CREATE TABLE `comscore` (
-  `date`           date NOT NULL, 
-  `country_code`   varchar (3), 
-  `region_code`    varchar (3), 
-  `web_property`   varchar (20), 
-  `project_code`   varchar (10), 
-  `reach`          decimal (4,1) DEFAULT NULL,
-  `visitors`       decimal (15)  DEFAULT NULL, 
-  PRIMARY KEY (date,country_code,region_code,project_code,web_property),
-  KEY (`country_code`)  
-) ;
-
-CREATE TABLE `comscore_regions` (
-  `region_code`      varchar (2), 
-  `report_language`  varchar (10),
-  `region_name`      varchar (18),
-  PRIMARY KEY (region_code,report_language)
-) ;  
-
-CREATE TABLE `wikistats` (
-  `date`                     date NOT NULL, 
-  `project_code`             varchar (10),
-  `language_code`            varchar (15),
-  `editors_all_time`         int (10)  DEFAULT NULL, 
-  `editors_new`              int (7)   DEFAULT NULL, 
-  `editors_ge_5`             int (7)   DEFAULT NULL, 
-  `editors_ge_25`            int (7)   DEFAULT NULL, 
-  `editors_ge_100`           int (7)   DEFAULT NULL, 
-  `articles`                 int (12)  DEFAULT NULL, 
-  `articles_new_per_day`     int (9)   DEFAULT NULL, 
-  `articles_over_bytes_500`  int (12)  DEFAULT NULL, 
-  `articles_over_bytes_2000` int (12)  DEFAULT NULL, 
-  `edits_per_article`        decimal (9,1) DEFAULT NULL, 
-  `bytes_per_article`        decimal (9,1) DEFAULT NULL, 
-  `edits`                    int (12)  DEFAULT NULL, 
-  `size_in_bytes`            int (15)  DEFAULT NULL, 
-  `size_in_words`            int (15)  DEFAULT NULL, 
-  `links_internal`           int (15)  DEFAULT NULL, 
-  `links_interwiki`          int (15)  DEFAULT NULL, 
-  `links_image`              int (15)  DEFAULT NULL, 
-  `links_external`           int (15)  DEFAULT NULL, 
-  `redirects`                int (15)  DEFAULT NULL, 
-  PRIMARY KEY (date,project_code,language_code)
-) ;
-
-CREATE TABLE `page_views` (
-  `date`                        date NOT NULL, 
-  `project_code`                char (2),
-  `language_code`               char (15),
-  `views_non_mobile_raw`        bigint (15), 
-  `views_mobile_raw`            bigint (15), 
-  `views_non_mobile_normalized` bigint (15), 
-  `views_mobile_normalized`     bigint (15), 
-  `views_raw`                   bigint (15), 
-  `views_normalized`            bigint (15), 
-  PRIMARY KEY (date,project_code,language_code),
-  KEY (date,language_code,project_code)
-) ;
-
-CREATE TABLE `language_names` (
-  `report_language`             varchar (15),   
-  `language_code`               varchar (15),
-  `language_name`               varchar (50),
-  PRIMARY KEY (report_language,language_code)
-) ;
-
-CREATE TABLE `binaries` (
-  `date`                        date NOT NULL, 
-  `project_code`                char (2),
-  `language_code`               char (15),
-  `extension`                   varchar (10),
-  `binaries`                    bigint (15), 
-  PRIMARY KEY (date,project_code,language_code,extension)
-) ;
-
-CREATE TABLE `offline` (
-  `date`                        date NOT NULL,
-  `source`                      varchar (255),
-  `readers`                     bigint (12),
-  PRIMARY KEY (date,source)
-) ;
-
--- SHOW TABLES ;
--- DESCRIBE comscore ;
--- DESCRIBE comscore_regions ;
--- DESCRIBE wikistats ;
--- DESCRIBE page_views ;
--- DESCRIBE language_names ;
--- DESCRIBE binaries ;
--- DESCRIBE offline ;
-
--- Database Manipulation
--- Obviously in real world this is a separate script
-
-LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
-     INTO TABLE comscore 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (@date,country_code,region_code,web_property,project_code,reach,visitors)
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
-     INTO TABLE comscore_regions 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (report_language,region_code,region_name) ;
-
-LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
-     INTO TABLE wikistats 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     
(project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
-     INTO TABLE page_views 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     
(project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-
-LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
-     INTO TABLE language_names 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (report_language,language_code,language_name) ;
-
-LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
-     INTO TABLE binaries 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (project_code,language_code,@date,extension,binaries) 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' 
-     INTO TABLE offline 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (@date,readers) 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-  
-  
--- show contents (debugging only) 
--- SELECT * FROM comscore ;
--- SELECT * FROM comscore_regions ;
--- SELECT * FROM wikistats ;
--- SELECT * FROM page_views ;
--- SELECT * FROM language_names ;
--- SELECT * FROM binaries 
--- WHERE project_code = 'commons' ;
-   SELECT * FROM offline ; 
-
-

Deleted: trunk/wikistats/analytics/analytics_generate_csv_files.sh
===================================================================
--- trunk/wikistats/analytics/analytics_generate_csv_files.sh   2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/analytics_generate_csv_files.sh   2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,47 +0,0 @@
-#!/bin/sh
-
-# Prepare several csv files, ready for importing into analytics database
-# All generated files have _in_ in name signalling these contain data ready 
for importing into database
-# One input record corresponds to one database record
-
-ulimit -v 8000000
-
-clear
-cd /a/analytics
-
-
-# AnalyticsPrepBinariesData.pl read counts for binaries which were generated 
by wikistats 
-# and which reside in /a/wikistats/csv_[project 
code]/StatisticsPerBinariesExtension.csv
-# It filters and reorganizes data and produces analytics_in_binaries.csv
-# Output csv contains: project code, language, month, extension name, count
-
-perl AnalyticsPrepBinariesData.pl -i /a/wikistats/ -o /a/analytics/
-
-# AnalyticsPrepComscoreData.pl scans /a/analytics/comscore for newest comScore 
csv files (with data for last 14 months) 
-# parses those csv files, adds/replaces data from these csv files into master 
files (containing full history)
-# and generates input csv file analytics_in_comscore.csv ready for importing 
into database
-#
-# note : these csv files were manually downloaded from 
http://mymetrix.comscore.com/app/report.aspx 
-# and given more descriptive names, script finds newest files based on partial 
name search 
-#
-# -r replace (default is add only)
-# -i input folder, contains manually downloaded csv files from comScore (or 
xls files manually converted to csv) 
-# -m master files with full history
-# -o output csv file, with reach per region, UV's per region and UV's per top 
web property, ready for import into database
-
-perl AnalyticsPrepComscoreData.pl -r -i /a/analytics/comscore -m /a/analytics 
-o /a/analytics
-
-# AnalyticsPrepWikiCountsOutput.pl reads a plethora of fields from several csv 
files from wikistats process
-# It filters and reorganizes data and produces analytics_in_wikistats.csv, 
ready for import into analytics database 
-
-perl AnalyticsPrepWikiCountsOutput.pl -i /a/wikistats/ -o /a/analytics 
-
-# analytics_in_page_views.csv is written daily as part of 
WikiCountsSummarizeProjectCounts.pl 
-# part of (/home/ezachte/pageviews_monthly.sh job) 
-# which processes hourly projectcounts files (per wiki page view totals for 
one hour) from http://dammit.lt/wikistats
-# and generates several files on different aggregation levels
-# only action here is to copy data to this folder to have everything in one 
place
-# note: unlike folder name suggests this file contains stats for all projects
-
-cp /a/wikistats/csv_wp/analytics_in_page_views.csv .
-

Deleted: trunk/wikistats/analytics/analytics_new.sh
===================================================================
--- trunk/wikistats/analytics/analytics_new.sh  2011-12-07 17:36:45 UTC (rev 
105444)
+++ trunk/wikistats/analytics/analytics_new.sh  2011-12-07 17:39:50 UTC (rev 
105445)
@@ -1,5 +0,0 @@
-clear
-cd /a/analytics
-# rm *test*.csv
-# mysql  --user=root --password=changerootwhenpriyankareturns < 
analytics_create_and_load_from_csv.txt > mysql_log.txt
-  mysql  -u analytics -h project2.wikimedia.org -preport < 
analytics_create_and_load_from_csv.txt > mysql_log.txt

Deleted: trunk/wikistats/analytics/analytics_refresh_from_csv.txt
===================================================================
--- trunk/wikistats/analytics/analytics_refresh_from_csv.txt    2011-12-07 
17:36:45 UTC (rev 105444)
+++ trunk/wikistats/analytics/analytics_refresh_from_csv.txt    2011-12-07 
17:39:50 UTC (rev 105445)
@@ -1,55 +0,0 @@
-USE `analytics` ;
-
-TRUNCATE TABLE comscore ;
-LOAD DATA LOCAL INFILE 'analytics_in_comscore.csv' 
-     INTO TABLE comscore 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (@date,country_code,region_code,web_property,project_code,reach,visitors)
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-TRUNCATE TABLE comscore_regions   ;
-LOAD DATA LOCAL INFILE 'analytics_in_comscore_regions.csv' 
-     INTO TABLE comscore_regions 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (report_language,region_code,region_name) ;
-
-TRUNCATE TABLE wikistats ; 
-LOAD DATA LOCAL INFILE 'analytics_in_wikistats.csv' 
-     INTO TABLE wikistats 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     
(project_code,language_code,@date,editors_all_time,editors_new,editors_ge_5,editors_ge_25,editors_ge_100,articles,articles_new_per_day,articles_over_bytes_500,articles_over_bytes_2000,edits_per_article,bytes_per_article,edits,size_in_bytes,size_in_words,links_internal,links_interwiki,links_image,links_external,redirects)
 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-TRUNCATE TABLE page_views ;
-LOAD DATA LOCAL INFILE 'analytics_in_page_views.csv' 
-     INTO TABLE page_views 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     
(project_code,language_code,@date,views_non_mobile_raw,views_mobile_raw,views_non_mobile_normalized,views_mobile_normalized,views_raw,views_normalized)
 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-TRUNCATE TABLE language_names ;   
-LOAD DATA LOCAL INFILE 'analytics_in_language_names.csv' 
-     INTO TABLE language_names 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (report_language,language_code,language_name) ;
-
-TRUNCATE TABLE binaries ;
-LOAD DATA LOCAL INFILE 'analytics_in_binaries.csv' 
-     INTO TABLE binaries 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (project_code,language_code,@date,extension,binaries) 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;
-
-TRUNCATE TABLE offline ;  
-LOAD DATA LOCAL INFILE 'analytics_in_offline.csv' 
-     INTO TABLE offline 
-     FIELDS TERMINATED BY ',' 
-     OPTIONALLY ENCLOSED BY '"' 
-     (@date, readers) 
-     SET date = last_day (str_to_date(concat (@date,'-01'),'%Y-%m-%d')) ;

Deleted: trunk/wikistats/analytics/analytics_upd.sh
===================================================================
--- trunk/wikistats/analytics/analytics_upd.sh  2011-12-07 17:36:45 UTC (rev 
105444)
+++ trunk/wikistats/analytics/analytics_upd.sh  2011-12-07 17:39:50 UTC (rev 
105445)
@@ -1,6 +0,0 @@
-clear
-cd /a/analytics
-
-./analytics_generate_csv_files.sh
-
-mysql  -u analytics -h project2.wikimedia.org -preport < 
analytics_refresh_from_csv.txt


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

[MediaWiki-CVS] SVN: [105445] trunk/wikistats/analytics

Reply via email to