Author: spadkins
Date: Tue Dec 12 11:33:13 2006
New Revision: 8387

Modified:
   p5ee/trunk/App-Repository/lib/App/Repository.pm

Log:
enhance summarize_rows() to do extended summaries

Modified: p5ee/trunk/App-Repository/lib/App/Repository.pm
==============================================================================
--- p5ee/trunk/App-Repository/lib/App/Repository.pm     (original)
+++ p5ee/trunk/App-Repository/lib/App/Repository.pm     Tue Dec 12 11:33:13 2006
@@ -2762,17 +2762,33 @@
     $summarized_rows = $rep->summarize_rows([EMAIL PROTECTED], [EMAIL 
PROTECTED], [EMAIL PROTECTED], \%formulas);
 
     @rows = (
-        { id=>5, "Jim", "Green", 13.5, 320, },
-        { id=>3, "Bob", "Green",  4.2, 230, },
-        { id=>9, "Ken", "Green", 27.4, 170, },
-        { id=>2, "Kim", "Blue",  11.7, 440, },
-        { id=>7, "Jan", "Blue",  55.1,  90, },
-        { id=>1, "Ben", "Blue",  22.6, 195, },
+        { id=>5, name=>"Jim", team=>"Green", rating=>13.5, score=>320, },
+        { id=>3, name=>"Bob", team=>"Green", rating=> 4.2, score=>230, },
+        { id=>9, name=>"Ken", team=>"Green", rating=>27.4, score=>170, },
+        { id=>2, name=>"Kim", team=>"Blue",  rating=>11.7, score=>440, },
+        { id=>7, name=>"Jan", team=>"Blue",  rating=>55.1, score=> 90, },
+        { id=>1, name=>"Ben", team=>"Blue",  rating=>22.6, score=>195, },
     );
     @columns = ( "rating", "score" );  # summarize a subset of the columns
     @summary_keys = ( "team" );
-    %options = ();
+    %options = (
+        ext_summaries => \%summaries,         # extended summaries
+        ext_summary_columns => [ "rating", "score", "team", ],   # optional
+        ext_summary_functions => {  # optional
+            sum      => 1,
+            count    => 1,
+            sum_sq   => 1,
+            distinct => 1,
+            min      => 1,
+            max      => 1,
+            average  => 1,  # requires sum, count
+            median   => 1,  # requires distinct
+            mode     => 1,  # requires min, max
+            stddev   => 1,  # requires sum, sum_sq, count
+        },
+    );
 
+    # returns the "natural" summaries
     $summarized_rows = $rep->summarize_rows([EMAIL PROTECTED], [EMAIL 
PROTECTED], [EMAIL PROTECTED], \%options);
 
 =cut
@@ -2785,6 +2801,24 @@
 
     my $column_def = $self->{table}{$table}{column};
 
+    my ($ext_summaries, $ext_column_summary, $ext_summary_columns, 
$ext_summary_functions);
+    $ext_summaries = $options->{ext_summaries};
+    if ($ext_summaries) {
+        $ext_summary_columns = $options->{ext_summary_columns};
+        $ext_summary_functions = $options->{ext_summary_functions} || {  # do 
all of them
+            count    => 1,
+            distinct => 1,
+            sum      => 1,
+            sum_sq   => 1,
+            min      => 1,
+            max      => 1,
+            average  => 1,  # requires sum, count
+            median   => 1,  # requires distinct
+            mode     => 1,  # requires min, max
+            stddev   => 1,  # requires sum, sum_sq, count
+        };
+    }
+
     my (@summary_rows, $summary_row, %summary_row);
     my ($key, $row, $hash_rows, $hash_row, $i, $rowidx, $colidx, $column, 
$value);
     my $row_type = "ARRAY";
@@ -2866,7 +2900,86 @@
                 push(@summary_key_values, $key);
             }
             foreach $i (@$sum_column_idx) {
-                $summary_row->[$i] += $row->[$i];
+                if (defined $row->[$i]) {
+                    $summary_row->[$i] += $row->[$i];
+                }
+            }
+            if ($ext_summaries) {
+                foreach $i (@$sum_column_idx) {
+                    $column = $columns->[$i];
+                    $value = $row->[$i];
+                    if (defined $value) {
+                        $ext_column_summary = $ext_summaries->{$column};
+                        if (!$ext_column_summary) {
+                            $ext_column_summary = {};
+                            $ext_summaries->{$column} = $ext_column_summary;
+                        }
+                        $ext_column_summary->{count} ++;
+                        $ext_column_summary->{distinct}{$value} ++;
+                        $ext_column_summary->{sum} += $value;
+                        $ext_column_summary->{sum_sq} += $value*$value;
+                        if (!defined $ext_column_summary->{min} || 
$ext_column_summary->{min} > $value) {
+                            $ext_column_summary->{min} = $value;
+                        }
+                        if (!defined $ext_column_summary->{max} || 
$ext_column_summary->{max} > $value) {
+                            $ext_column_summary->{max} = $value;
+                        }
+                    }
+                }
+            }
+        }
+        if ($ext_summaries) {
+            my ($count, $sum, $sum_sq, $num, $median_count, $median);
+            foreach $i (@$sum_column_idx) {
+                $column = $columns->[$i];
+                $ext_column_summary = $ext_summaries->{$column};
+                if ($ext_column_summary && $ext_column_summary->{count}) {
+                    $ext_column_summary->{average} = 
$ext_column_summary->{sum}/$ext_column_summary->{count};
+                    $ext_column_summary->{mode}    = 
($ext_column_summary->{max} + $ext_column_summary->{min})/2;
+                    $count                         = 
$ext_column_summary->{count};
+                    if ($count > 1) {
+                        $sum                       = 
$ext_column_summary->{sum};
+                        $sum_sq                    = 
$ext_column_summary->{sum_sq};
+                        $value                     = ($count * $sum_sq + $sum 
* $sum)/($count * ($count - 1));
+                        if ($value > 0) {
+                            $ext_column_summary->{stddev} = sqrt($value);
+                        }
+                    }
+                    if ($count % 2 == 1) {
+                        $num = 0;
+                        $median_count = ($count - 1)/2 + 1;
+                        foreach $value (sort { $a <=> $b } keys 
%{$ext_column_summary->{distinct}}) {
+                            $num += $ext_column_summary->{distinct}{$value};
+                            if ($num >= $median_count) {
+                                $ext_column_summary->{median} = $value;
+                                last;
+                            }
+                        }
+                    }
+                    else {
+                        $num = 0;
+                        $median_count = $count/2;
+                        $median = undef;
+                        foreach $value (sort { $a <=> $b } keys 
%{$ext_column_summary->{distinct}}) {
+                            $num += $ext_column_summary->{distinct}{$value};
+                            if (!defined $median) {
+                                if ($num >= $median_count + 1) {
+                                    $ext_column_summary->{median} = $value;
+                                    last;
+                                }
+                                elsif ($num == $median_count) {
+                                    $median = $value;
+                                }
+                            }
+                            else {
+                                if ($num >= $median_count + 1) {
+                                    $ext_column_summary->{median} = ($median + 
$value)/2;
+                                    last;
+                                }
+                            }
+                        }
+                    }
+                }
             }
         }
 

Reply via email to