patch: DateTime::TimeZone PP redundant structures

Matt Sisk Fri, 15 Jul 2005 08:58:01 -0700

Hello,

I updated my 'redundant data structure' fix so that it works with the latest
code in the repository and have included it below.


This only works for the PP versions of the zone modules. It was more of a proof
of concept -- there's probably more efficient ways to approach the problem.
(for example, I'm using md5 signatures of serialized data structures as keys --
I could probably simply use the serialized structures themselves as keys; on the
other hand using md5 might be necessary in the XS world). The performance hit
*should* only happen the first time each zone module is loaded (and even then
I've got the current threshold set to 25, so loading fewer zones than that
should be more or less simliar to current behavior).

The idea is to attack smaller data structures first, reusing them whenever
possible. I also attack larger data sets (say arrays of rules) and objects
(which themselves might contain other structures) and reuse those
super-structures as well, even though they might already be reusing smaller
structures.

I've also included a small test script that loads all the time zone modules and
prints some statistics (not a real benchmark, just reference count info for a
full zone load).

Feel free to play with it and perhaps see if any of it can be applied to the new
XS modifications.

Here's the output I get from the tzt.pl test script below:

---

Files/zones loaded: 373
_juxtapose() calls: 2689
         Refs seen: 21843
       Refs shared: 11006
   Refs eliminated: 10837

breakdown by type of ref:
  ARRAY => 10002 (reduced from 18613)
  DateTime::TimeZone::OlsonDB::Rule => 499 (reduced from 1961)
  DateTime => 193 (reduced from 282)
  HASH => 193 (reduced from 282)
  DateTime::TimeZone::OlsonDB::Observance => 117 (reduced from 141)
  DateTime::Locale::en_US => 1 (reduced from 282)
  DateTime::TimeZone::Floating => 1 (reduced from 282)

---

Cheers,
Matt

Index: lib/DateTime/TimeZone.pm
===================================================================
RCS file: 
/cvsroot/perl-date-time/modules/DateTime-TimeZone/lib/DateTime/TimeZone.pm,v
retrieving revision 1.115
diff -d -u -r1.115 TimeZone.pm
--- lib/DateTime/TimeZone.pm    8 Jul 2005 02:57:58 -0000       1.115
+++ lib/DateTime/TimeZone.pm    15 Jul 2005 15:55:29 -0000
@@ -24,6 +24,14 @@
 use constant IS_DST      => 5;
 use constant SHORT_NAME  => 6;
 
+# for hunting down redundant data structures
+use constant JUXTA_THRESHOLD => 25;
+use vars qw(%juxta_data_registry @juxta_data_queue %juxta_type_count
+            $juxta_load_count $juxta_invocation_count $juxta_attempts);
+$juxta_load_count = $juxta_invocation_count = $juxta_attempts = 0;
+use Digest::MD5;
+use Data::Dumper;
+
 sub new
 {
     my $class = shift;
@@ -342,6 +350,44 @@
     return $self;
 }
 
+
+# Class methods for hunting down redundant data structures
+
+# (counts are merely for diagnostics)
+sub _juxta_increment_load { ++$juxta_load_count }
+
+sub _juxtapose {
+  # does not handle recursive structures!
+  my $class = shift;
+  ++$juxta_invocation_count;
+  if ($juxta_load_count < JUXTA_THRESHOLD) {
+    push(@juxta_data_queue, @_);
+    return @_ > 1 ? @_ : $_[0];
+  }
+  if ($juxta_load_count == JUXTA_THRESHOLD) {
+    # start tracking redundant structures only when we've loaded
+    # JUXTA_THRESHOLD timezones (and process the zone objects loaded
+    # thus far).
+    $class->_juxta_increment_load;
+    $class->_juxtapose(@juxta_data_queue);
+    @juxta_data_queue = ();
+  }
+  # we're over the threshold, so crunch our args
+  foreach (@_) {
+    ref or next;
+    ++$juxta_attempts;
+    ++$juxta_type_count{ref($_)};
+    my $key = Digest::MD5::md5_hex(Dumper($_));
+    if ($juxta_data_registry{$key}) {
+      $_ = $juxta_data_registry{$key};
+    }
+    else {
+      $juxta_data_registry{$key} = $_;
+    }
+  }
+  @_ > 1 ? @_ : $_[0];
+}
+
 #
 # Functions
 #
Index: tools/parse_olson
===================================================================
RCS file: /cvsroot/perl-date-time/modules/DateTime-TimeZone/tools/parse_olson,v
retrieving revision 1.41
diff -d -u -r1.41 parse_olson
--- tools/parse_olson   8 Jul 2005 21:11:21 -0000       1.41
+++ tools/parse_olson   15 Jul 2005 15:55:29 -0000
@@ -391,8 +391,14 @@
 
 [EMAIL PROTECTED]::TimeZone::${mod_name}::ISA = ( 'Class::Singleton', 
'DateTime::TimeZone' );
 
-my \$spans =
-$spans;
+# Load counts for redundant data structure threshold
+__PACKAGE__->_juxta_increment_load;
+
+# Crunch redundant data structures
+my \$spans = __PACKAGE__->_juxtapose([
+__PACKAGE__->_juxtapose(
+  [EMAIL PROTECTED]
+)]);
 
 sub _spans { \$spans }
 sub max_span { \$spans->[-1] }
@@ -737,46 +743,95 @@
 
     return '' unless $zone->infinite_rules;
 
-    my $generator = <<'EOF';
-my $last_observance = !LAST_OBSERVANCE;
-sub _last_observance { $last_observance }
+    my $last_observance = ($zone->sorted_changes)[-1]->observance;
 
-my $rules = !RULES;
-sub _rules { $rules }
-EOF
+    my @rules = $zone->infinite_rules;
 
-    my $last_observance = ($zone->sorted_changes)[-1]->observance;
+    # Hack for reusing redundant data structures. We defined 'redundant'
+    # as sharing a common fingerprint -- in this case, an md5 signature
+    # on the stringified serialized structure as provided by
+    # Data::Dumper. We're doing two things here:
+    #
+    #   1) making the generated perl modules self-referential for common
+    #      structures (thereby reducing the file size)
+    #   2) imbuing the generated modules with the dynamic juxtaposition
+    #      calls necessary for sharing data structures across all time
+    #      zones when new data structures are encountered.
+    #
+    # The keys of %jstruct will end up becoming scalar variable names in
+    # the generated modules. These structures have been specifically
+    # targeted because they offer opportunites for reducing redundancy.
+    my %jstruct;
+    $jstruct{last_observance} = $last_observance;
+    $jstruct{juxta_frule} = $last_observance->{first_rule};
+    $jstruct{juxta_lsd}   = $last_observance->{local_start_datetime};
+    $jstruct{juxta_usd}   = $last_observance->{utc_start_datetime};
+    $jstruct{juxta_lor}   = $last_observance->{rules};
+    $jstruct{juxta_ltz}   = $jstruct{juxta_lsd}{tz};
+    $jstruct{juxta_utz}   = $jstruct{juxta_usd}{tz};
+    $jstruct{juxta_ll}    = $jstruct{juxta_lsd}{locale};
+    $jstruct{juxta_ul}    = $jstruct{juxta_usd}{locale};
+    $jstruct{juxta_llc}   = $jstruct{juxta_lsd}{local_c};
+    $jstruct{juxta_ulc}   = $jstruct{juxta_usd}{local_c};
 
-    # hack to trim size of dumped object
-    delete $last_observance->{utc_start_datetime}{locale};
-    delete $last_observance->{local_start_datetime}{locale};
-    delete $last_observance->{utc_start_datetime}{local_c};
-    delete $last_observance->{local_start_datetime}{local_c};
-    delete $last_observance->{rules};
-    delete $last_observance->{first_rule};
+    # order is important -- super structures to the right of sub
+    # structures.
+    my @jstruct_order = qw(juxta_ltz juxta_utz juxta_ll juxta_ul
+                           juxta_llc juxta_ulc juxta_lsd juxta_usd
+                           juxta_lor juxta_frule last_observance);
+    my %jstruct_str;
+    @[EMAIL PROTECTED] =
+    Data::Dumper->Dump( [ @[EMAIL PROTECTED] ], [ @jstruct_order ] );
+    chomp foreach values %jstruct_str;
 
-    # This assumes that there is only one observance from end of
-    # changes til end of time, which should be guaranteed by code in
-    # OlsonDB module.
-    my $offset = $last_observance->total_offset;
+    my $juxta_declarations;
 
-    my @rules = $zone->infinite_rules;
+    # Arrays of rules are treated slightly differently and are double-
+    # juxed, once for each rule and once for the overal set of rules.
+    my %jstruct_rstr;
+    $jstruct_rstr{juxta_lor}  = join(',', Dumper(@{$jstruct{juxta_lor}}));
+    $jstruct_rstr{rules} = join(',', Dumper(@rules));
+    foreach (qw(juxta_lor rules)) {
+      $juxta_declarations .= <<__JSTR;
+my \$$_ = __PACKAGE__->_juxtapose([
+  __PACKAGE__->_juxtapose(
+    $jstruct_rstr{$_}
+  )
+]);
+__JSTR
+    }
 
-    # This is cleaner than making the above a double-quoted string
-    $generator =~ s/!RULES/Dumper [EMAIL PROTECTED]/eg;
-    $generator =~ s/!LAST_OBSERVANCE/Dumper $last_observance/eg;
-    $generator =~
-        s/\$VAR1->{'local_start_datetime'}{'tz'}/bless( {
-      'name' => 'floating',
-      'offset' => 0
-    }, 'DateTime::TimeZone::Floating' )/;
-    $generator =~
-        s/\$VAR1->{'utc_start_datetime'}{'tz'}/bless( {
-      'name' => 'floating',
-      'offset' => 0
-    }, 'DateTime::TimeZone::Floating' )/;
-    $generator =~ s/!OFFSET/$offset/g;
-    $generator =~ s/!OFFSET/$offset/g;
+    # regular items get single-juxed
+    foreach (@jstruct_order) {
+      # $juxta_lor has already been declared above as an array of rules.
+      # It is in this array because we wanted the generated module to
+      # refer to the $juxta_lor array reference for compactness.
+      next if $_ eq 'juxta_lor';
+      my $str = $jstruct_str{$_};
+      if ($str =~ /^\s*undef\s*$/) {
+        # with the exception of $last_observance, these juxta variables
+        # can be dropped if they happen to be undef.
+        $juxta_declarations .= "my \$$_ = undef;\n"
+          if $str eq 'last_observance';
+      }
+      else {
+        # Wrap their assignments in calls to _juxtapose() so we catch
+        # new data structures dynamically.
+        $juxta_declarations .= <<__JSTR;
+my \$$_ = __PACKAGE__->_juxtapose(
+$str
+);
+__JSTR
+      }
+    }
+
+    my $generator = <<"EOF";
+
+$juxta_declarations
+sub _last_observance { \$last_observance }
+
+sub _rules { \$rules }
+EOF
 
     return $generator;
 }

#!/usr/bin/perl
use strict;
use warnings;

$ENV{PERL_DATETIME_TIMEZONE_PP} = 'yes';

# use lib ...

my @instances;
my %spans;

use DateTime::TimeZone;

print "path: $INC{'DateTime/TimeZone.pm'}\n";

foreach (DateTime::TimeZone::all_names) {
  print "  load $_\n";
  push(@instances, DateTime::TimeZone->new(name => $_));
}
print "\n";

my $loaded = $DateTime::TimeZone::juxta_load_count - 1;
my $calls  = $DateTime::TimeZone::juxta_invocation_count;
my $seen   = $DateTime::TimeZone::juxta_attempts;
my $shared = scalar keys %DateTime::TimeZone::juxta_data_registry;
my $zapped = $seen - $shared;

print "Files/zones loaded: $loaded\n";
print "_juxtapose() calls: $calls\n";
print "         Refs seen: $seen\n";
print "       Refs shared: $shared\n";
print "   Refs eliminated: $zapped\n";

my %rc;
foreach (values %DateTime::TimeZone::juxta_data_registry) {
  ++$rc{ref $_};
}
print "\nbreakdown by type of ref:\n";
foreach (sort { $rc{$b} <=> $rc{$a} } keys %rc) {
  print "  $_ => $rc{$_} (reduced from 
$DateTime::TimeZone::juxta_type_count{$_})\n";
}

patch: DateTime::TimeZone PP redundant structures

Reply via email to