Re: [Bacula-users] Finding largest file(s) backed up for a specific job

Geert Stappers Tue, 05 Apr 2011 00:03:11 -0700

Op 20110401 om 04:30 schreef John Stoffel:
> 
> John> I've been happily running bacula at home, and usually it's pretty
> John> predictable about the size of data backed up each night, but
> John> last night I had an incremental run for a specific client which
> John> used 8Gb of data, when I normally expect around 500mb or so.
> 
> John> Is there an easy mysql query I can use to:
> 
> John> a) find the largest file(s) backed up for a particular jobid?
> 
> Ok, I've googled, read back in the archives about the File.LStat field
> and how the file size is in a strange Base64 (almost) encoding which
> needs to be hacked on to get out the sizes.
> 
> Looks like I'll just need to:
> 
> 1. write a perl script to read in a jobid, then pull out the files
> from that job.
> 2. For each file, pull out the File.LStat field and decode the info.
> 3. And finally then I can search for the largest file(s).  
> 
> I'll post something when I'm done.  I'd prefer to do it in mysql
> directly, but it looks too funky to parse the string properly without
> writing really crazy (to me!) SQL procedures or cursors.  Not worth
> it.


On March 15th, Kjetil Torgrim Homme posted his "bacula-du" script
to this mailing list. The 'du' in the name stands for "disk usage",
the same as the UNIX command `du`. The subject line was "file listing".

Find attached my modified version plus a diff.


Hope this helps
Geert Stappers
Sysadmin

#! /usr/bin/perl -w

# bacula-du 1.0
# Written by Kjetil Torgrim Homme <kjetil.ho...@redpill-linpro.com>
# Released under GPLv3 or the same terms as Bacula itself

# Print the command-line help text to standard output and terminate the
# program with exit status 64.  This sub never returns to its caller.
sub usage {
    # $0 is interpolated so the text shows the name the script was run as.
    my $help_text = <<"_END_";
Usage: $0 [OPTIONS] -j JOBID
Summarize disk usage of directories included in the backup JOBID

Options are:
  -a, --all             write counts for all files, not just directories
  -b, --bytes           use size in octets rather than number of blocks
  -B, --block-size=SIZE report SIZE-byte blocks (default 1Ki)
  -m                    like --block-size=1Mi
  -S, --separate-dirs   do not include size of subdirectories
  -t, --threshold=SIZE  skip output for files or directories with usage
                        below SIZE
  -L, --largest=NUM     only print NUM largest directories/files

SIZE may be (or may be an integer optionally followed by) one of following:
k (1000), Ki (1024), M (1000*1000), Mi (1024*1024), G, Gi, T, Ti, P, Pi.
_END_
    print $help_text;
    exit(64);
}

use strict;
use DBD::mysql;
use DBI;
use MIME::Base64;
use Getopt::Long qw(:config bundling no_ignore_case);
use Data::Dumper;

# Database connection settings.
# The DSN names only the database and passes the option
# mysql_read_default_group=clientp to the MySQL driver; no host is part
# of the DSN.  $dbhost is only used in the connection-failure message.
# The database name doubles as the user name, and no password is given.
my ($dbhost, $db) = ("localhost", "bacula");
my $dsn = "DBI:mysql:database=" . $db . ";mysql_read_default_group=clientp";
my ($dbuser, $dbpass) = ($db, undef);

#######################

# Lookup table: each digit of the 64-character alphabet mapped to its
# numeric value ("A" => 0 ... "/" => 63).
my $i = 0;
my %base64 = map { $_ => $i++ } split("",
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");

# Decode a number written in Bacula's base-64 notation.
#
# This is not MIME base64: the string is a plain radix-64 number, most
# significant digit first, without padding.  Every character shifts the
# accumulator left by six bits and adds its digit value.
#
# Argument: the encoded string.
# Returns:  the decoded integer.
#   - undef or "" decodes to 0 (without an "uninitialized" warning).
#   - A leading "-" marks a negative number, as in Bacula's own
#     from_base64().
#   - A character outside the alphabet counts as digit 0; that is the
#     value the previous code ended up with, but it no longer warns.
sub decode_bacula_base64 {
    my ($encoded) = @_;
    return 0 unless defined $encoded;

    # Strip the sign (if any) from our private copy of the argument.
    my $negative = ($encoded =~ s/^-//) ? 1 : 0;

    my $acc = 0;
    for my $digit (split("", $encoded)) {
        $acc <<= 6;
        $acc += exists $base64{$digit} ? $base64{$digit} : 0;
    }
    return $negative ? -$acc : $acc;
}

# Return the size recorded in an LStat string.
#
# The string is split on whitespace and the field at index 7 (the
# eighth field) is decoded with decode_bacula_base64().
# Presumably this is st_size in Bacula's LStat layout -- verify against
# the Bacula catalog documentation.
sub extract_size_from_lstat {
    my ($lstat) = @_;
    my @fields = split(" ", $lstat);
    return decode_bacula_base64($fields[7]);
}

# Return the allocated size recorded in an LStat string.
#
# The field at index 9 of the whitespace-separated string is decoded
# with decode_bacula_base64() and multiplied by 512.
# Presumably that field is st_blocks, counted in 512-octet units, so
# the result is in octets -- verify against the Bacula documentation.
sub extract_blocks_from_lstat {
    my ($lstat) = @_;
    my $block_count = decode_bacula_base64((split(" ", $lstat))[9]);
    return 512 * $block_count;
}

# Convert a SIZE argument into a plain number.
#
# Accepted forms:
#   - a bare integer ("4096"), returned unchanged;
#   - a unit on its own ("Mi"), meaning one of that unit;
#   - an integer followed by a unit ("10k", "2Gi"), optionally with a
#     trailing "B" ("3MiB").
# Units without "i" are powers of 1000, units with "i" powers of 1024.
# "k" and "K" are interchangeable.
#
# Returns the size as a number; dies with "Can't parse: ..." on
# anything else (including an undefined argument).
sub convert_units {
    my $num = shift;
    $num = "" unless defined $num;   # falls through to the parse error

    # Every suffix the pattern below can capture must have an entry
    # here; "K" used to be accepted by the pattern but was missing.
    my %units = ("k" => 1000**1, "K" => 1000**1,
                 "Ki" => 1024**1, "ki" => 1024**1,
                 "M" => 1000**2, "Mi" => 1024**2,
                 "G" => 1000**3, "Gi" => 1024**3,
                 "T" => 1000**4, "Ti" => 1024**4,
                 "P" => 1000**5, "Pi" => 1024**5);

    if ($num =~ /^(\d*)([kKMGTP]i?)B?$/) {
        my ($count, $suffix) = ($1, $2);
        # Test the length, not the truth, of the count: an explicit
        # "0" must stay 0 instead of being taken for "no count given".
        $num = (length($count) ? $count : 1) * $units{$suffix};
    } elsif ($num !~ /^\d+$/) {
        die "Can't parse: $num\n";
    }
    return $num;
}

### main program resumes

# Option defaults.  $threshold and $blocksize may be given with unit
# suffixes on the command line; they are converted to plain numbers
# after parsing.
my $threshold = 1; # omit 0 octet sized files/directories by default
my $blocksize = 1024;
my ($jobid, $all, $bytes, $separate_dirs, $largest);

# An unknown option or a malformed value makes GetOptions return false,
# in which case usage() prints the help text and exits.
GetOptions("jobid|j=i" => \$jobid,
           "threshold|t=s" => \$threshold,
           "separate-dirs|S" => \$separate_dirs,
           "all|a" => \$all,
           "bytes|b" => \$bytes,
           "block-size|B=s" => \$blocksize,
           "largest|L=i" => \$largest,
           "m" => sub { $blocksize = "1Mi" },
    ) || usage();

# A job id is mandatory.
usage() unless $jobid;

$threshold = convert_units($threshold);
$blocksize = convert_units($blocksize);

# Every total in the report is divided by $blocksize, so a block size
# of zero would abort the run with a division by zero only after the
# whole catalog query has finished.  Reject it up front.
die "Block size must be greater than zero\n" unless $blocksize > 0;

# "=i" also accepts negative integers; a negative count would make the
# --largest cutoff index the sorted sizes from the wrong end.
die "--largest must not be negative\n"
    if defined $largest && $largest < 0;


# Suffix appended to a field before handing it to MIME::Base64, indexed
# by the field's length modulo 4, so the decoder always receives a
# string whose length is a multiple of four.
my @padding = ("", "A==", "==", "=");

# Alternative size extractor built on MIME::Base64::decode_base64().
#
# Takes the field at index 7 of the whitespace-separated LStat string,
# pads it using @padding, decodes it as MIME base64 and reads the
# resulting octets as one big-endian integer.
#
# NOTE(review): nothing in this script calls this sub.  For fields
# whose length is not a multiple of four it does not agree with
# decode_bacula_base64() (the field "B" gives 4 here but 1 there), so
# it looks like an abandoned experiment -- confirm before using it.
sub extract_size_from_lstat_foo {
    my ($lstat) = @_;
    my $b64 = (split(" ", $lstat))[7];

    my $raw = decode_base64($b64 . $padding[length($b64) % 4]);

    my $acc = 0;
    for my $octet (unpack("C*", $raw)) {
        $acc = ($acc << 8) + $octet;
    }
    return $acc;
}

# Pick the per-file measurement once, before any rows are read:
# with --bytes the size field is used, otherwise the block-based one.
my $extract_size;
if ($bytes) {
    $extract_size = \&extract_size_from_lstat;
} else {
    $extract_size = \&extract_blocks_from_lstat;
}

# Open the catalog database; give up with exit status 2 when the
# connection cannot be made.
my $dbh;
unless ($dbh = DBI->connect($dsn, $dbuser, $dbpass, {AutoCommit => 0})) {
    print STDERR "Could not connect to database $db on host $dbhost\n";
    exit 2;
}

print STDERR "DB connect \n";

# Fetch one row per file of the job: directory path, file name and the
# encoded LStat string.  The job id is passed as a bind value instead
# of being pasted into the SQL text.
my $sth = $dbh->prepare("
   SELECT p.Path, fn.Name, LStat
   FROM Path p
     JOIN File f ON f.PathId = p.PathId
     JOIN Filename fn ON f.FilenameId = fn.FilenameId
   WHERE f.JobId = ?");

# prepare() and execute() return false on failure; stop here rather
# than carrying on with an unusable statement handle.
unless ($sth && $sth->execute($jobid)) {
    my $reason = defined $DBI::errstr ? $DBI::errstr : "unknown error";
    print STDERR "Could not query the files of job $jobid: $reason\n";
    $dbh->disconnect();
    exit 2;
}

    print STDERR "DB prepare \n";
my %du;
my $rowcount = 0;
while (my ($path, $fname, $lstat) = $sth->fetchrow_array) {
    my $size = $extract_size->($lstat);
    print STDERR "Got '$path' size $size\n";
    $du{"$path$fname"} += $size if $all;
    $du{$path} += $size;
    next if $separate_dirs;
    while ($path ne '/') {
        $path =~ s,[^/]+/$,,;
        $du{$path} += $size;
    }
    if ((++$rowcount % 1000) == 0) {
        print STDERR "got $rowcount rows\r";
    }
}
$dbh->disconnect();
print STDERR "done reading database.\n";

# --largest: raise the threshold to the size of the NUM-th largest
# entry so that only the biggest entries pass the report filter.  When
# there are no more than NUM entries the cutoff is 0.  A threshold
# already above the cutoff is left alone.
if ($largest) {
    my @ascending = sort { $a <=> $b } values %du;

    my $cutoff = 0;
    $cutoff = $ascending[-$largest] if $largest < @ascending;

    if (!$threshold || $threshold <= $cutoff) {
        $threshold = $cutoff;
    }
}

# Report phase.  "~" is appended to both names being compared so that a
# parent directory is listed below its children ("~" could be any
# character which sorts after "/").
my @report_order = sort { "$a~" cmp "$b~" } keys %du;
for my $entry (@report_order) {
    my $usage = $du{$entry};
    if ($usage >= $threshold) {
        # %d truncates, so adding $blocksize - 1 first rounds the
        # figure up to whole blocks.
        printf("%9d %s\n", ($usage + $blocksize - 1) / $blocksize, $entry);
    }
}

diff --git a/bacula/scripts/bacula-du b/bacula/scripts/bacula-du
old mode 100644
new mode 100755
index e32b191..0e165ce
--- a/bacula/scripts/bacula-du
+++ b/bacula/scripts/bacula-du
@@ -34,13 +34,9 @@ use Data::Dumper;
 
 my $dbhost = "localhost";
 my $db = "bacula";
-my $dsn = "DBI:Pg:dbname=$db;host=$dbhost";
-my $dbuser = "postgres";
-my $dbpass = "";
-# Suggestion for MySQL:
-# my $dsn = "DBI:mysql:database=mysql;mysql_read_default_group=clientp";
-# my $dbuser = "mysql";
-# my $dbpass = undef;
+my $dsn = "DBI:mysql:database=$db;mysql_read_default_group=clientp";
+my $dbuser = $db;
+my $dbpass = undef;
 
 #######################
 
@@ -126,6 +122,7 @@ unless ($dbh = DBI->connect($dsn, $dbuser, $dbpass, {AutoCommit => 0})) {
     exit 2;
 }
 
+    print STDERR "DB connect \n";
 my $sth = $dbh->prepare("
    SELECT p.Path, fn.Name, LStat
    FROM Path p
@@ -134,11 +131,12 @@ my $sth = $dbh->prepare("
    WHERE f.JobId = $jobid");
 $sth->execute();
 
+    print STDERR "DB prepare \n";
 my %du;
 my $rowcount = 0;
 while (my ($path, $fname, $lstat) = $sth->fetchrow_array) {
     my $size = $extract_size->($lstat);
-    # print STDERR "Got '$path' size $size\n";
+    print STDERR "Got '$path' size $size\n";
     $du{"$path$fname"} += $size if $all;
     $du{$path} += $size;
     next if $separate_dirs;

------------------------------------------------------------------------------
Xperia(TM) PLAY
It's a major breakthrough. An authentic gaming
smartphone on the nation's most reliable network.
And it wants your games.
http://p.sf.net/sfu/verizon-sfdev

_______________________________________________
Bacula-users mailing list
Bacula-users@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/bacula-users

Re: [Bacula-users] Finding largest file(s) backed up for a specific job

Reply via email to