--- Begin Message ---
Package: pristine-tar
Version: 1.24
examples:
Many (> 100) kde 4.8.2 packages including:
https://launchpad.net/ubuntu/precise/+source/marble/4:4.8.2-0ubuntu1/+files/marble_4.8.2.orig.tar.xz
https://launchpad.net/ubuntu/precise/+source/kdesdk/4:4.8.2-0ubuntu1/+files/kdesdk_4.8.2.orig.tar.xz
https://launchpad.net/ubuntu/precise/+source/kde-workspace/4:4.8.2-0ubuntu1/+files/kde-workspace_4.8.2.orig.tar.xz
The same multiple blocks files can be found at
ftp://ftp.kde.org/pub/kde/stable/4.8.4/src/ (examples includes kdesk,
kalzium, oxygen-icons, etc).
The following script is a first shot at parsing the .xz files and
extracting the relevant parameters (compression level, check and
blocks).
It may be used to enhance the xz file recognition but doesn't provide a
solution to rebuild multiple blocks xz files.
#!/usr/bin/perl
# Dotted numbers in comments refer to http://tukaani.org/xz/xz-file-format.txt
use warnings;
use strict;
use constant DEBUG => 1 ;
use Fcntl qw{:seek} ;
use Data::Dumper ;
sub checked_read {
# $buf should be a reference so we can still use 'read' and its magic
# trick
my ($in, $buf, $nb_to_read, $msg) = @_ ;
my $nb_read = read($in, $$buf, $nb_to_read) ;
die $msg if ($nb_read != $nb_to_read) ;
}
# see 1.2 with slight adaptation since we read from a file handle instead of
# using a char buffer
sub read_multibyte_integer {
my ($in, $size_max) = @_ ;
return undef, 0 if ($size_max == 0) ;
if ($size_max > 9) {
$size_max = 9 ;
}
my $buf ;
my $i = 1 ;
checked_read($in, \$buf, 1, "Corrupted xz file\n") ;
$buf = unpack('C', $buf) ;
warn sprintf("buf: %d / %d", $buf, $buf & 0x7f) if (DEBUG >= 9) ;
my $num = $buf & 0x7f ;
while ($buf & 0x80) {
return (undef, 0) if ($i >= $size_max) ;
checked_read($in, \$buf, 1, "Corrupted xz file\n") ;
$buf = unpack('C', $buf) ;
warn sprintf("buf: %d / %d", $buf, $buf & 0x7f) if (DEBUG >= 9) ;
$i++ ;
return (undef, 0) if ($buf == 0x00) ;
$num |= ($buf & 0x7f) << (($i - 1) * 7) ;
}
return ($num, $i) ;
}
sub decode_check_byte {
my ($byte) = @_ ;
my $check_key ;
# We use the xz --check values as values below, the Ids are used as keys.
my %known_check_of =
(
0x00 => 'none',
0x01 => 'crc32',
0x04 => 'crc64',
0x0A => 'sha256',
) ;
$check_key = $known_check_of{$byte} ;
if (!defined($check_key)) {
die sprintf("[%02X] is an unknown xz check\n", $byte) ;
}
return $check_key ;
}
sub decode_stream_flags {
my @flags = @_ ;
die "Unknown flags used in first format byte\n" if ($flags[0] != 0) ;
my $check = decode_check_byte($flags[1]) ;
return [$check] ;
}
sub parse_stream_header {
my ($in) = @_ ;
my ($buf) ;
checked_read($in, \$buf, 6, "Invalid xz file\n") ;
my $magic = unpack('H12', $buf) ;
die "Not an xz file\n" if ($magic ne 'fd377a585a00') ;
checked_read($in, \$buf, 2, "Corrupted xz file\n") ;
my @flags = unpack('CC', $buf) ;
my $flags = decode_stream_flags(@flags) ;
my $check = $flags->[0] ;
checked_read($in, \$buf, 4, "Corrupted xz file\n") ;
my $CRC32 = unpack('V', $buf) ;
return [$check, $CRC32] ;
}
my %lzma2_preset_from_dict_size_of =
(
0x00040000 => ['0'], # 256 KiB
0x00100000 => ['1'], # 1 MiB
0x00200000 => ['2'], # 2 MiB
0x00400000 => ['3', '4'], # 4 MiB
0x00800000 => ['5', '6'], # 8 MiB
0x01000000 => ['7'], # 16 MiB
0x02000000 => ['8'], # 32 MiB
0x04000000 => ['9'], # 64 MiB
) ;
sub decode_lzma2_props {
my ($props) = @_ ;
my $bits = unpack('C', $props) & 0x3f ;
# see 5.3.1
my $dict_size ;
if ($bits == 40) {
$dict_size = 0xFFFFFFFF ;
} else {
$dict_size = 2 | ($bits & 1) ;
$dict_size <<= $bits / 2 + 11 ;
}
my $presets = $lzma2_preset_from_dict_size_of{$dict_size} ;
die "Unkown dict size: $dict_size\n" if (!defined($presets)) ;
return {id => 'lzma2', presets => $presets} ;
}
my %props_decoder_of =
(
0x21 => \&decode_lzma2_props,
) ;
sub decode_filter_props {
my ($id, $props) = @_ ;
my $decoder = $props_decoder_of{$id} ;
die "$id is not a known filter\n" if (!defined($decoder)) ;
return &$decoder($props) ;
}
sub parse_block_header {
my ($in, $check) = @_ ;
my ($buf) ;
checked_read($in, \$buf, 1, "Corrupted xz file\n") ;
my $size = unpack('C', $buf) ;
$size = ($size + 1) * 4 ; # see 3.1.1
checked_read($in, \$buf, 1, "Corrupted xz file\n") ;
my $flags = unpack('C', $buf) ;
my $nb_filters = ($flags & 0x03) + 1 ; # 2 bits used to encode 1-4 values
my $must_be_zero = $flags & 0x3C ;
die "Reversed block flags non-zero\n" if ($must_be_zero != 0) ;
my $compressed_size_present = $flags & 0x40 ;
die "Not implemented\n" if ($compressed_size_present != 0) ;
my $uncompressed_size_present = $flags & 0x80 ;
die "Not implemented\n" if ($uncompressed_size_present != 0) ;
warn "flags: ".Dumper([$nb_filters, $must_be_zero,
$compressed_size_present,
$uncompressed_size_present])."\n"
if (DEBUG >= 2) ;
my $filters = [] ;
my $remaining_filters = $nb_filters ;
while ($remaining_filters)
{
my $filter ;
my ($lg, $id, $prop_size) ;
($id, $lg) = read_multibyte_integer($in, 9) ;
warn "Id: $id, lg: $lg\n" if (DEBUG >= 2) ;
($prop_size, $lg) = read_multibyte_integer($in, 9) ;
warn "Size: $prop_size, lg: $lg\n" if (DEBUG >= 2) ;
my $props ;
checked_read($in, \$props, $prop_size, "Corrupted xz file\n") ;
warn "Props: $props\n" if (DEBUG >= 2) ;
my $decoded_props = decode_filter_props($id, $props) ;
push @$filters, [$id, $decoded_props] ;
$remaining_filters-- ;
}
return [$size, $filters] ;
}
sub parse_indices {
my ($in, $backward_size) = @_ ;
seek($in, - $backward_size, SEEK_CUR) or die "Cannot seek to index - $!\n";
my ($buf) ;
checked_read($in, \$buf, 1, "Corrupted xz file\n") ;
my $indicator = unpack('C', $buf) ;
die "No index indicator ($buf)\n" if ($indicator != 0x00) ;
my ($lg, $nb_records) ;
($nb_records, $lg) = read_multibyte_integer($in, 9) ;
warn "nb records: $nb_records\n" if (DEBUG >= 3) ;
my $total_lg = $lg ;
my $records = [] ;
my $remaining_records = $nb_records ;
while ($remaining_records)
{
my $record ;
my ($unpadded, $uncompressed) ;
($unpadded, $lg) = read_multibyte_integer($in, 9) ;
warn "unpadded: $unpadded ($lg)\n" if (DEBUG >= 4) ;
$total_lg += $lg ;
($uncompressed, $lg) = read_multibyte_integer($in, 9) ;
warn "uncompressed: $uncompressed: ($lg)\n" if (DEBUG >= 4) ;
$total_lg += $lg ;
push @$records, [$unpadded, $uncompressed] ;
$remaining_records-- ;
}
checked_read($in, \$buf, 4, "Corrupted xz file\n") ;
my $CRC32 = unpack('V', $buf) ;
return [$CRC32, $records] ;
}
sub parse_stream_footer {
my ($in) = @_ ;
seek($in, 0, SEEK_END) or die "Cannot seek to footer- $!\n";
my $file_size = tell($in) ;
if ($file_size == -1 || $file_size % 4 != 0) {
die "Corrupted or invalid xz file\n"
}
# CRC32: 4, backward size: 4, stream flags: 2, magic bytes: 2
my $footer_size = 4 + 4 + 2 + 2 ;
seek($in, - $footer_size, SEEK_END) ;
my $buf ;
checked_read($in, \$buf, $footer_size, "Corrupted xz file\n") ;
my ($CRC32, $backward_size, $flag1, $flag2,
$magic) = unpack('VVC2H4', $buf) ;
$backward_size = ($backward_size + 1) * 4 ; # see 2.1.2.2
my $flags = decode_stream_flags($flag1, $flag2) ;
my $check = $flags->[0] ;
die "Not an xz file\n" if ($magic ne '595a') ; # aka 'YZ' see 2.1.2.4
warn "CRC32_f: $CRC32, backward size: $backward_size, check: $check\n"
if (DEBUG >= 2) ;
# seek back to the beginning of the footer
seek($in, - $footer_size, SEEK_END) ;
return [$CRC32, $backward_size, $check] ;
}
sub parse_xz {
my ($fname) = (shift) ;
open (my $in, "<", $fname) || die "$fname: $!";
warn "Parsing $fname" if (DEBUG >= 1) ;
my ($fhdr, $shdr, $bhdr, $sftr, $index);
$fhdr = parse_stream_header($in) ;
my ($check, $CRC32_h) = @$fhdr ;
warn sprintf("check: %s, CRC32_h: %X\n", $check, $CRC32_h) if (DEBUG >= 2) ;
$bhdr = parse_block_header($in, $check) ;
warn "bhdr:".Dumper($bhdr)."\n" if (DEBUG >= 2) ;
$sftr = parse_stream_footer($in, $check) ;
warn "sftr:".Dumper($sftr)."\n" if (DEBUG >= 2) ;
my ($CRC32_f, $backward_size, $check_f) = @$sftr ;
warn sprintf("check: %s, CRC32_f: %X\n", $check, $CRC32_f) if (DEBUG >= 2) ;
$index = parse_indices($in, $backward_size) ;
my ($CRC32_i, $indices) = @$index ;
warn sprintf("CRC32_i: %08X\n", $CRC32_i) if (DEBUG >= 2) ;
warn "indices:".Dumper($indices)."\n" if (DEBUG >= 2) ;
my $filters = $bhdr->[1] ;
warn "filters:".Dumper($filters)."\n" if (DEBUG >= 2) ;
die "Can't decode more than one filter\n" if ($#$filters > 0) ;
my $filter = $filters->[0]->[1] ;
warn "filter:".Dumper($filter)."\n" if (DEBUG >= 2) ;
printf("Id: %s, check: %s, presets: %s\n", $filter->{id}, $check,
join('|', @{$filter->{presets}})) ;
my @input_block_sizes = map {$_->[1]} @$indices ;
printf("Block(s): %s\n", join(", ", @input_block_sizes)) ;
close $in;
}
sub main {
if ($#ARGV != 0) {
die "A single file name is expected\n" ;
}
parse_xz($ARGV[0]) ;
}
main() ;
--- End Message ---