#!/usr/bin/perl
use strict;
use warnings;
use B2B::BGZF::Reader;
use Digest::MD5;
use Digest::Perl::MD5;
my ($fn_compressed, $fn_uncompressed) = @ARGV;
# open filehandle to compressed data
my $fh_c = B2B::BGZF::Reader->new_filehandle( $fn_compressed )
or die "failed to open compressed file";
# open filehandle to uncompressed data
open my $fh_u, '<:raw', $fn_uncompressed
or die "failed to open uncompressed file";
# compare pure-Perl vs XS behavior
for my $class (qw/ Digest::Perl::MD5 Digest::MD5 /) {
seek $fh_c, 0, 0;
seek $fh_u, 0, 0;
my $md5_c = $class->new()->addfile($fh_c)->hexdigest;
my $md5_u = $class->new()->addfile($fh_u)->hexdigest;
print "Compressed v Uncompressed ($class):\n";
print "\t$md5_c\n";
print "\t$md5_u\n";
}
####
package B2B::BGZF::Reader;
use v5.10.1;
use strict;
use warnings;
use Carp;
use Compress::Zlib;
use IO::Uncompress::RawInflate qw/rawinflate $RawInflateError/;
use constant BGZF_MAGIC => pack "H*", '1f8b0804';
use constant HEAD_BYTES => 12; # not including extra fields
use constant FOOT_BYTES => 8;
## no critic
# allow for filehandle tie'ing
sub TIEHANDLE { B2B::BGZF::Reader::new(@_) }
sub READ { B2B::BGZF::Reader::_read(@_) }
sub READLINE { B2B::BGZF::Reader::getline(@_) }
sub SEEK { B2B::BGZF::Reader::_seek(@_) }
sub CLOSE { close $_[0]->{fh} }
sub TELL { return $_[0]->{u_offset} }
sub EOF { return $_[0]->{buffer_len} == -1 }
# accessors
sub usize { return $_[0]->{u_file_size} };
## use critic
sub new_filehandle {
#-------------------------------------------------------------------------
# ARG 0 : BGZF input filename
#-------------------------------------------------------------------------
# RET 0 : Filehandle GLOB (internally an IO::File object tied to
# B2B::BGZF::Reader)
#-------------------------------------------------------------------------
my ($class, $fn_in) = @_;
croak "input filename required" if (! defined $fn_in);
open my $fh, '<', undef;
tie *$fh, $class, $fn_in or croak "failed to tie filehandle";
return $fh;
}
sub new {
#-------------------------------------------------------------------------
# ARG 0 : BGZF input filename
#-------------------------------------------------------------------------
# RET 0 : B2B::BGZF::Reader object
#-------------------------------------------------------------------------
my ($class, $fn_in) = @_;
my $self = bless {}, $class;
# initialize
$self->{fn_in} = $fn_in
or croak "Input name required";
# open compressed file in binary mode
open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic
$self->{fh} = $fh;
# these member variables allow for data extraction
$self->{buffer} = ''; # contents of currently uncompressed block
$self->{buffer_len} = 0; # save time on frequent length() calls
$self->{block_offset} = 0; # offset of current block in compressed data
$self->{buffer_offset} = 0; # offset of current pos in uncompressed block
$self->{block_size} = 0; # compressed size of current block
$self->{file_size} = -s $fn_in; # size of compressed file
# these variables are tracked to allow for full seek implementation
$self->{u_offset} = 0; # calculated current uncompressed offset
$self->{u_file_size} = 0; # size of uncompressed file (filled during
# indexing
$self->_generate_index(); # on-the-fly
# load initial block
$self->_load_block();
return $self;
}
sub _load_block {
#-------------------------------------------------------------------------
# ARG 0 : offset of block start in compressed file
#-------------------------------------------------------------------------
# no returns
#-------------------------------------------------------------------------
my ($self, $block_offset) = @_;
# loading a block should always reset buffer offset
$self->{buffer_offset} = 0;
# avoid reload of current block
return if (defined $block_offset
&& $block_offset == $self->{block_offset});
# if no offset given (e.g. sequential reads), move to next block
if (! defined $block_offset) {
$block_offset = $self->{block_offset} + $self->{block_size};
}
$self->{block_offset} = $block_offset;
# deal with EOF
croak "Read past file end (perhaps corrupted/truncated input?)"
if ($self->{block_offset} > $self->{file_size});
if ($self->{block_offset} == $self->{file_size}) {
$self->{buffer} = '';
$self->{buffer_len} = -1;
return;
}
# never assume we're already there
sysseek $self->{fh}, $self->{block_offset}, 0;
# parse block according to GZIP spec, including content inflation
my ($block_size, $uncompressed_size, $content)
= $self->_unpack_block(1);
$self->{block_size} = $block_size;
$self->{buffer_len} = $uncompressed_size;
$self->{buffer} = $content;
return;
}
sub _unpack_block {
#-------------------------------------------------------------------------
# ARG 0 : bool indicating whether to inflate (and return) actual payload
#-------------------------------------------------------------------------
# RET 0 : compressed block size
# RET 1 : uncompressed content size
# RET 2 : content (if ARG 0)
#-------------------------------------------------------------------------
my ($self, $do_unpack) = @_;
my @return_values;
my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
_safe_sysread($self->{fh}, HEAD_BYTES);
my $t = sysseek $self->{fh}, 0, 1;
croak "invalid header at $t (corrupt file or not BGZF?)"
if ($magic ne BGZF_MAGIC);
# extract stated block size according to BGZF spec
my $block_size;
my $l = 0;
while ($l < $len_extra) {
my ($field_id, $field_len) = unpack 'A2v',
_safe_sysread($self->{fh}, 4);
if ($field_id eq 'BC') {
croak "invalid BC length" if ($field_len != 2);
croak "multiple BC fields" if (defined $block_size);
$block_size = unpack 'v',
_safe_sysread($self->{fh} => $field_len);
$block_size += 1; # convert to 1-based
}
$l += 4 + $field_len;
}
croak "invalid extra field length" if ($l != $len_extra);
croak "failed to read block size" if (! defined $block_size);
push @return_values, $block_size;
my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra;
my $content;
if ($do_unpack) {
# decode actual content
my $payload = _safe_sysread($self->{fh}, $payload_len);
rawinflate( \$payload => \$content )
or croak "Error inflating: $RawInflateError\n";
my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
croak "content CRC32 mismatch" if ( $crc_given != crc32($content) );
}
else { sysseek $self->{fh}, $payload_len + 4, 1; }
# unpack and possibly check uncompressed payload size
my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
croak "content length mismatch"
if ( defined $content && $size_given != length($content) );
push @return_values, $size_given;
push @return_values, $content if (defined $content);
return @return_values;
}
sub _read {
#-------------------------------------------------------------------------
# ARG 0 : buffer to write to
# ARG 1 : bytes to attempt to read
# ARG 3 : (optional) offset in buffer to start write (default: 0)
#-------------------------------------------------------------------------
# RET 0 : bytes read (0 at EOF, undef on error)
#-------------------------------------------------------------------------
# we try to emulate the built-in 'read' call, so we will
# overwrite $_[1] and return the number of bytes read. To facilitate this,
# make $buf a reference to the buffer passed
my $self = shift;
my $buf = \shift; # must be a reference !!
my $bytes = shift;
my $offset = shift;
# handle cases when $offset is passed in
my $prefix = '';
if (defined $offset && $offset != 0) {
$prefix = substr $$buf, 0, $offset;
$prefix .= "\0" x ( $offset - length($$buf) )
if ( $offset > length($$buf) );
}
$$buf = ''; # reset (only AFTER grabbing any prefix above)
ITER:
while (length($$buf) < $bytes) {
my $l = length($$buf);
my $remaining = $bytes - $l;
# if read is within current block
if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) {
$$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining;
$self->{buffer_offset} += $remaining;
$self->_load_block()
if ($self->{buffer_offset} == $self->{buffer_len});
}
else {
last ITER if ($self->{buffer_len} < 0); #EOF
$$buf .= substr $self->{buffer}, $self->{buffer_offset};
$self->_load_block();
}
}
$$buf = $prefix . $$buf;
my $l = length($$buf);
$self->{u_offset} += $l;
return $l;
}
sub getline {
#-------------------------------------------------------------------------
# No arguments
#-------------------------------------------------------------------------
# RET 0 : string read (undef at EOF)
#-------------------------------------------------------------------------
my ($self) = @_;
my $data;
while (1) {
# return immediately if EOF
return $data if ($self->{buffer_len} < 0);
# search current block for record separator
# start searching from the current buffer offset
pos($self->{buffer}) = $self->{buffer_offset};
if ($self->{buffer} =~ m|$/|g) {
my $pos = pos $self->{buffer};
$data .= substr $self->{buffer}, $self->{buffer_offset},
$pos - $self->{buffer_offset};
$self->{buffer_offset} = $pos;
# if we're at the end of the block, load next
$self->_load_block if ($pos == $self->{buffer_len});
$self->{u_offset} += length($data);
return $data;
}
# otherwise, add rest of block to data and load next block
$data .= substr $self->{buffer}, $self->{buffer_offset};
$self->_load_block;
}
return;
}
sub _generate_index {
#-------------------------------------------------------------------------
# No arguments
#-------------------------------------------------------------------------
# No returns
#-------------------------------------------------------------------------
my ($self) = @_;
my $uncmp_offset = 0;
my $cmp_offset = 0;
my $i = 0;
$self->{u_file_size} = 0;
$self->{idx} = [];
$self->{ridx} = {};
sysseek $self->{fh}, 0, 0;
while ($cmp_offset < $self->{file_size}) {
push @{$self->{idx}}, [$cmp_offset, $uncmp_offset];
$self->{ridx}->{$cmp_offset} = $uncmp_offset;
my ($block_size, $uncompressed_size) = $self->_unpack_block(0);
$cmp_offset += $block_size;
$uncmp_offset += $uncompressed_size;
$self->{u_file_size} += $uncompressed_size;
}
sysseek $self->{fh}, $self->{block_offset}, 0;
return;
}
sub _seek {
#-------------------------------------------------------------------------
# ARG 0 : byte offset to which to seek
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
#-------------------------------------------------------------------------
# no returns
#-------------------------------------------------------------------------
my ($self, $pos, $whence) = @_;
$pos += $self->{u_offset} if ($whence == 1);
$pos = $self->{u_file_size} + $pos if ($whence == 2);
return if ($pos < 0 || $pos >= $self->{u_file_size});
# Do seeded search for nearest block start >= $pos
# (although we don't know the size of each block, we can determine the
# mean length and usually choose a starting value close to the actual -
# benchmarks much faster than binary search)
# TODO: benchmark whether breaking this out and Memoizing speeds things up
my $s = scalar( @{$self->{idx}} );
my $idx = int($pos/($self->{u_file_size}) * $s);
while (1) {
if ($pos < $self->{idx}->[$idx]->[1]) {
--$idx;
next;
}
if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
++$idx;
next;
}
last;
}
my $block_o = $self->{idx}->[$idx]->[0];
my $block_o_u = $self->{idx}->[$idx]->[1];
my $buff_o = $pos - $block_o_u;
$self->_load_block( $block_o );
$self->{buffer_offset} = $buff_o;
$self->{u_offset} = $block_o_u + $buff_o;
return 1;
}
sub _safe_sysread {
# sysread wrapper that checks return count and returns read
# (internally we should never read off end of file - doing so indicates
# either a software bug or a corrupt input file so we croak)
#-------------------------------------------------------------------------
# ARG 0 : bytes to read
#-------------------------------------------------------------------------
# RET 0 : string read
#-------------------------------------------------------------------------
my ($fh, $len) = @_;
my $buf = '';
my $r = sysread $fh, $buf, $len;
croak "returned unexpected byte count" if ($r != $len);
return $buf;
}
1;
####
H4sIBAAAAAAA/wYAQkMCAKsBXZJLbtxADET3PgVPoDsYQXb5ADGcPafFGRPo3zRJI8cP2S1P5CwE
SS2qWPXIX9RtZ6w7Euwtt8EXE3hv2bqiUgEaow3AlEywKluBq90QOg3pnBiVZYOvCldMnFmg8htn
oMpl6VG5G23wxeIOhavLC+1AfxJ1tcFHh1PLnjER6vb09HPnBpUksVWFajkj3A2942/KSx9Q3EtF
lwISL2p1g+94qwyDpumI5j/RQIW0bPTmPQ9/Am+cgE20bfBsCixKU0robuwuXuIOcS37qdX4tFo+
YtSQxuJ+xAp51wnKJR/v7vNKwx8dU0GevacHK+cUgx4Hq+biMDwDZb7Fzxu8OocCXM/QtKmfhcnJ
fpAPZ7VKbXjeGNMP1xdAi/rHvILXN7xMF+9OlW0cqxA17vdIPc9nF890WofirCO280LqtKgBuolj
tE7wVaHj4CPefxsX02kOkkZiz8Ktur1kueMn0jsnxY/RtlLd+z/4Z7QHrrWIwlU38D3STysboGSO
aylhwI5kL7FoTsi3LhblOI9y3P4CBU6J0CwDAAAfiwgEAAAAAAD/BgBCQwIAGwADAAAAAAAAAAAA