Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

Re^2: Digest::MD5 addfile() w/ tied filehandle

by jdv (Sexton)
on Aug 30, 2015 at 20:03 UTC ( [id://1140465]=note: print w/replies, xml ) Need Help??


in reply to Re: Digest::MD5 addfile() w/ tied filehandle
in thread Digest::MD5 addfile() w/ tied filehandle

Here is a MWE:

#!/usr/bin/perl use strict; use warnings; use B2B::BGZF::Reader; use Digest::MD5; use Digest::Perl::MD5; my ($fn_compressed, $fn_uncompressed) = @ARGV; # open filehandle to compressed data my $fh_c = B2B::BGZF::Reader->new_filehandle( $fn_compressed ) or die "failed to open compressed file"; # open filehandle to uncompressed data open my $fh_u, '<:raw', $fn_uncompressed or die "failed to open uncompressed file"; # compare pure-Perl vs XS behavior for my $class (qw/ Digest::Perl::MD5 Digest::MD5 /) { seek $fh_c, 0, 0; seek $fh_u, 0, 0; my $md5_c = $class->new()->addfile($fh_c)->hexdigest; my $md5_u = $class->new()->addfile($fh_u)->hexdigest; print "Compressed v Uncompressed ($class):\n"; print "\t$md5_c\n"; print "\t$md5_u\n"; }

Below is a pared-down version of the package to use with the above. For the test files, I will include following that a Base64-encoded BGZF file (is there a better way to attach binary files?). Once decoded it can be used as the compressed file. Since BGZF is a backward-compatible variant of gzip, you can use gunzip to get the original uncompressed data to use for the example. Hopefully this provides a complete working example.

package B2B::BGZF::Reader; use v5.10.1; use strict; use warnings; use Carp; use Compress::Zlib; use IO::Uncompress::RawInflate qw/rawinflate $RawInflateError/; use constant BGZF_MAGIC => pack "H*", '1f8b0804'; use constant HEAD_BYTES => 12; # not including extra fields use constant FOOT_BYTES => 8; ## no critic # allow for filehandle tie'ing sub TIEHANDLE { B2B::BGZF::Reader::new(@_) } sub READ { B2B::BGZF::Reader::_read(@_) } sub READLINE { B2B::BGZF::Reader::getline(@_) } sub SEEK { B2B::BGZF::Reader::_seek(@_) } sub CLOSE { close $_[0]->{fh} } sub TELL { return $_[0]->{u_offset} } sub EOF { return $_[0]->{buffer_len} == -1 } # accessors sub usize { return $_[0]->{u_file_size} }; ## use critic sub new_filehandle { #----------------------------------------------------------------- +-------- # ARG 0 : BGZF input filename #----------------------------------------------------------------- +-------- # RET 0 : Filehandle GLOB (internally an IO::File object tied to # B2B::BGZF::Reader) #----------------------------------------------------------------- +-------- my ($class, $fn_in) = @_; croak "input filename required" if (! defined $fn_in); open my $fh, '<', undef; tie *$fh, $class, $fn_in or croak "failed to tie filehandle"; return $fh; } sub new { #----------------------------------------------------------------- +-------- # ARG 0 : BGZF input filename #----------------------------------------------------------------- +-------- # RET 0 : B2B::BGZF::Reader object #----------------------------------------------------------------- +-------- my ($class, $fn_in) = @_; my $self = bless {}, $class; # initialize $self->{fn_in} = $fn_in or croak "Input name required"; # open compressed file in binary mode open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; + ## no critic $self->{fh} = $fh; # these member variables allow for data extraction $self->{buffer} = ''; # contents of currently uncompressed +block $self->{buffer_len} = 0; # save time on frequent length() call +s $self->{block_offset} = 0; # offset of current block in compress +ed data $self->{buffer_offset} = 0; # offset of current pos in uncompress +ed block $self->{block_size} = 0; # compressed size of current block $self->{file_size} = -s $fn_in; # size of compressed file # these variables are tracked to allow for full seek implementatio +n $self->{u_offset} = 0; # calculated current uncompressed off +set $self->{u_file_size} = 0; # size of uncompressed file (filled d +uring # indexing $self->_generate_index(); # on-the-fly # load initial block $self->_load_block(); return $self; } sub _load_block { #----------------------------------------------------------------- +-------- # ARG 0 : offset of block start in compressed file #----------------------------------------------------------------- +-------- # no returns #----------------------------------------------------------------- +-------- my ($self, $block_offset) = @_; # loading a block should always reset buffer offset $self->{buffer_offset} = 0; # avoid reload of current block return if (defined $block_offset && $block_offset == $self->{block_offset}); # if no offset given (e.g. sequential reads), move to next block if (! defined $block_offset) { $block_offset = $self->{block_offset} + $self->{block_size}; } $self->{block_offset} = $block_offset; # deal with EOF croak "Read past file end (perhaps corrupted/truncated input?)" if ($self->{block_offset} > $self->{file_size}); if ($self->{block_offset} == $self->{file_size}) { $self->{buffer} = ''; $self->{buffer_len} = -1; return; } # never assume we're already there sysseek $self->{fh}, $self->{block_offset}, 0; # parse block according to GZIP spec, including content inflation my ($block_size, $uncompressed_size, $content) = $self->_unpack_block(1); $self->{block_size} = $block_size; $self->{buffer_len} = $uncompressed_size; $self->{buffer} = $content; return; } sub _unpack_block { #----------------------------------------------------------------- +-------- # ARG 0 : bool indicating whether to inflate (and return) actual p +ayload #----------------------------------------------------------------- +-------- # RET 0 : compressed block size # RET 1 : uncompressed content size # RET 2 : content (if ARG 0) #----------------------------------------------------------------- +-------- my ($self, $do_unpack) = @_; my @return_values; my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv', _safe_sysread($self->{fh}, HEAD_BYTES); my $t = sysseek $self->{fh}, 0, 1; croak "invalid header at $t (corrupt file or not BGZF?)" if ($magic ne BGZF_MAGIC); # extract stated block size according to BGZF spec my $block_size; my $l = 0; while ($l < $len_extra) { my ($field_id, $field_len) = unpack 'A2v', _safe_sysread($self->{fh}, 4); if ($field_id eq 'BC') { croak "invalid BC length" if ($field_len != 2); croak "multiple BC fields" if (defined $block_size); $block_size = unpack 'v', _safe_sysread($self->{fh} => $field_len); $block_size += 1; # convert to 1-based } $l += 4 + $field_len; } croak "invalid extra field length" if ($l != $len_extra); croak "failed to read block size" if (! defined $block_size); push @return_values, $block_size; my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_ext +ra; my $content; if ($do_unpack) { # decode actual content my $payload = _safe_sysread($self->{fh}, $payload_len); rawinflate( \$payload => \$content ) or croak "Error inflating: $RawInflateError\n"; my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4); croak "content CRC32 mismatch" if ( $crc_given != crc32($conte +nt) ); } else { sysseek $self->{fh}, $payload_len + 4, 1; } # unpack and possibly check uncompressed payload size my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4); croak "content length mismatch" if ( defined $content && $size_given != length($content) ); push @return_values, $size_given; push @return_values, $content if (defined $content); return @return_values; } sub _read { #----------------------------------------------------------------- +-------- # ARG 0 : buffer to write to # ARG 1 : bytes to attempt to read # ARG 3 : (optional) offset in buffer to start write (default: 0) #----------------------------------------------------------------- +-------- # RET 0 : bytes read (0 at EOF, undef on error) #----------------------------------------------------------------- +-------- # we try to emulate the built-in 'read' call, so we will # overwrite $_[1] and return the number of bytes read. To facilita +te this, # make $buf a reference to the buffer passed my $self = shift; my $buf = \shift; # must be a reference !! my $bytes = shift; my $offset = shift; # handle cases when $offset is passed in my $prefix = ''; if (defined $offset && $offset != 0) { $prefix = substr $$buf, 0, $offset; $prefix .= "\0" x ( $offset - length($$buf) ) if ( $offset > length($$buf) ); } $$buf = ''; # reset (only AFTER grabbing any prefix above) ITER: while (length($$buf) < $bytes) { my $l = length($$buf); my $remaining = $bytes - $l; # if read is within current block if ($self->{buffer_offset} + $remaining <= $self->{buffer_len} +) { $$buf .= substr $self->{buffer}, $self->{buffer_offset}, $ +remaining; $self->{buffer_offset} += $remaining; $self->_load_block() if ($self->{buffer_offset} == $self->{buffer_len}); } else { last ITER if ($self->{buffer_len} < 0); #EOF $$buf .= substr $self->{buffer}, $self->{buffer_offset}; $self->_load_block(); } } $$buf = $prefix . $$buf; my $l = length($$buf); $self->{u_offset} += $l; return $l; } sub getline { #----------------------------------------------------------------- +-------- # No arguments #----------------------------------------------------------------- +-------- # RET 0 : string read (undef at EOF) #----------------------------------------------------------------- +-------- my ($self) = @_; my $data; while (1) { # return immediately if EOF return $data if ($self->{buffer_len} < 0); # search current block for record separator # start searching from the current buffer offset pos($self->{buffer}) = $self->{buffer_offset}; if ($self->{buffer} =~ m|$/|g) { my $pos = pos $self->{buffer}; $data .= substr $self->{buffer}, $self->{buffer_offset}, $pos - $self->{buffer_offset}; $self->{buffer_offset} = $pos; # if we're at the end of the block, load next $self->_load_block if ($pos == $self->{buffer_len}); $self->{u_offset} += length($data); return $data; } # otherwise, add rest of block to data and load next block $data .= substr $self->{buffer}, $self->{buffer_offset}; $self->_load_block; } return; } sub _generate_index { #----------------------------------------------------------------- +-------- # No arguments #----------------------------------------------------------------- +-------- # No returns #----------------------------------------------------------------- +-------- my ($self) = @_; my $uncmp_offset = 0; my $cmp_offset = 0; my $i = 0; $self->{u_file_size} = 0; $self->{idx} = []; $self->{ridx} = {}; sysseek $self->{fh}, 0, 0; while ($cmp_offset < $self->{file_size}) { push @{$self->{idx}}, [$cmp_offset, $uncmp_offset]; $self->{ridx}->{$cmp_offset} = $uncmp_offset; my ($block_size, $uncompressed_size) = $self->_unpack_block(0) +; $cmp_offset += $block_size; $uncmp_offset += $uncompressed_size; $self->{u_file_size} += $uncompressed_size; } sysseek $self->{fh}, $self->{block_offset}, 0; return; } sub _seek { #----------------------------------------------------------------- +-------- # ARG 0 : byte offset to which to seek # ARG 1 : relativity of offset (0: file start, 1: current, 2: file + end) #----------------------------------------------------------------- +-------- # no returns #----------------------------------------------------------------- +-------- my ($self, $pos, $whence) = @_; $pos += $self->{u_offset} if ($whence == 1); $pos = $self->{u_file_size} + $pos if ($whence == 2); return if ($pos < 0 || $pos >= $self->{u_file_size}); # Do seeded search for nearest block start >= $pos # (although we don't know the size of each block, we can determine + the # mean length and usually choose a starting value close to the act +ual - # benchmarks much faster than binary search) # TODO: benchmark whether breaking this out and Memoizing speeds t +hings up my $s = scalar( @{$self->{idx}} ); my $idx = int($pos/($self->{u_file_size}) * $s); while (1) { if ($pos < $self->{idx}->[$idx]->[1]) { --$idx; next; } if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) { ++$idx; next; } last; } my $block_o = $self->{idx}->[$idx]->[0]; my $block_o_u = $self->{idx}->[$idx]->[1]; my $buff_o = $pos - $block_o_u; $self->_load_block( $block_o ); $self->{buffer_offset} = $buff_o; $self->{u_offset} = $block_o_u + $buff_o; return 1; } sub _safe_sysread { # sysread wrapper that checks return count and returns read # (internally we should never read off end of file - doing so indi +cates # either a software bug or a corrupt input file so we croak) #----------------------------------------------------------------- +-------- # ARG 0 : bytes to read #----------------------------------------------------------------- +-------- # RET 0 : string read #----------------------------------------------------------------- +-------- my ($fh, $len) = @_; my $buf = ''; my $r = sysread $fh, $buf, $len; croak "returned unexpected byte count" if ($r != $len); return $buf; } 1;

And the Base64-encoded BGZF:

H4sIBAAAAAAA/wYAQkMCAKsBXZJLbtxADET3PgVPoDsYQXb5ADGcPafFGRPo3zRJI8cP2S +1P5CwE SS2qWPXIX9RtZ6w7Euwtt8EXE3hv2bqiUgEaow3AlEywKluBq90QOg3pnBiVZYOvCldMnF +mg8htn oMpl6VG5G23wxeIOhavLC+1AfxJ1tcFHh1PLnjER6vb09HPnBpUksVWFajkj3A2942/KSx +9Q3EtF lwISL2p1g+94qwyDpumI5j/RQIW0bPTmPQ9/Am+cgE20bfBsCixKU0robuwuXuIOcS37qd +X4tFo+ YtSQxuJ+xAp51wnKJR/v7vNKwx8dU0GevacHK+cUgx4Hq+biMDwDZb7Fzxu8OocCXM/QtK +mfhcnJ fpAPZ7VKbXjeGNMP1xdAi/rHvILXN7xMF+9OlW0cqxA17vdIPc9nF890WofirCO280LqtK +gBuolj tE7wVaHj4CPefxsX02kOkkZiz8Ktur1kueMn0jsnxY/RtlLd+z/4Z7QHrrWIwlU38D3STy +sboGSO aylhwI5kL7FoTsi3LhblOI9y3P4CBU6J0CwDAAAfiwgEAAAAAAD/BgBCQwIAGwADAAAAAA +AAAAAA

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://1140465]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others lurking in the Monastery: (3)
As of 2024-04-25 16:29 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found