Description: |
Recently, in the Perl beginners list, someone had a bit of a quandary. They were reading a 600 MB file and needed to find a search term, grab from the file 200 bytes of data both before and after this term and then search for another term within that 'chunk' of data.
I thought this was such a fun problem that I went ahead and wrote the program for this person (yeah, I know, I gave him a fish). This is deliberately overcommented in case the person did not know a lot of Perl. The basic idea is to search the file and return 400 byte 'chunks' in an array. |
use strict;
use warnings;
use Data::Dumper;
# this is how far forward or back you need to read
my $width = 200;
# this is your target string. You can make it a regex if you prefer
my $target = 'search';
# file to search
my $file = 'test.txt';
# when you're done, this should contain the data you're looking for
my @chunks;

# Three-arg open with a lexical handle keeps the filename from being
# parsed as a mode.  The :raw layer guarantees that length(), tell(),
# seek() and read() all count the same thing (bytes), which the offset
# arithmetic below depends on.
open my $fh, '<:raw', $file
    or die "Cannot open $file for reading: $!";

# Take the size from the handle we actually opened, so it is only
# computed once we know the file exists and is readable.
my $fsize = -s $fh;

while ( my $line = <$fh> )
{
    # tell() now points just past the line we read, so the line began
    # length($line) bytes earlier.
    my $file_position = tell $fh;
    my $line_start    = $file_position - length $line;

    # Loop (not "if") so that every occurrence on the line is captured,
    # not only the first one.
    while ( $line =~ /$target/g )
    {
        # $-[0] is the offset of the start of the match within $line.
        # Unlike subtracting length($target), this stays correct when
        # $target is a regex rather than a literal string.
        my $word_position = $line_start + $-[0];

        # get_chunk() seeks around in the file and then restores the
        # handle to $file_position, so line-by-line reading continues
        # undisturbed.
        push @chunks,
            get_chunk( $fh, $word_position, $file_position, $width, $fsize );
    }
}
print Dumper \@chunks;
close $fh;
# get_chunk( $fh, $word_position, $file_position, $width, $fsize )
#
# Reads and returns the bytes of $fh lying between $width bytes before
# $word_position and $width bytes after it, clamped to the range
# 0 .. $fsize.  Before returning, the handle is seeked back to
# $file_position so the caller can carry on reading where it left off.
#
#   $fh            - seekable filehandle to read from
#   $word_position - byte offset the chunk is centred on
#   $file_position - byte offset to restore the handle to afterwards
#   $width         - number of bytes wanted on each side of $word_position
#   $fsize         - total size of the file in bytes (upper clamp)
#
# Returns the chunk as a string (possibly empty).  Dies if a seek or
# the read reports an error.
sub get_chunk
{
    my ( $fh, $word_position, $file_position, $width, $fsize ) = @_;

    # don't try to read before beginning of file
    my $start = $word_position >= $width
        ? $word_position - $width
        : 0;

    # don't try to read after end of file
    my $end = $word_position + $width <= $fsize
        ? $word_position + $width
        : $fsize;

    # position to start of where we want to read
    seek $fh, $start, 0
        or die "Problem seeking in file: $!";

    my $chunk = '';

    # read() returns undef on error and 0 when there is simply nothing
    # to read (e.g. an empty range), so only undef is treated as failure.
    my $bytes_read = read $fh, $chunk, $end - $start;
    defined $bytes_read
        or die "Problem reading file: $!";

    # put us back to where we were
    seek $fh, $file_position, 0
        or die "Problem seeking in file: $!";

    return $chunk;
}
|