in reply to Creating very long strings from text files (DNA sequences)
Not sure whether this is useful, but it loads this 165MB FASTA file in 1.156 seconds, using 165MB of RAM, under Perl 5.8.3. The iteration performance isn't too bad either.
package FASTA::Faster;

# Read-only tied hash over a FASTA file.
#
# The whole file is slurped into one string; each record id (the first
# whitespace-delimited word after '>') maps to the offset and length of its
# sequence text inside that string.  Sequence text is only copied, and its
# line breaks removed, when a value is fetched.
#
#   tie my %sequence, 'FASTA::Faster', $filename;
#   my $dna = $sequence{ $id };

use strict;
use warnings;
use Carp;

# Per-object state, keyed by the stringified tie object.
my %raw;    # object => file contents plus a trailing "\n>" sentinel
my %seq;    # object => { record id => [ offset, length ] into $raw{object} }

our $DEBUG = 0;

# TIEHASH( $class, $file )
# Slurps $file and indexes its records.  Extra arguments are accepted and
# ignored.  Croaks if the file cannot be opened or read.
sub TIEHASH {
    $DEBUG and carp "TIEHASH: @_";
    my( $class, $file ) = @_;

    # The object is a blessed reference to the file name.
    my $self = bless \$file, $class;

    open my $in, '<:raw', $file or croak "$file : $!";

    # sysread may return fewer bytes than requested, so keep reading until
    # it reports end-of-file (0).  Once the expected size has been read, a
    # small request is enough to detect EOF.
    my $size = ( -s $in ) || 0;
    $raw{ $self } = '';
    while( 1 ) {
        my $have = length $raw{ $self };
        my $want = $size > $have ? $size - $have : 65_536;
        my $got  = sysread( $in, $raw{ $self }, $want, $have );
        croak "$file : $!" unless defined $got;
        last if $got == 0;
    }
    close $in;

    ## Update: Make sure we capture the last record.
    $raw{ $self } .= "\n>";

    # Store [ offset, length ] pairs rather than copies of the text.
    # The header is anchored at line start and consumes only its own line,
    # so an id-only header (">id\n") keeps its first sequence line.  The body
    # stops at the next line that starts with '>', so a record with no
    # sequence lines does not swallow the record after it.
    my %index;
    while( $raw{ $self } =~ m{
            ^ > (\S+)       # record id
            [^\n]* \n       # rest of the header line
            (.*?)           # sequence lines, possibly none
            (?= ^ > )       # up to the next header or the sentinel
        }xmsg
    ) {
        $index{ $1 } = [ $-[ 2 ], $+[ 2 ] - $-[ 2 ] ];
    }
    $seq{ $self } = \%index;

    return $self;
}

# FETCH( $self, $key )
# Returns the sequence for $key with line breaks removed, or undef when the
# id is not present.
sub FETCH {
    $DEBUG and carp "FETCH: @_";
    my( $self, $key ) = @_;

    my $span = $seq{ $self }{ $key } or return;

    my $value = substr $raw{ $self }, $span->[ 0 ], $span->[ 1 ];
    $value =~ tr[\r\n][]d;    # the file is read raw, so drop CRs as well
    return $value;
}

# EXISTS( $self, $key ) - true when a record with this id was indexed.
sub EXISTS {
    $DEBUG and carp "EXISTS: @_";
    my( $self, $key ) = @_;
    return exists $seq{ $self }{ $key };
}

# FIRSTKEY( $self ) - restarts iteration and returns the first id.
sub FIRSTKEY {
    $DEBUG and carp "FIRSTKEY: @_";
    my( $self ) = @_;
    my $index = $seq{ $self };
    keys %$index;    # void context: resets the hash iterator
    return scalar each %$index;
}

# NEXTKEY( $self, $lastkey ) - returns the next id, or undef when done.
sub NEXTKEY {
    $DEBUG and carp "NEXTKEY: @_";
    my( $self ) = @_;
    return scalar each %{ $seq{ $self } };
}

# SCALAR( $self ) - number of records, so the tied hash can be tested for
# emptiness in boolean/scalar context.
sub SCALAR {
    $DEBUG and carp "SCALAR: @_";
    my( $self ) = @_;
    return scalar keys %{ $seq{ $self } };
}

# The hash is read-only: every mutating operation croaks.
sub STORE {
    $DEBUG and carp "STORE: @_";
    croak 'Not implemented';
}

sub DELETE {
    $DEBUG and carp "DELETE: @_";
    croak 'Not implemented';
}

sub CLEAR {
    $DEBUG and carp "CLEAR: @_";
    croak 'Not implemented';
}

# Release the slurped file and its index when the tie object goes away;
# otherwise both would stay in %raw / %seq for the life of the process.
sub DESTROY {
    my( $self ) = @_;
    delete $raw{ $self };
    delete $seq{ $self };
    return;
}

# Loaded as a module: stop here.  Run as a script: fall through to the
# benchmark below.
return 1 if caller;

package main;

# Loaded at run time so that using the package above as a module does not
# require Benchmark::Timer to be installed.
require Benchmark::Timer;

my $T = Benchmark::Timer->new;

local $\ = $/;

my %sequence;

$T->start( 'load' );
tie %sequence, 'FASTA::Faster', 'na_clones.dros.RELEASE2.5';
$T->stop( 'load' );

# Walk every key / value once; only the element count is kept.
$T->start( 'keys' );
my $key_count = () = keys %sequence;
$T->stop( 'keys' );
print $key_count;

$T->start( 'values' );
my $value_count = () = values %sequence;
$T->stop( 'values' );
print $value_count;

$T->report;

# Pause so the process's memory use can be inspected before the dump.
printf '%s', 'Check memory';
<STDIN>;

while( my( $key, $value ) = each %sequence ) {
    print "$key =>\n$value\n";
}

__END__
P:\test\FASTA>perl faster.pm
940
940
1 trial of load (1.165s total)
1 trial of keys (12.100ms total)
1 trial of values (12.311ms total)
|
|---|