# ACGT String compression and decompression # # We compress to fixed length string, 2 bits per symbol. # # In order to recover the length of the string the ls bits of the last byte contain the # length of the string mod 4 -- this adds a complete zero byte if necessary. my %compress ; # string of up to 4 characters to compressed byte my @expand ; # compressed byte to string (except last byte) my @expand_l ; # compressed last byte to string BEGIN { my @ACGT = qw(A C G T) ; my $s = '' ; my $v = 0 ; foreach my $a (@ACGT) { $s = $a ; $compress{$s."\n"} = $compress{$s} = chr($v+1) ; foreach my $b (@ACGT) { $s = $a.$b ; $compress{$s."\n"} = $compress{$s} = chr($v+2) ; foreach my $c (@ACGT) { $s = $a.$b.$c ; $compress{$s."\n"} = $compress{$s} = chr($v+3) ; foreach my $d (@ACGT) { $s = $a.$b.$c.$d ; $compress{$s} = chr($v) ; $expand[$v++] = $s ; } ; } ; } ; } ; @expand_l = map substr($expand[$_], 0, ($_ & 3)), (0..255) ; $compress{"\n"} = "\0" ; } ; # Compress ACGT string (all upper case) with or without trailing "\n" # # $compressed = acgt_compress(ACGT_STRING) sub acgt_compress { my $c = '' ; $c .= $compress{$_} for unpack('(a4)*', $_[0]) ; return (length($_[0]) & 3) || (substr($_[0], -1) eq "\n") ? $c : $c . "\0" ; } ; # Decompress compressed ACGT # # $acgt_string = acgt_expand(COMPRESSED) sub acgt_expand { my $s = '' ; $s .= $expand[$_] for unpack('C*', substr($_[0], 0, -1)) ; return $s . $expand_l[ord(substr($_[0], -1))] ; } ;