#!/usr/bin/perl

use constant ONE_BYTE            => 1;
use constant FOUR_BYTE           => 4;
use constant BITS_PER_BYTE       => 8;
use constant BASES_PER_FOUR_BYTE => 16;

use strict;
use warnings;

my $file = $ARGV[0] or die "Usage: $0 <file.2bit>\n";

open(my $fh, '<', $file) or die "Unable to open '$file' for reading: $!";

my $header = parse_header($fh);
my $count  = $header->{CNT};

my %toc;
populate_toc($fh, $count, \%toc);

for my $name (keys %toc) {
    my $offset = $toc{$name};
    my $dna    = fetch_record($fh, $offset);
    #print length($dna), "\n";
    print "$dna\n";
}

sub parse_header {
    my ($fh) = @_;

    # Read the header
    my $raw = '';
    sysread($fh, $raw, FOUR_BYTE * 4);

    # Parse the header
    my ($sig, $ver, $cnt, $reserved) = unpack('l4', $raw);

    # TODO: validate (signature, reverse byte order, version)
    return {SIG => $sig, VER => $ver, CNT => $cnt, RSV => $reserved};
}

sub populate_toc {
    my ($fh, $count, $toc) = @_;

    my ($raw, $size, $name) = ('', '', '');
    for (1 .. $count) {
        # Read the size of the record name
        sysread($fh, $raw, ONE_BYTE);
        $size = unpack('C', $raw);

        # Read the name of the record
        sysread($fh, $name, $size);

        # Read and store the offset
        sysread($fh, $raw, FOUR_BYTE);
        $toc->{$name} = unpack('l', $raw);
    }
}

sub fetch_record {
    my ($fh, $offset) = @_;

    my ($raw, $dna, $size, $cnt) = ('', '', '', '');
    my (@start, @len, %nblock, %mblock);

    # Seek to the record location
    sysseek($fh, $offset, 0);

    # Establish the conversion table
    my %conv = ('00' => 'T', '01' => 'C', '10' => 'A', '11' => 'G');

    # Fetch the DNA size
    sysread($fh, $raw, FOUR_BYTE);
    $size = unpack('l', $raw);

    # Handle the N blocks and M (mask) blocks
    for my $block (\%nblock, \%mblock) {
        # Fetch the block count
        sysread($fh, $raw, FOUR_BYTE);
        $cnt = unpack('l', $raw);

        if ($cnt) {
            sysread($fh, $raw, FOUR_BYTE * $cnt);
            @start = unpack("l$cnt", $raw);

            sysread($fh, $raw, FOUR_BYTE * $cnt);
            @len = unpack("l$cnt", $raw);

            @{$block}{@start} = @len;
        }
    }

    # Throw away the reserved field
    sysread($fh, $raw, FOUR_BYTE);

    # Fetch the DNA - TODO: read in configurable size chunks
    my $bytes = ((int($size / BASES_PER_FOUR_BYTE)) + 1) * FOUR_BYTE;
    sysread($fh, $raw, $bytes);
    $dna = join '', map $conv{$_},
        unpack('(A2)*', unpack("B" . $bytes * BITS_PER_BYTE, $raw));

    # Fix N blocks
    substr($dna, $_, $nblock{$_}, 'N' x $nblock{$_}) for keys %nblock;

    # Fix M blocks
    substr($dna, $_, $mblock{$_}, lc(substr($dna, $_, $mblock{$_}))) for keys %mblock;

    return substr($dna, 0, $size);
}

__END__

See http://genome.ucsc.edu/FAQ/FAQformat#format7

See also human genome as .2bit
ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/bigZips/hg18.2bit

TODO

1. Documentation, error handling, and tests
2. Profiling and benchmarking
3. Handle big/little endian
4. Dynamically determine if record count justifies SQLite versus in-memory hash
5. Allow the reading of DNA sequence in configurable size chunks
6. Add an API to treat the sequence like an iterator
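
A minimal sketch for the validation noted in parse_header and in TODO 3. The
2bit signature is 0x1A412743; a file written with the opposite byte order only
matches when read with the other unpack template, so detecting which template
reproduces the signature also settles endianness. The constant SIGNATURE and
the helper detect_byte_order below are illustrative names, not part of the
script above:

    use constant SIGNATURE => 0x1A412743;

    # Return the unpack template for 32-bit fields: 'V' (little-endian) or
    # 'N' (big-endian), whichever makes the signature match; die otherwise.
    sub detect_byte_order {
        my ($raw) = @_;    # the first four bytes of the file
        return 'V' if unpack('V', $raw) == SIGNATURE;
        return 'N' if unpack('N', $raw) == SIGNATURE;
        die "Not a 2bit file: bad signature\n";
    }

parse_header could then unpack the remaining header fields (and all later
offsets and counts) with the detected template, e.g. unpack("${order}3", ...),
instead of the fixed native-endian 'l' used above.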