#!/usr/bin/perl

use constant ONE_BYTE            => 1;
use constant FOUR_BYTE           => 4;
use constant BITS_PER_BYTE       => 8;
use constant BASES_PER_FOUR_BYTE => 16;

use strict;
use warnings;

my $file = $ARGV[0] or die "Usage: $0 <file.2bit>\n";

open(my $fh, '<', $file) or die "Unable to open '$file' for reading: $!";

my $header = parse_header($fh);
my $count  = $header->{CNT};

my %toc;
populate_toc($fh, $count, \%toc);

for my $name (keys %toc) {
    my $offset = $toc{$name};
    my $dna    = fetch_record($fh, $offset);
    #print length($dna), "\n";
    print "$dna\n";
}

sub parse_header {
    my ($fh) = @_;

    # Read the header
    my $raw = '';
    sysread($fh, $raw, FOUR_BYTE * 4);

    # Parse the header
    my ($sig, $ver, $cnt, $reserved) = unpack('l4', $raw);

    # TODO: validate (signature, reverse byte order, version)
    return {SIG => $sig, VER => $ver, CNT => $cnt, RSV => $reserved};
}

sub populate_toc {
    my ($fh, $count, $toc) = @_;

    my ($raw, $size, $name) = ('', '', '');
    for (1 .. $count) {
        # Read the size of the record name
        sysread($fh, $raw, ONE_BYTE);
        $size = unpack('C', $raw);

        # Read the name of the record
        sysread($fh, $name, $size);

        # Read and store the offset
        sysread($fh, $raw, FOUR_BYTE);
        $toc->{$name} = unpack('l', $raw);
    }
}

sub fetch_record {
    my ($fh, $offset) = @_;

    my ($raw, $dna, $size, $cnt) = ('', '', '', '');
    my (@start, @len, %nblock, %mblock);

    # Seek to the record location
    sysseek($fh, $offset, 0);

    # Establish the conversion table
    my %conv = ('00' => 'T', '01' => 'C', '10' => 'A', '11' => 'G');

    # Fetch the DNA size
    sysread($fh, $raw, FOUR_BYTE);
    $size = unpack('l', $raw);

    # Handle the N blocks and M (mask) blocks
    for my $block (\%nblock, \%mblock) {
        # Fetch the block count
        sysread($fh, $raw, FOUR_BYTE);
        $cnt = unpack('l', $raw);

        if ($cnt) {
            sysread($fh, $raw, FOUR_BYTE * $cnt);
            @start = unpack("l$cnt", $raw);

            sysread($fh, $raw, FOUR_BYTE * $cnt);
            @len = unpack("l$cnt", $raw);

            @{$block}{@start} = @len;
        }
    }

    # Throw away the reserved field
    sysread($fh, $raw, FOUR_BYTE);

    # Fetch the DNA - TODO: read in configurable size chunks
    my $bytes = ((int($size / BASES_PER_FOUR_BYTE)) + 1) * FOUR_BYTE;
    sysread($fh, $raw, $bytes);
    $dna = join '', map $conv{$_},
        unpack('(A2)*', unpack("B" . $bytes * BITS_PER_BYTE, $raw));

    # Fix N blocks
    substr($dna, $_, $nblock{$_}, 'N' x $nblock{$_}) for keys %nblock;

    # Fix M blocks
    substr($dna, $_, $mblock{$_}, lc(substr($dna, $_, $mblock{$_}))) for keys %mblock;

    return substr($dna, 0, $size);
}

__END__

See http://genome.ucsc.edu/FAQ/FAQformat#format7

See also human genome as .2bit
ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/bigZips/hg18.2bit

TODO

1. Documentation, error handling, and tests
2. Profiling and benchmarking
3. Handle big/little endian
4. Dynamically determine if record count justifies SQLite versus in-memory hash
5. Allow the reading of DNA sequence in configurable size chunks
6. Add an API to treat the sequence like an iterator
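
A minimal sketch for the validation noted in parse_header and in TODO 3. The
2bit signature is 0x1A412743; a file written with the opposite byte order only
matches when read with the other unpack template, so detecting which template
reproduces the signature also settles endianness. The constant SIGNATURE and
the helper detect_byte_order below are illustrative names, not part of the
script above:

    use constant SIGNATURE => 0x1A412743;

    # Return the unpack template for 32-bit fields: 'V' (little-endian) or
    # 'N' (big-endian), whichever makes the signature match; die otherwise.
    sub detect_byte_order {
        my ($raw) = @_;    # the first four bytes of the file
        return 'V' if unpack('V', $raw) == SIGNATURE;
        return 'N' if unpack('N', $raw) == SIGNATURE;
        die "Not a 2bit file: bad signature\n";
    }

parse_header could then unpack the remaining header fields (and all later
offsets and counts) with the detected template, e.g. unpack("${order}3", ...),
instead of the fixed native-endian 'l' used above.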