#!/usr/bin/perl

use Getopt::Long;
use warnings;
use strict;

# Retrieves the normalized values in each bin in a window
# around some peak region. The output is designed to be used
# to plot a heatmap in R.

# init default variables
my $bin_size = 50;
my $scaling_factor = 10000000;
my $number_of_bins = 80;
my $fragment_length = 200;
my $bin_count = 0;
my %hash = ();
my %peaks = ();
my %ids = ();

sub main {
	# get the command line input
	my ($infile, $peaks, $outfile) = @ARGV;
	my $options = GetOptions("bin_size=i"     => \$bin_size,
	                         "scal_factor=i"  => \$scaling_factor,
	                         "bin_num=i"     => \$number_of_bins);
	
	$bin_count = $fragment_length / $bin_size;	
	
	# exceptions
	if (!defined $infile or !defined $peaks or !defined $outfile) {
		print STDERR "Usage: perl binForHeatmap.pl <infile> <peaks> <outfile> \n";
		print STDERR "--bin_size <int> --scal_factor <int> --bin_num <int> \n";
		print STDERR "This script produces output that can be plotted in R.\n";
		print STDERR "The input or output filename is missing!\n" and exit;
	}
	
	# bin the reads and output
	print STDERR "Generating genomic bins of size $bin_size...\n";
	binReads($infile, \%hash);
	print STDERR "Outputing $number_of_bins bins around the summit of the sorted peaks...\n";
	getBinsForPeaks($peaks, \%hash, $outfile);
}

sub binReads {
	my ($file, $hashref) = @_;
	
	# open the file and parse
	# expecting a bowtie format input file!
	open(IN, "$file") || die "Cannot open $file: $!\n";
	
	my $total = 0;
	
	while ( <IN> ) {
		chomp;
		my ($id, $strand, $chr, $start, undef) = split("\t", $_);
		$total++;
		if ($strand eq '+') {
			$start -= 75;
			my $bin = $start - ($start % $bin_size);
			#$hashref->{$chr}->{$bin} += 1;
			for (my $i = 0; $i < $bin_count; $i++) {
				$hashref->{$chr}->{$bin+($bin_size*$i)} += 1; # we assume a fragment size of 200 bp,
			}
		}
		else {
			$start += 75;
			my $bin = $start - ($start % $bin_size);
			#$hashref->{$chr}->{$bin} += 1;
			for (my $i = 0; $i < $bin_count; $i++) {
				$hashref->{$chr}->{$bin-($bin_size*$i)} += 1;
			}
		}
		
		$ids{$id} = 1;
		
		# get nearest bin and add 1
		#my $bin = $start - ($start % $bin_size);
		#$hashref->{$chr}->{$bin} += 1;
	}
    
    # count the number of id keys
    #$total = keys %ids;
    #print "$total\n" and exit;
	
	close IN;
	
	# normalize the bins
	my $multiplier = $scaling_factor / $total;
	foreach my $chr (keys %$hashref) {
		foreach my $bin (keys %{$hashref->{$chr}}) {
			$hashref->{$chr}->{$bin} *= $multiplier;
		}
	}
}

sub getBinsForPeaks {
	my ($file, $hashref, $outfile) = @_;
	
	open(IN, "$file") || die "Cannot open $file: $!\n";
	open(OUT, ">$outfile") || die "Cannot open $outfile: $!\n";
	
	my $counter = 1;
	
	while ( <IN> ) {
		chomp;
		# looking for the .xls output of arem/macs!
		next if $_ =~ m/#/;
		next if $_ =~ m/start/;
		my ($chr, $start, $end, $length, $summit, $score) = split("\t", $_);
		next if !defined $end;  # capture that annoying blank line! 
		
		# normalize the score by the length
		$score /= $length;
		$peaks{$counter}{"summit"} = ($start + $summit);
		$peaks{$counter}{"score"}  = $score;
		$peaks{$counter}{"chrom"}  = $chr;
		$counter++;
	}
	
	close IN;
	
	# sort on the score and output bins around the summit
	foreach my $key (sort {$peaks{$a}{"score"} <=> $peaks{$b}{"score"}} keys %peaks) {
		# this sorts backwards, small to large; R plots things backwards, so it plots correctly
		my $summit = $peaks{$key}{"summit"};
		my $chr    = $peaks{$key}{"chrom"};
		my $score  = $peaks{$key}{"score"};
		#print STDERR "$chr\t$summit\t$score\n" and sleep(2);
		
		my $bin = $summit - ($summit % $bin_size);
		my $left_bin = $bin - (($number_of_bins / 2) * $bin_size);
		
		#my %hash = %$hashref; # prevents annoying output to stdout when no bin exists
		
		for (my $i = 0; $i < $number_of_bins; $i++) {
			my $count = (exists $hashref->{$chr}->{($left_bin + ($i * $bin_size))}) ? $hashref->{$chr}->{($left_bin + ($i * $bin_size))} : 0;
			#print STDERR "$count\n" and sleep(2);
			if ($i == ($number_of_bins - 1)) {
				print OUT "$count\n";
			}
			else {
				print OUT "$count\t";
			}
		}
	}
	
	close OUT;
}

# call main
main();