#!/usr/bin/perl use Getopt::Long; use warnings; use strict; # Retrieves the normalized values in each bin in a window # around some peak region. The output is designed to be used # to plot a heatmap in R. # init default variables my $bin_size = 50; my $scaling_factor = 10000000; my $number_of_bins = 80; my $fragment_length = 200; my $bin_count = 0; my %hash = (); my %peaks = (); my %ids = (); sub main { # get the command line input my ($infile, $peaks, $outfile) = @ARGV; my $options = GetOptions("bin_size=i" => \$bin_size, "scal_factor=i" => \$scaling_factor, "bin_num=i" => \$number_of_bins); $bin_count = $fragment_length / $bin_size; # exceptions if (!defined $infile or !defined $peaks or !defined $outfile) { print STDERR "Usage: perl binForHeatmap.pl \n"; print STDERR "--bin_size --scal_factor --bin_num \n"; print STDERR "This script produces output that can be plotted in R.\n"; print STDERR "The input or output filename is missing!\n" and exit; } # bin the reads and output print STDERR "Generating genomic bins of size $bin_size...\n"; binReads($infile, \%hash); print STDERR "Outputing $number_of_bins bins around the summit of the sorted peaks...\n"; getBinsForPeaks($peaks, \%hash, $outfile); } sub binReads { my ($file, $hashref) = @_; # open the file and parse # expecting a bowtie format input file! open(IN, "$file") || die "Cannot open $file: $!\n"; my $total = 0; while ( ) { chomp; my ($id, $strand, $chr, $start, undef) = split("\t", $_); $total++; if ($strand eq '+') { $start -= 75; my $bin = $start - ($start % $bin_size); #$hashref->{$chr}->{$bin} += 1; for (my $i = 0; $i < $bin_count; $i++) { $hashref->{$chr}->{$bin+($bin_size*$i)} += 1; # we assume a fragment size of 200 bp, } } else { $start += 75; my $bin = $start - ($start % $bin_size); #$hashref->{$chr}->{$bin} += 1; for (my $i = 0; $i < $bin_count; $i++) { $hashref->{$chr}->{$bin-($bin_size*$i)} += 1; } } $ids{$id} = 1; # get nearest bin and add 1 #my $bin = $start - ($start % $bin_size); #$hashref->{$chr}->{$bin} += 1; } # count the number of id keys #$total = keys %ids; #print "$total\n" and exit; close IN; # normalize the bins my $multiplier = $scaling_factor / $total; foreach my $chr (keys %$hashref) { foreach my $bin (keys %{$hashref->{$chr}}) { $hashref->{$chr}->{$bin} *= $multiplier; } } } sub getBinsForPeaks { my ($file, $hashref, $outfile) = @_; open(IN, "$file") || die "Cannot open $file: $!\n"; open(OUT, ">$outfile") || die "Cannot open $outfile: $!\n"; my $counter = 1; while ( ) { chomp; # looking for the .xls output of arem/macs! next if $_ =~ m/#/; next if $_ =~ m/start/; my ($chr, $start, $end, $length, $summit, $score) = split("\t", $_); next if !defined $end; # capture that annoying blank line! # normalize the score by the length $score /= $length; $peaks{$counter}{"summit"} = ($start + $summit); $peaks{$counter}{"score"} = $score; $peaks{$counter}{"chrom"} = $chr; $counter++; } close IN; # sort on the score and output bins around the summit foreach my $key (sort {$peaks{$a}{"score"} <=> $peaks{$b}{"score"}} keys %peaks) { # this sorts backwards, small to large; R plots things backwards, so it plots correctly my $summit = $peaks{$key}{"summit"}; my $chr = $peaks{$key}{"chrom"}; my $score = $peaks{$key}{"score"}; #print STDERR "$chr\t$summit\t$score\n" and sleep(2); my $bin = $summit - ($summit % $bin_size); my $left_bin = $bin - (($number_of_bins / 2) * $bin_size); #my %hash = %$hashref; # prevents annoying output to stdout when no bin exists for (my $i = 0; $i < $number_of_bins; $i++) { my $count = (exists $hashref->{$chr}->{($left_bin + ($i * $bin_size))}) ? $hashref->{$chr}->{($left_bin + ($i * $bin_size))} : 0; #print STDERR "$count\n" and sleep(2); if ($i == ($number_of_bins - 1)) { print OUT "$count\n"; } else { print OUT "$count\t"; } } } close OUT; } # call main main();