#!/usr/bin/perl -w use strict; use List::Util qw(sum); use Data::Dumper; use constant { name =>0, start =>1, end =>2, pos1 =>3, pos2 =>4, CGtype =>5, Cov =>6, Methy =>7, Strand =>8}; my @rec=(); my $line; my $no_read=0; my $header_line = ; #skip the headers print q{"CpG" "length" "CG" "CG>0" "CG>5" "CG>10" "CG>0mean" }, q{"CG>0median" "CG>5mean" "CG>5median" "CG>10mean" }, q{"CG>10median"},"\n"; while ($no_read or defined($line=)) { next if $line =~ /^\s*$/; my @cols = split(' ',$line); if (!@rec) #first line of record { $no_read=0; push @rec,[@cols]; } elsif ($rec[0][name] ne $cols[name] or $rec[0][start] != $cols[start] or $rec[0][end] != $cols[end]) { $no_read=1; # keep current line for next record process_record(); # record is finished @rec=(); } else #record continuation { push @rec,[@cols]; } } process_record(); #for the last one sub process_record { my $num_CG = grep{$_->[CGtype] eq 'CG'}@rec; # collect Methy values for CGtype with Cov >0,5,10 # The number of those values is a proxy for count of CGtypes # statisfying those conditions. # map{} to Methy values is used instead of scalar grep{} to # count the number of lines statisfying the condition(s) because # the @Meth_CGgtX arrays can be used easily in the mean/median # calculations. # this code does iterate over @rec 4 times, but it does have a # regularity and simplicity to it that is worth something. # I figure that if @rec is not "that big", this won't matter # much. If it does, then this is an easily identifiable sub # that can be "improved". my @Meth_CGgt0 = map{ ( $_->[CGtype] eq 'CG' and $_->[Cov]>0 ) ? $_->[Methy] :()}@rec; my @Meth_CGgt5 = map{ ( $_->[CGtype] eq 'CG' and $_->[Cov]>5 ) ? $_->[Methy] :()}@rec; my @Meth_CGgt10 = map{ ( $_->[CGtype] eq 'CG' and $_->[Cov]>10 ) ? $_->[Methy] :()}@rec; print "$rec[0][name]:$rec[0][start]-$rec[0][end] "; print ''.@rec." "; print "$num_CG "; # these are the counts of CGgt0, CGgt5, CGgt10 print ''.@Meth_CGgt0." ".@Meth_CGgt5." ".@Meth_CGgt10." "; # ternary operator is used to avoid divide by zero errors! print @Meth_CGgt0 ? (sum(@Meth_CGgt0)/@Meth_CGgt0) : 0, " "; print median(@Meth_CGgt0)," "; print @Meth_CGgt5 ? (sum(@Meth_CGgt5)/@Meth_CGgt5) : 0, " "; print median(@Meth_CGgt5)," "; print @Meth_CGgt10 ? (sum(@Meth_CGgt10)/@Meth_CGgt10) : 0, " "; print median(@Meth_CGgt10),"\n"; } sub median { my @array = @_; return 0 if @array == 0; # for NULL arrays #return $array[0] if @array == 1; # update: not necessary @array = sort {$a<=>$b}@array; if (@array % 2) { return $array[int(@array/2)]; } else { return ($array[@array/2] + $array[@array/2 - 1]) / 2; } } =prints "CpG" "length" "CG" "CG>0" "CG>5" "CG>10" "CG>0mean" "CG>0median" "CG>5mean" "CG>5median" "CG>10mean" "CG>10median" chr1:18598-19673 5 2 2 0 0 0 0 0 0 0 0 chr1:124987-125426 3 2 0 0 0 0 0 0 0 0 0 chr1:317653-318092 3 0 0 0 0 0 0 0 0 0 0 chr1:427014-428027 1 0 0 0 0 0 0 0 0 0 0 chr1:439136-440407 1 0 0 0 0 0 0 0 0 0 0 chr1:523082-523977 1 1 0 0 0 0 0 0 0 0 0 chr1:534601-536512 1 0 0 0 0 0 0 0 0 0 0 chr1:703847-704410 1 0 0 0 0 0 0 0 0 0 0 chr1:752279-753308 1 0 0 0 0 0 0 0 0 0 0 chr2:5000-100000 4 4 4 2 0 7.25 8 9 9 0 0 =cut __DATA__ chr start end pos1 pos2 CGtype Cov. Methy Strand chr1 18598 19673 18676 18676 CHH 0 0 + chr1 18598 19673 18689 18689 CG 2 0 + chr1 18598 19673 18997 18997 CHH 0 0 + chr1 18598 19673 19546 19546 CG 4 0 + chr1 18598 19673 19671 19671 CHG 7 0 + chr1 124987 125426 125001 125001 CHH 1 0 + chr1 124987 125426 125226 125226 CG 0 0 + chr1 124987 125426 125426 125426 CG 0 0 + chr1 317653 318092 317653 317653 CHG 11 0 + chr1 317653 318092 317795 317795 CHG 0 0 + chr1 317653 318092 318090 318090 CHH 3 0 + chr1 427014 428027 427025 427025 CHH 0 0 + chr1 439136 440407 439687 439687 CHH 9 0 + chr1 523082 523977 523167 523167 CG 0 0 + chr1 534601 536512 535789 535789 CHH 1 0 + chr1 703847 704410 703999 703999 CHH 0 0 + chr1 752279 753308 753330 753330 CHH 0 0 + chr2 5000 100000 100000 100000 CG 1 9 + chr2 5000 100000 100000 100000 CG 3 2 + chr2 5000 100000 100000 100000 CG 6 7 + chr2 5000 100000 100000 100000 CG 9 11 +