Thanks a lot "graff" your inputs were quite useful.
I could be able to write the code. (suggestions for the improvements are welcome)
This code take the inputs: 1. raw pileup file and the 2. consensus fasta sequence(here it is named as "input.fasta"). and modifies the consensus fasta sequence based on the nucleotide frequences (if above 5%) at each position (from the read column (column 9).
It also includes deletions if they above 5% in indel line (i.e. lines with * in the column 3)
>input.fasta
GATCCccagagcgttGAwgcttAgacTCCgagc
Find the complete code below:
usage: perl modify.pileup.pl pileup2markedfasta input.pileup
#!/usr/bin/perl
use strict;
use warnings;
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
my %func = (pileup2markedfasta=>\&pileup2fq);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
exit(0);
# pileup2markedfasta
sub pileup2fq {
die(qq/
Usage: modify.pileup.pl pileup2marked.fasta [options] <in.cns-pileup
+>
/) if (@ARGV == 0 && -t STDIN);
my %hash = (
1 => 'A',
2 => 'T',
3 => 'G',
4 => 'C',
12 => 'W', 21 => 'W',
13 => 'R', 31 => 'R',
14 => 'M', 41 => 'M',
23 => 'K', 32 => 'K',
24 => 'Y', 42 => 'Y',
34 => 'S', 43 => 'S'
);
my $seq;
my $ConsFileName = "input.fasta" ; # Input single chromosome consensus
+ fasta file
open(INFILE, "$ConsFileName");
my $header ;
while(my $line = <INFILE>) {
if ($line =~ m/^\>/){
$header = $line; # Fasta header is removed; otherwise counted
}
else{
chomp $line; # Remove the "\n"
$seq .= $line;
}
}
close(INFILE);
print "$header";
print "$seq\n";
my @gps;
my $dsize ;
my $rseq = '';
my $report = '';
my @t;
while (<>) {
@t = split;
if ($t[7] < 7){substr($seq, ($t[1]-1), 1, "N");} # minimum number (
+7) of reads
elsif ($t[2] eq '*') { # checking if it is indel in the raw pile
+up
if (($t[3] =~ /\*\/\+/)||($t[3] =~ /^\+/)){
#print "next $t[3] \n";
next; }
elsif ( ($t[3] =~ /\*\/\*/) && ($t[9] =~ /^\-/) && ($t[11]/$t[7] >
+=0.05) ){
my $lenn =(length($t[9])-1);
for (1..$lenn){ push (@gps, '-') }
my $ndash .= '-' x ($lenn);
# print "$t[1]: $lenn, @gps, dash** $ndash\n";
substr($seq, $t[1], $lenn, "$ndash");
}
elsif ( ($t[3] =~ /\*\/\-/) && ($t[11]/$t[7] >=0.05) ){
my $lenn =(length($t[9])-1);
for (1..$lenn){ push (@gps, '-') }
my $ndash .= '-' x ($lenn);
# print "$t[1] dash*- $ndash\n";
substr($seq, $t[1], $lenn, "$ndash");
}
elsif (($t[3] =~ /^\-/) && ($t[10]/$t[7] >=0.05)){
my $lenn =(length($t[8])-1);
for (1..$lenn){ push (@gps, '-') }
my $ndash .= '-' x ($lenn);
# print "$t[1] 'dash-' $ndash\n";
substr($seq, $t[1], $lenn, "$ndash");
}
}
else {
$dsize = scalar(@gps);
if ($dsize >=1){shift @gps;
#print "$dsize: $t[1]skippped\n";
next; }
else {
my $countref = ($t[8] =~ tr/.,//);
if ( $countref / $t[7] > 0.95 ){next;}
elsif ($countref/$t[7] <=0.95){
$t[8] =~ s/[+-][0-9]+[atgcrykmswbdhvnATGCRYKMSWBDHVN]+//g;
+ # removes unwanted indels from read base lines
$t[8] =~ s/[\@\'\&\!\^\$\*\)]+//g; # removes garbage symbo
+ls
$t[8] =~ s/[.,]/$t[2]/g;
my %ratio;
my @letters = qw/a t g c/;
$report = " $t[1] : ";
for my $p ( @letters ) {
#`echo "$t[1]"`;
my $count = eval "lc( '$t[8]' ) =~ tr/$p//";
$ratio{$p} = $count / $t[7];
# print "$p: $ratio{$p}\n";
$report .= sprintf( "%s %5.3f ", uc( $p ), $ratio{$p} );
}
my $countnut = 0;
my @ncode =();
for my $i (0 .. $#letters) {
if ( $ratio{ $letters[$i] } >= 0.05 ) {
push @ncode, $i + 1;
#print "$t[1], @ncode, $letters[$i]\n";
$countnut++;
}
}
my $ccode = join ("", @ncode);
chomp $ccode;
if ( ! exists( $hash{$ccode} )) {
# print "not $t[1], $ccode\n";
substr($seq, ($t[1]-1), 1, "N");
}
else {
# print "yes $t[1], $ccode\n";
substr($seq, ($t[1]-1), 1, "$hash{$ccode}");
}
}
}
# printf( "Data_line %5d : t1= %5s ; report= %s \n", $., $t[1], $r
+eport );
}
} # end of while
print "$header"; # printing back the fasta header
$seq= uc($seq) ;
# print "$seq\n"; ;
&p2q_post_process(\$seq) ;
#print "$header"; # printing back the fasta header
} # end of pileup
##################
sub p2q_post_process {
my ($seq) = @_;
print &p2q_print_str($seq);
}
#
sub p2q_print_str {
my ($s) = @_;
my $l = length($$s);
for (my $i = 0; $i < $l; $i += 60) {
print substr($$s, $i, 60), "\n";
}
}
#
sub usage {
die(qq/
Program: modify.pileup.pl
Usage: modify.pileup.pl <command> [<arguments>]\n
Command: pileup2markedfasta generate maked.fasta from `pileup -c r
+aw pileup'
\n/);
}
input.pileup
2L 1 G A 48 48 26 7 AAaaAAA ACCCD?@
2L 2 A A 48 0 26 7 ..,,... AACC@BC
2L 3 T K 18 18 26 7 .Ggg... ACCCCCC
2L 4 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBCC
+BBC@3CCBBD=?
2L 5 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CDC
+CDACB9CCA@B?@D
2L 6 c C 78 0 37 17 .,..,,.,,.,,,,,,, CBCCB
+AC>9CBB@BA?A
2L 7 c C 45 0 37 19 .,..,,.,,.+1A,,,,t,,^F,^Ft
+ CDCCDAC>;CBCBD*?DAA
2L 7 * */* 55 0 37 19 * A 18 1 0
+ 0 0
2L 8 a A 162 0 21 45 ,-4gtct,-4gtct,,....,,,,,
+,.,.,,,,.,,,,,,,,...,.,.,,,,.,,, CCCCA<;=/CCACDBCBCCCCDCCCCBBCCBDC
+CCCCDCDDCCAB
2L 8 * */* 173 0 21 45 * -gtct 43 3
+ 0 0 0
2L 9 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F,
+CDCCCAC?BCBBCD?<DBBB
2L 10 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C
+BCDABC>BCACBB:AC?4C
2L 11 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F.
+ CDCCCAC@@CDDCC<:DBABC
2L 12 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,...
+.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC
2L 12 * */-gt 21 21 14 31 * -gt 30 1
+ 0 0 0
2L 13 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,.
+CDCCCAC?:CDB5?<>BB>BC
2L 14 t T 45 0 37 6 ..,,,.-1T CCCCCC
2L 14 * -t/-t 56 59 37 6 -t * 1 5
+0 0 0
2L 15 t T 93 0 37 22 .,..,,.,,.,,,G,,,,,,.^F.
+ CDCDDDC<@CBB5B@ABC<ACC
2L 16 G G 178 0 36 50 .$,$,$,$,,,,,.,,,,..,.,,
+...,,,,.,,...,....,...,,,$,,,., BCCCCCCCCCCCCBDBCBCCB>8CCCCBCCBDBC
+8>ACCBB6CC?CCC6C
2L 17 A A 59 0 36 45 ,$,$,,,C,,,c..,.,,C..,,,,
+.,,...,....,..C,,,,,$., CCCCCBCCCBB8CACC>@;CCCCDCCCDAC-5ABCAA2CCCC
+?1A
2L 18 w T 54 54 37 9 tTTTTttTT CCDCCDCCC
2L 18 * A/+A 65 279 37 9 A * 8 1 0
+ 0 0
2L 19 g G 54 0 37 9 ....,,... >>@@CC>BB
2L 20 c C 57 0 37 10 ....,,...^F, CACCCC?CB
+B
2L 21 t T 57 0 37 10 ....,,..., BCDCCCCCB?
2L 22 t T 48 0 34 7 ..,.,,+4caaa,+4caaa C?C
+CA=?
2L 22 * */* 9 0 34 7 * +CAAA 5 2 0
+ 0 0
2L 23 A A 48 0 34 7 .$.$,.,+1g,, DBC?CCA
2L 23 * */* 19 0 34 7 * +G 6 1 0
+ 0 0
2L 24 g G 60 0 14 33 .,...,,.-13GGCGCGCGTGCGC.
+,,A,,A,,A,..........aa.. DCBBBBDBCCCCBCCCDCBDCBCCCACCBABCC
2L 24 * */* 61 0 14 33 * -ggcgcgcgtgcgc
+32 1 0 0 0
2L 25 a A 163 0 15 49 ..$,..,,,t,.T,,,..,,,,T,
+-7attattt,,-7attattt,,,,,,....,,......,.,,.. BBCA>BCCCC:CCCC>ACCCC
+BCCCCBCCCCCCDCCCCCCCCBCBCDCC
2L 25 * */-attattt 27 27 15 49 * -attattt
+ 47 2 0 0 0
2L 26 c C 32 0 0 11 ,,-4tctt,,.-4TCTT...-4TCTT
+.^!.^!, CCC8CCCCBCA
2L 26 * */-tctt 17 17 0 11 * -tctt 8
+3 0 0 0
2L 27 T K 18 18 26 7 .Ggg... ACCCCCC
2L 28 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBC
+CBBC@3CCBBD=?
2L 29 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CD
+CCDACB9CCA@B?@D
2L 30 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F,
+ CDCCCAC?BCBBCD?<DBBB
2L 31 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C
+BCDABC>BCACBB:AC?4C
2L 32 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F.
+ CDCCCAC@@CDDCC<:DBABC
2L 33 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,...
+.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC
Output:
>input.fasta
GATCCccagagcgttGAwgcttAgacTCCgagc
>input.fasta
AAKMMCYA----GNTGMTGCTTARWC----AGC
|