#!/usr/bin/perl -w use strict; use POSIX; use Getopt::Std; use Data::Dumper; my $usage = "\nusage: $0 [-flags] (must be all 1 type. ie all fasta NT, all fastq, all fasta fna)\n". "The script is able to detect if fasta or fastq files have been suplied. Depending on the flags, it produces different output.\n". "It will produce a histogram, a vector distribution, and a NT/AA distribution. For fastq files, you must use a flag to specify which version\n". "of fastq the file is formatted with.\n\n". "-l # Y or N to specify Solexa/Illumina < 1.3 Fastq: Defaults N\n". "-s # Y or N to specify Sanger Fastq: Defaults N\n". "-u # Y or N to specify Illumina >= 1.3 Fastq: Defaults N\n"; our($opt_l, $opt_s, $opt_u, $opt_b); # histogram interval getopts('l:s:u:b:') or die $usage; if (!defined($opt_l)) {$opt_l = 'N';} if (!defined($opt_s)) {$opt_s = 'N';} if (!defined($opt_u)) {$opt_u = 'N';} if (!defined($opt_b)) {$opt_u = 'N';} if( ( $#ARGV + 1 ) < 1 ) { die $usage; } my $infile = shift or die $usage; my $outfile = shift or die $usage; open OUT, ">$outfile" or die $usage; open STDERR, ">/dev/null"; # Read in sequences from one or more fasta files my $Id; my $nextline; my $qual; my @phred_means; my @phred1scr; my $numseqs = 0; my @total; my @count; my @min; my @max; my @stddev; my @matrix; my @Q1, my @Q2; for(my $j = 0; $j < 70; $j++) { $total[$j] = 0; $count[$j] = 0; $min[$j] = 100; $max[$j] = 0; } for(my $i = 0; $i < 40; $i++) { for(my $j = 0; $j < 60; $j++) { $matrix[$i][$j] = 0; } } #automatic-1fasta or fastq detection open(FASTA, $infile) or die "Can't open file $infile\n"; while () { ##if Fastq format if(/^@(.+)\s/) { $Id = $1; my $nextline = ; if($nextline =~ /^(\S+)$/) { my $nextline = ; if($nextline =~ /^\+(.*)\s/) { my $nextline = ; if($nextline =~ /^(\S+)$/) { $qual = $1; $numseqs += 1; ##returns an array with quality raw scores @phred1scr = fastq_processing($qual); my $i = 0; ##adds each returned array score to the correct $total[position] foreach my $score (@phred1scr) { $score = round($score); $total[$i] += $score; $count[$i] += 1; if($score > $max[$i]) { $max[$i] = $score; } if($score < $min[$i]) { $min[$i] = $score; } if($score >= 0 and $score <= 40) { $matrix[$score][$i]++; } $i++; } } } } } } close (FASTA); ##calculates the means for my $j (0..69) { last if($count[$j] == 0); my $avg = $total[$j] / $count[$j]; push (@phred_means, $avg); } #print Dumper(\@matrix); ##calculate the stddevs and quartiles #if(@val60) { # my $total2 = 0; # foreach my $num (@val60) { # $total2 += ($phred_means[59]-$num)**2; # } # my $mean2 = $total2 / (scalar @val60); # my $std_dev = sqrt($mean2); # push(@stddev, $std_dev); #} fastq_dist(\@phred_means); print OUT "\n\nScore\t\tCount\n"; #my @col = get_col(\@matrix, 38); #print Dumper(\@col); for my $i (0..$#matrix) { my @col = map { $_->[$i] } \@matrix; print Dumper(\@col); my $score = score_count(@col); # print OUT "$i\t\t$score\n"; } ########################################### ##### Sub Processing Routines follow ###### ########################################### sub round { my($number) = shift; return int($number + .5); } sub score_count { my @arr = shift; my $total = 0; foreach my $score (@arr) { print "@$score "; $total += @$score; } print "$total\n"; return $total; } sub fastq_processing { my $raw_qual = shift; my $i = 1; my @phred1seq; my @bases = split(//, $raw_qual); #different phred score calculations depending on fastq version foreach my $byte (@bases) { if($opt_l eq 'Y') { my $phred = (10 * log(1 + 10 ** ((ord($byte) - 64) / 10.0)) / log(10)); push (@phred1seq, $phred); } if($opt_s eq 'Y') { my $phred = ord($byte) - 33; push (@phred1seq, $phred); } if($opt_u eq 'Y') { my $phred = ord($byte) - 64; push (@phred1seq, $phred); } } return @phred1seq; } sub fastq_dist { my @means = shift; my $i; my $text; if($opt_l eq 'Y') { $text = "Solexa/Illumina < 1.3 scoring"; } elsif ($opt_s eq 'Y') { $text = "Sanger scoring"; } elsif ($opt_u eq 'Y') { $text = "Illumina >= 1.3 scoring"; } print OUT "Base\nPos\tCount\tMin\tMax\tStd Dev\tMean Phred\tHistogram - $text\n"; my $lrgstpos = 0; my $mean_arr = $means[0]; foreach my $mean (@$mean_arr) { ##get's the number of phred values per sequence if($mean > 0) { $lrgstpos += 1; } } #loops through the total number of postions and gets the mean for each for my $i (0 .. ($lrgstpos - 1)) { my $mean = $$mean_arr[$i]; my $pos = $i + 1; print OUT "$pos\t$count[$i]\t"; # printf OUT "%.2f\t%.2f\t%.2f\t%.2f\t", $min[$i], $max[$i], $stddev[$i], $mean; my $histogram = "|" x $mean; print OUT $histogram."\n"; } print OUT "\n"; } #### @SRR006331.1 201WGAAXX:1:1:782:676 length=36 TTTTAGGTCAAAATCAGTTCTATTAGCAACTCCTAA +SRR006331.1 201WGAAXX:1:1:782:676 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.2 201WGAAXX:1:1:821:689 length=36 TCTTAAGTTTTTTGCATATTCATAATTATATAAATC +SRR006331.2 201WGAAXX:1:1:821:689 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.3 201WGAAXX:1:1:845:648 length=36 TTTTATTGCCATTTTTGTCAACAAAATGTATTCCAT +SRR006331.3 201WGAAXX:1:1:845:648 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.4 201WGAAXX:1:1:863:661 length=36 TATGAAGTTACTAAAAAAATAATTATTGATAAGATT +SRR006331.4 201WGAAXX:1:1:863:661 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.5 201WGAAXX:1:1:521:222 length=36 TAATAATAACTAGTTCATCTTTAGCCCTAGTTATAG +SRR006331.5 201WGAAXX:1:1:521:222 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.6 201WGAAXX:1:1:774:423 length=36 TTAACTAAATCCTCTTGATTTATAGAGGCTTTTTTT +SRR006331.6 201WGAAXX:1:1:774:423 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006331.7 201WGAAXX:1:1:658:111 length=36 TACTTTATCACCATTAAAATAAAGCTGATTTTCTGA +SRR006331.7 201WGAAXX:1:1:658:111 length=36 AIIIIIIIIIIIIIIAIIIIIIIIIIIIIIIIIIII @SRR006331.8 201WGAAXX:1:1:413:670 length=36 TTTATCACATTTATCTAATTCTTCATATTGTGCTTT +SRR006331.8 201WGAAXX:1:1:413:670 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII @SRR006369.8 201WGAAXX:1:1:413:670 length=36 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +SRR006369.8 201WGAAXX:1:1:413:670 length=36 IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII #### perl fastq_qualquartileBIN.pl -s Y test.txt testing.out