#!/usr/bin/perl
# Length_dstrbtn_seq_extractor.pl
# This Perl script accepts two input files. The first input is a multifasta file containing alternating lines of fasta headers and sequences, generated using the Perl script fix_multifasta0.pl.
# The script calculates the frequency distribution of sequence lengths in this first input file.
# The second input file is the user-chosen multifasta sequence file from which sequences with the same frequency distribution of lengths will be extracted.
# The only output is a multifasta sequence file. The fasta headers are intended to specify source sequence, start coordinate and stop coordinate; NOTE that the code currently writes sequentially numbered headers instead (>1extracted_seq, >2extracted_seq, ...).
# Syntax : perl Length_dstrbtn_seq_extractor.pl <input1> <input2>

#********************#
# PROCESS INPUT FILES
#********************#
use strict;
use warnings;

# Length of a stop codon (TAA, TGA or TAG). Every length observed in input
# file #1 is reduced by this amount so that only the coding sequence's
# length is extracted from input file #2.
use constant STOP_CODON_LENGTH => 3;

# Run the pipeline only when this file is executed as a script, so that the
# subroutines below can be loaded and exercised without side effects.
main(@ARGV) unless caller;

# main($input1, $input2)
#   $input1 - multifasta file whose sequence-length distribution is measured
#   $input2 - multifasta file from which random subsequences are extracted
# Writes the extracted sequences to "<input1><input2>_extracted_seqs.fasta"
# and prints the elapsed time to STDOUT.
# Dies on missing arguments, unreadable/unwritable files, an empty input 2,
# or when no sequence in input 2 is long enough for a requested length.
sub main {
    my ($input1, $input2) = @_;

    die "Usage: perl Length_dstrbtn_seq_extractor.pl <input1> <input2>\n"
        unless defined($input1) && defined($input2);

    my $start_time = time;

    my @lengths = map { length $_ } read_sequence_lines($input1);
    my @source  = read_sequence_lines($input2);
    die "No sequences found in source file $input2\n" unless @source;

    #********************#
    # CALCULATE LENGTH DISTRIBUTION FROM INPUT FILE #1
    #********************#
    # Observed sequence length (key) => number of times that length occurs
    # in input file #1 (value).
    my $count_for = length_distribution(@lengths);

    #********************#
    # EXTRACT SEQUENCES (RANDOMLY) FROM INPUT2, WITH IDENTICAL LENGTH DISTRIBUTION OF INPUT 1
    #********************#
    my @output;
    my $header_count = 1;

    # Lengths are visited in ascending numeric order so that the order of
    # records in the output does not depend on hash ordering.
    for my $observed_length (sort { $a <=> $b } keys %{$count_for}) {
        my $size = $observed_length - STOP_CODON_LENGTH;
        my $freq = $count_for->{$observed_length};

        # A non-positive size cannot be extracted (a negative length would
        # make substr trim the tail instead), so such lengths are skipped.
        if ($size < 1) {
            warn "Skipping $freq sequence(s) of length $observed_length: "
               . "nothing left after removing the stop codon\n";
            next;
        }

        # Only sequences at least $size long can supply a substring of
        # that size. Failing here avoids retrying forever.
        my @candidates = grep { length($_) >= $size } @source;
        die "No sequence in $input2 is at least $size characters long\n"
            unless @candidates;

        for (1 .. $freq) {
            my $extracted_seq = extract_random_subsequence(\@candidates, $size);
            push @output, ">".$header_count."extracted_seq", "\n";
            push @output, $extracted_seq, "\n";
            $header_count++;
        }
    }

    #********************#
    # WRITE TO OUTPUT FILE, REPORT TIME TAKEN FOR CALCULATION
    #********************#
    # The output name is the plain concatenation of both input names (kept
    # for compatibility), so the inputs are expected to be bare file names.
    my $filename = $input1.$input2."_extracted_seqs.fasta";
    open(my $out, '>', $filename) or die "Can't write to file $filename : $!\n";
    print {$out} @output or die "Can't write to file $filename : $!\n";
    close $out or die "Can't write to file $filename : $!\n";

    my $end_time = time;
    my $duration = ($end_time - $start_time)/60;
    print "Thank you for your patience, this Perl script operation has completed, it took $duration minutes, good bye!", " \n";
    return;
}

# read_sequence_lines($path)
# Returns the list of sequence lines of the file at $path: every non-empty
# line that does not contain a '>' character (lines containing '>' are
# treated as fasta headers). Dies if the file cannot be opened.
sub read_sequence_lines {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Can't read source file $path : $!\n";

    my @sequences;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;       # drop LF or CRLF so lengths are exact
        next if $line =~ />/;       # fasta header
        next if $line eq '';        # blank line carries no sequence
        push @sequences, $line;
    }
    close $fh;

    return @sequences;
}

# length_distribution(@lengths)
# Returns a hash reference mapping each distinct length to the number of
# times it occurs in @lengths.
sub length_distribution {
    my (@lengths) = @_;

    my %count_for;
    $count_for{$_}++ for @lengths;

    return \%count_for;
}

# extract_random_subsequence($candidates, $size)
#   $candidates - reference to a non-empty array of strings, each at least
#                 $size characters long
#   $size       - positive length of the substring to extract
# Picks one candidate uniformly at random, then a start coordinate uniformly
# among those for which the substring ends within the sequence, and returns
# that substring.
sub extract_random_subsequence {
    my ($candidates, $size) = @_;

    my $sequence   = $candidates->[ int(rand(scalar @{$candidates})) ];
    my $last_start = length($sequence) - $size;
    my $start      = int(rand($last_start + 1));

    return substr($sequence, $start, $size);
}

#********************#