use strict;
use warnings;

# Extract random subsequences from a source FASTA file so that their lengths
# follow the line-length distribution observed in a reference file.
#
# Usage: script.pl <length_reference_file> <source_fasta>
#
#   <length_reference_file>  every line that does not contain '>' contributes
#                            its length to the length distribution
#   <source_fasta>           file the subsequences are cut from
#
# Output is written to "<input1><input2>_extracted_seqs.fasta" as records of
# the form ">Nextracted_seq" followed by the extracted subsequence.
# Dies if a file cannot be opened or if no source sequence is long enough.

my $start_time = time;

die "Usage: $0 <length_reference_file> <source_fasta>\n" unless @ARGV >= 2;
my $input1 = shift @ARGV;
my $input2 = shift @ARGV;

# Pass 1: build the length distribution (length => number of occurrences).
my %dstrbtn_hash;
open(my $in1, '<', $input1) or die "Can't read source file $input1 : $!\n";
while (my $line = <$in1>) {
    chomp $line;
    next if $line =~ />/;    # lines containing '>' are treated as headers
    $dstrbtn_hash{ length($line) }++;
}
close $in1;

# Pass 2: load the source sequences.
# NOTE(review): this assumes every record is exactly one header line followed
# by one sequence line (no wrapped sequences) -- confirm against the real data.
open(my $in2, '<', $input2) or die "Can't read source file $input2 : $!\n";
my @source;
while (defined(my $header = <$in2>)) {    # the header line itself is ignored
    my $seq = <$in2>;
    last unless defined $seq;             # trailing header without a sequence
    chomp $seq;
    push @source, [ $seq, length($seq) ]; # keep the length alongside the sequence
}
close $in2;

my $filename = $input1 . $input2 . "_extracted_seqs.fasta";
open(my $out, '>', $filename) or die "Can't write to file $filename : $!\n";

my $header_count = 1;

# Sorted iteration keeps the processing order independent of hash ordering.
for my $key (sort { $a <=> $b } keys %dstrbtn_hash) {
    my $freq = $dstrbtn_hash{$key};

    # NOTE(review): the 3-character reduction is kept from the original script;
    # the reason for it is not visible in this file.
    my $size = $key - 3;

    # A zero or negative size would make substr() return an empty string or
    # silently trim from the end, so such lengths cannot be sampled.
    if ($size < 1) {
        warn "Skipping $freq line(s) of length $key: extraction size would be $size\n";
        next;
    }

    # The candidate list depends only on $size, so build it once per length
    # instead of once per extracted sequence.
    my @cand = map { $_->[0] } grep { $_->[1] >= $size } @source;
    die "No long enough sequence found in $input2\n" unless @cand;

    for my $iteration (1 .. $freq) {
        my $temp_source_seq = $cand[ int(rand(scalar @cand)) ];

        # Valid start offsets are 0 .. (length - size) inclusive, hence the +1:
        # rand(N) returns a value strictly below N.
        my $last_start         = length($temp_source_seq) - $size;
        my $random_start_coord = int(rand($last_start + 1));
        my $extracted_seq      = substr($temp_source_seq, $random_start_coord, $size);

        print {$out} ">" . $header_count . "extracted_seq", "\n";
        print {$out} $extracted_seq, "\n";
        $header_count++;
    }
}

# Buffered write errors only surface when the handle is closed.
close $out or die "Can't finish writing to file $filename : $!\n";

my $end_time = time;
my $duration = ($end_time - $start_time) / 60;
print "Thank you for your patience, this Perl script operation has completed, it took $duration minutes, good bye!", " \n";