>ID1 atgagcagctag >ID2 tggacgagctgaca . . >IDn tgactagacggacatac #### >ID1 actggctagaag >ID2 cggaccagggacta . . >IDn ctgaaaaaggagccttt #### use strict; use warnings; use List::Util 'shuffle'; # Idea from http://www.perlmonks.org/?node_id=199901 ########################################################################################## my $start_time =time; my $input = shift @ARGV; my $destination = shift @ARGV; open(IN, '<', $input) or die "Can't read multifasta input genomic DNA file $input : $!\n"; my $window = 1000000; # hard coded for shuffle window to be 1MB i.e 10^6 print "The length of the window shuffled is 1MB (i.e. 10^6 bp), it is hard coded, you may change it in line 27 of this script. Thank you! \n"; my (@IDs, @lengths, @IDsLens, @output); my $concat_seq=""; my $total_length= 0; ########################################################################################## while () { chomp; if ($_ =~ m/\>/) { my $ID1 = my $ID2 = $_; push @IDs, $ID1; push @IDsLens, $ID2; } elsif ($_ !~ m/\>/) { my $len1 = my $len2 = length($_); push @lengths, $len1; $total_length = $total_length + $len1; push @IDsLens, $len2; $concat_seq = $concat_seq.$_; # concatenating the entire genome into one long sequence before breaking it for shuffling } } my %IDLen_hash = @IDsLens; ########################################################################################## my $i = 0; my $cat_shuf_full_seq=""; for (my $i = 1; $i <= $total_length; $i=$i+$window ) { # perform this operation below every 1MB, see line 21 above my $s = substr ($concat_seq, $i - 1, $window); my $rev = reverse $s; my @temp_seq_array; if ($i % 2 == 0) {@temp_seq_array = split //, $rev;} # throw in some reverse sequence alternatively to shuffle it more randomly elsif ($i % 2 != 0) {@temp_seq_array = split //, $s;} my @rand_seq_array = shuffle @temp_seq_array; # using the List::Util module my $rand_shuffled_substr = join ('', @rand_seq_array,); $cat_shuf_full_seq = $cat_shuf_full_seq.$rand_shuffled_substr; # concatenates the shuffled DNA seq to the 3' end of the previous 1MB fragment } my @shuffle_concat_array = split("", $cat_shuf_full_seq); ########################################################################################## foreach my $ID (@IDs) { my $contig_len = $IDLen_hash{$ID}; my @shuffle_concat_splice_seq = splice @shuffle_concat_array, 0, $contig_len; # this will progressively reduce the length of @shuffle_concat_splice_seq # until the final splice operation will be for the length of the final contig in original sequence # which should also be the same as the length of the remaining array assigned to this array variable my $cat_shuf_splice_final_subseq = join('', @shuffle_concat_splice_seq); print $ID, "\n"; push @output, $ID, "\n", $cat_shuf_splice_final_subseq, "\n"; } ########################################################################################## #my @input_name_split = split(".fa.shIDscleaned-up",$input); #my $destination = $input_name_split[0]."_cat_shuf1MB_frag.fasta"; open(OUT, '>', $destination) or die "Can't write to file $destination: $!\n"; print OUT @output; close OUT; my $end_time = time; my $duration = ($end_time - $start_time)/60; my $rounded = sprintf("%.1f", $duration); print "Finished processing input, written output to ", $destination, " in ", $rounded, " minutes \n"; ##########################################################################################