>ID1
atgagcagctag
>ID2
tggacgagctgaca
.
.
>IDn
tgactagacggacatac
####
>ID1
actggctagaag
>ID2
cggaccagggacta
.
.
>IDn
ctgaaaaaggagccttt
####
use strict;
use warnings;
use List::Util 'shuffle'; # Idea from http://www.perlmonks.org/?node_id=199901
##########################################################################################
my $start_time =time;
my $input = shift @ARGV;
my $destination = shift @ARGV;
open(IN, '<', $input) or die "Can't read multifasta input genomic DNA file $input : $!\n";
my $window = 1000000; # hard coded for shuffle window to be 1MB i.e 10^6
print "The length of the window shuffled is 1MB (i.e. 10^6 bp), it is hard coded, you may change it in line 27 of this script. Thank you! \n";
my (@IDs, @lengths, @IDsLens, @output);
my $concat_seq="";
my $total_length= 0;
##########################################################################################
while () {
chomp;
if ($_ =~ m/\>/) {
my $ID1 = my $ID2 = $_;
push @IDs, $ID1;
push @IDsLens, $ID2;
}
elsif ($_ !~ m/\>/) {
my $len1 = my $len2 = length($_);
push @lengths, $len1;
$total_length = $total_length + $len1;
push @IDsLens, $len2;
$concat_seq = $concat_seq.$_; # concatenating the entire genome into one long sequence before breaking it for shuffling
}
}
my %IDLen_hash = @IDsLens;
##########################################################################################
my $i = 0;
my $cat_shuf_full_seq="";
for (my $i = 1; $i <= $total_length; $i=$i+$window ) { # perform this operation below every 1MB, see line 21 above
my $s = substr ($concat_seq, $i - 1, $window);
my $rev = reverse $s;
my @temp_seq_array;
if ($i % 2 == 0) {@temp_seq_array = split //, $rev;} # throw in some reverse sequence alternatively to shuffle it more randomly
elsif ($i % 2 != 0) {@temp_seq_array = split //, $s;}
my @rand_seq_array = shuffle @temp_seq_array; # using the List::Util module
my $rand_shuffled_substr = join ('', @rand_seq_array,);
$cat_shuf_full_seq = $cat_shuf_full_seq.$rand_shuffled_substr; # concatenates the shuffled DNA seq to the 3' end of the previous 1MB fragment
}
my @shuffle_concat_array = split("", $cat_shuf_full_seq);
##########################################################################################
foreach my $ID (@IDs) {
my $contig_len = $IDLen_hash{$ID};
my @shuffle_concat_splice_seq = splice @shuffle_concat_array, 0, $contig_len;
# this will progressively reduce the length of @shuffle_concat_splice_seq
# until the final splice operation will be for the length of the final contig in original sequence
# which should also be the same as the length of the remaining array assigned to this array variable
my $cat_shuf_splice_final_subseq = join('', @shuffle_concat_splice_seq);
print $ID, "\n";
push @output, $ID, "\n", $cat_shuf_splice_final_subseq, "\n";
}
##########################################################################################
#my @input_name_split = split(".fa.shIDscleaned-up",$input);
#my $destination = $input_name_split[0]."_cat_shuf1MB_frag.fasta";
open(OUT, '>', $destination) or die "Can't write to file $destination: $!\n";
print OUT @output;
close OUT;
my $end_time = time;
my $duration = ($end_time - $start_time)/60;
my $rounded = sprintf("%.1f", $duration);
print "Finished processing input, written output to ", $destination, " in ", $rounded, " minutes \n";
##########################################################################################