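# Input format (multi-FASTA):
#
# >ID1
# atgagcagctag
# >ID2
# tggacgagctgaca
# .
# .
# >IDn
# tgactagacggacatac
#
##
#
# Output format (same IDs and per-ID lengths, shuffled bases):
#
# >ID1
# actggctagaag
# >ID2
# cggaccagggacta
# .
# .
# >IDn
# ctgaaaaaggagccttt
#
##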

use strict;
use warnings;
use List::Util 'shuffle'; # Idea from http://www.perlmonks.org/?node_id=199901

##########################################################################################

# Wall-clock start time; used at the end of the script to report the run time.
my $start_time = time;

# Command line: <input multi-FASTA file> <output FASTA file>.
my $input       = shift @ARGV;
my $destination = shift @ARGV;

# Fail early with a usage message rather than letting open() trip over an
# undefined file name.
die "Usage: $0 <input_multifasta> <output_fasta>\n"
    unless defined $input && defined $destination;

open(IN, '<', $input) or die "Can't read multifasta input genomic DNA file $input : $!\n";

my $window = 1000000; # hard coded for shuffle window to be 1MB i.e 10^6
# Report the actual value and name the variable instead of a line number,
# so the notice stays correct when the script is edited.
print "The length of the window shuffled is $window bp; it is hard coded in the \$window variable near the top of this script. Thank you! \n";

# @IDs     - header lines in input order
# @lengths - length of every sequence line read
# @IDsLens - flat (ID, length, ID, length, ...) list
# @output  - pieces of the FASTA text to be written at the end
my (@IDs, @lengths, @IDsLens, @output);

my $concat_seq   = "";   # all sequence lines joined into one string
my $total_length = 0;    # number of bases in $concat_seq

##########################################################################################

# Read the multi-FASTA input from the IN handle, one line at a time.
#
# Any line containing '>' is a header: it is stored verbatim (including the
# '>') in @IDs. Every other non-blank line is sequence: it is appended to
# $concat_seq and its length is added to the running total of the current
# record, so sequences wrapped over several lines are handled correctly.
#
# Results: @IDs (headers in input order), %IDLen_hash (header => total
# sequence length), @lengths (length of each sequence line), $total_length
# and $concat_seq.
my %IDLen_hash;
my $current_ID;

while (my $line = <IN>) {
    $line =~ s/\r?\n\z//;            # strip the line ending, LF or CRLF
    next if $line =~ m/\A\s*\z/;     # blank lines carry no data

    if ($line =~ m/\>/) {
        # Lengths are looked up by header text, so two identical headers
        # would silently share one entry and corrupt the output.
        die "Duplicate sequence ID '$line' at line $. of the input\n"
            if exists $IDLen_hash{$line};
        $current_ID = $line;
        push @IDs, $current_ID;
        $IDLen_hash{$current_ID} = 0;   # records without sequence get length 0
        next;
    }

    die "Sequence data before the first '>' header at line $. of the input\n"
        unless defined $current_ID;

    my $len = length $line;
    push @lengths, $len;
    $IDLen_hash{$current_ID} += $len;
    $total_length            += $len;
    $concat_seq .= $line; # concatenating the entire genome into one long sequence before breaking it for shuffling
}

close IN;

# Keep the flat (ID, length, ...) list in step with the hash.
@IDsLens = map { ($_, $IDLen_hash{$_}) } @IDs;

##########################################################################################

# Shuffle the concatenated genome one window at a time, then cut the shuffled
# string back into records that carry the original IDs and lengths.
#
# Reads:  $concat_seq, $total_length, $window, @IDs, %IDLen_hash
# Writes: @output (ID, newline, shuffled sequence, newline for each record)

# A non-positive window would never advance the loop below.
die "The shuffle window must be a positive number of bases\n" unless $window > 0;

my $cat_shuf_full_seq = "";

# Each window of $window bases is shuffled independently with List::Util's
# shuffle and appended to the 3' end of what has been shuffled so far. The
# last window may be shorter than $window.
# No reversal is applied first: the bases are fully permuted by shuffle, so
# reversing beforehand would not change the result.
for (my $offset = 0; $offset < $total_length; $offset += $window) {
    my $fragment = substr($concat_seq, $offset, $window);
    $cat_shuf_full_seq .= join('', shuffle(split //, $fragment));
}

##########################################################################################

# Walk along the shuffled string, handing each ID as many bases as its
# original sequence had. substr on the string avoids building an array with
# one element per base.
my $next_start = 0;

foreach my $ID (@IDs) {
    my $contig_len = $IDLen_hash{$ID};
    my $cat_shuf_splice_final_subseq = substr($cat_shuf_full_seq, $next_start, $contig_len);
    $next_start += $contig_len;

    print $ID, "\n";   # progress: one line per record processed
    push @output, $ID, "\n", $cat_shuf_splice_final_subseq, "\n";
}

##########################################################################################

#my @input_name_split = split(".fa.shIDscleaned-up",$input);
#my $destination = $input_name_split[0]."_cat_shuf1MB_frag.fasta";
# Write the collected FASTA records to $destination and report the run time.
# print and close are both checked: buffered write errors (e.g. a full disk)
# may only surface when the handle is closed.
open(my $out_fh, '>', $destination) or die "Can't write to file $destination: $!\n";
print {$out_fh} @output or die "Can't write to file $destination: $!\n";
close($out_fh) or die "Can't write to file $destination: $!\n";

# Elapsed wall-clock time in minutes, rounded to one decimal place.
my $end_time = time;
my $duration = ($end_time - $start_time)/60;
my $rounded = sprintf("%.1f", $duration);
print "Finished processing input, written output to ", $destination, " in ", $rounded, " minutes \n";

##########################################################################################