my @third_fragments = grep !/$rsite3/, $second_fragments[$i]; ##
##

#! /usr/bin/perl -w
# a script to get fragments of a genome based on restriction enzyme
# retrieves the fragments greater than XX bp and less than XX bp
# whole genome in one gzip file (fasta format)!
# calculates the number of fragments you will get per genome for RADseq
# perl genomeFragmentor_doubleDigest.pl genome.gz restriction_site1 restriction_site2 organism
# perl genomeFragmentor_doubleDigest.pl Anolisgenome.gz CCTGCAGG GGATCC Anolis

use strict;

my %genome = fasta_read_gzip_alt($ARGV[0]);
#my %genome = fasta_read_alt($ARGV[0]);
my $rsite1 = $ARGV[1];
my $rsite2 = $ARGV[2];
my $rsite3 = $ARGV[3];

my $totalCount = 0;			# count of all fragments in the genome
my $totalBP = 0;			# the total number of base pairs in the selected fragments

my $organism = $ARGV[4];

my $radtagfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3.".fasta";
my $sizesfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_sizes.txt";
open (OUT, ">$radtagfile");
open (SIZE, ">$sizesfile");
print SIZE "length	organism\n";

# prepare a summary file
my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);
$Month++;
$Year += 1900;
my $date = $Month."_".$Day."_".$Year;
my $summaryfile = $organism."_summary_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_".$date.".txt";
open (RESULTS, ">$summaryfile");

my %final_fragments;
my @all_size_fragments;

# start creating the fragments
foreach my $seqname (keys %genome) {
	print "Working on sequence $seqname.\n";
	
	my @first_fragments = split("$rsite1", $genome{$seqname});			# split the fragments up based on the enzyme motif
	
	# add the restriction site motif back onto the fragments
	$first_fragments[0] .= $rsite1;	
	$first_fragments[scalar @first_fragments - 1] = $rsite1.$first_fragments[scalar @first_fragments - 1];	 
	
	for (my $i = 0; $i < scalar @first_fragments; ++$i) {
		if ($i != 0 or $i != (scalar @first_fragments - 1)) {
			$first_fragments[$i] = $rsite1.$first_fragments[$i].$rsite1;
		}
		
		# now split the fragment using $rsite2
		# repair the first and last fragments to include $rsite2
		# these are the only fragments to contain both restriction sites, so keep them in @final_fragments
		my @second_fragments = split($rsite2, $first_fragments[$i]);
		
		$second_fragments[0] .= $rsite2;
		$second_fragments[scalar @second_fragments - 1] = $rsite2.$second_fragments[scalar @second_fragments - 1];
		foreach my $fragment (@second_fragments) {
			push(@all_size_fragments, length($fragment));
			
			}
			
		my @third_fragments = grep !/$rsite3/, $second_fragments[$i];
		
		
		$third_fragments[0] .= $rsite3;
		$third_fragments[scalar @third_fragments - 1] = $rsite3.$third_fragments[scalar @third_fragments - 1];
		foreach my $fragment (@third_fragments) {
			push(@all_size_fragments, length($fragment));
			}
		my $final_fragment1 = $seqname."_".$i."_1";	 
		my $final_fragment2 = $seqname."_".$i."_2";	
		$final_fragments{$final_fragment1} = $third_fragments[0];
		$final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1];
	}
	
}

# keep a score of how many fragments fall within a particular size range

my $size_100_150 = 0;
my $size_151_200 = 0;	
my $size_201_250 = 0;	
my $size_251_300 = 0;	
my $size_301_350 = 0;	
my $size_351_400 = 0;	
my $size_401_450 = 0;	
my $size_451_500 = 0;	
my $size_501_550 = 0;	
my $size_551_600 = 0;
my $size_small = 0;
my $size_large = 0;	

	
foreach my $fragment (keys %final_fragments) {		
	# add on $rsite1 to both sides of the fragment
	my $fragmentLength = length($final_fragments{$fragment});
	print OUT ">$fragment", "_", "1\n";
	print OUT substr($final_fragments{$fragment}, 0, 96), "\n";
	print OUT ">$fragment", "_", "2rc\n";
	print OUT revcom(substr($final_fragments{$fragment}, $fragmentLength - 96, 96)), "\n";
	$totalBP += $fragmentLength;
	
	if ($fragmentLength >= 100 and $fragmentLength <= 150) {
		++$size_100_150;
	} elsif ($fragmentLength > 150 and $fragmentLength <= 200) {
		++$size_151_200;
	} elsif ($fragmentLength > 200 and $fragmentLength <= 250) {
		++$size_201_250;
	} elsif ($fragmentLength > 250 and $fragmentLength <= 300) {
		++$size_251_300;
	} elsif ($fragmentLength > 300 and $fragmentLength <= 350) {
		++$size_301_350;
	} elsif ($fragmentLength > 350 and $fragmentLength <= 400) {
		++$size_351_400;
	} elsif ($fragmentLength > 400 and $fragmentLength <= 450) {
		++$size_401_450;
	} elsif ($fragmentLength > 450 and $fragmentLength <= 500) {
		++$size_451_500;
	} elsif ($fragmentLength > 500 and $fragmentLength <= 550) {
		++$size_501_550;
	} elsif ($fragmentLength > 550 and $fragmentLength <= 600) {
		++$size_551_600;
	} elsif ($fragmentLength < 100) {
		++$size_small;
	} elsif ($fragmentLength > 600) {
		++$size_large;
	}
}
	
$totalCount = scalar keys %final_fragments;			# count of all fragments in the genome
print RESULTS "The restriction sites used were:\n";
print RESULTS $ARGV[1], "\n";
print RESULTS $ARGV[2], "\n\n";

print RESULTS "There were ", $totalCount, " fragments from the whole genome.\n";

print RESULTS "There were ", $size_100_150, " fragments between 100 and 150 bp.\n";
print RESULTS "There were ", $size_151_200, " fragments between 151 and 200 bp.\n";
print RESULTS "There were ", $size_201_250, " fragments between 201 and 250 bp.\n";
print RESULTS "There were ", $size_251_300, " fragments between 251 and 300 bp.\n";
print RESULTS "There were ", $size_301_350, " fragments between 301 and 350 bp.\n";
print RESULTS "There were ", $size_351_400, " fragments between 351 and 400 bp.\n";
print RESULTS "There were ", $size_401_450, " fragments between 401 and 450 bp.\n";
print RESULTS "There were ", $size_451_500, " fragments between 451 and 500 bp.\n";
print RESULTS "There were ", $size_501_550, " fragments between 501 and 550 bp.\n";
print RESULTS "There were ", $size_551_600, " fragments between 551 and 600 bp.\n";
print RESULTS "There were ", $size_small, " fragments smaller than 100 bp.\n";
print RESULTS "There were ", $size_large, " fragments larger than 600 bp.\n\n";

print RESULTS "There are ", $totalBP, " base pairs in the fragments.\n";

print RESULTS "\nJust some notes of reference:\n";
print RESULTS "HiSeq 2000 gives 375,000,000 reads.\n"; 
print RESULTS "HiSeq 2500 gives 742,000,000 reads.\n"; 

close OUT;
close RESULTS;

foreach my $length (@all_size_fragments) {
	print SIZE $length, "\t", $organism, "\n";
}
exit;

sub fasta_read_gzip_alt {
	
	# reads in a gzip fasta file and pases it into a hash
	# version 1.0
	
	(my $filename) = @_;		# be sure to include the path
	my %fasta;
	open(FASTA, "gunzip -c $filename |") || die "can't open pipe to $filename";
	my $fastaData;
	my $sequence = '';
	my $name = '';
	while() {
		$fastaData = $_;
		$fastaData =~ s/\n//gms;
		if ($fastaData =~ />/) {
			if ($sequence) {				# if there is a sequence, then the sequence belongs to the last name
				$fasta{$name} = $sequence;
			}
			
			# reinitialize everything
			$sequence = '';				# start over!
			$name = $fastaData;
			$name =~ s/>//gms;
		} elsif (eof FASTA) {
			$fasta{$name} = $sequence;
		} else {
			$sequence .= $fastaData;
		}
	}
	close FASTA;
	
	return %fasta;

}

sub revcom {
	(my $sequence) = @_;
	$sequence = reverse($sequence);
	$sequence =~ tr/AGCTRYMKSWHBVDNagctrymkswhbvdn/TCGAYRKMSWDVBHNtcgayrkmswdvbhn/;
	return $sequence;
}

sub fasta_read_alt {
	
	# reads in a fasta file and pases it into a hash
	# version 1.0
	
	(my $filename) = @_;		# be sure to include the path
	my %fasta;
	open(FASTA, $filename);
	my $fastaData;
	my $sequence = '';
	my $name = '';
	while() {
		$fastaData = $_;
		$fastaData =~ s/\n//gms;
		if ($fastaData =~ />/) {
			if ($sequence) {				# if there is a sequence, then the sequence belongs to the last name
				$fasta{$name} = $sequence;
			}
			
			# reinitialize everything
			$sequence = '';				# start over!
			$name = $fastaData;
			$name =~ s/>//gms;
		} elsif (eof FASTA) {
			$fasta{$name} = $sequence;
		} else {
			$sequence .= $fastaData;
		}
	}
	close FASTA;
	return %fasta;

}