my @third_fragments = grep -v ($rsite3), $second_fragments[$i] my @final_fragment1 = $seqname."_".$i."_1"; my @final_fragment2 = $seqname."_".$i."_2"; $final_fragments{$final_fragment1} = $third_fragments[0]; $final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1]; ##

##

syntax error at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 68, near "my "
Global symbol "@final_fragment1" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 68.
Global symbol "$final_fragment1" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 70.
Global symbol "$final_fragment2" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 71.
Execution of /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl aborted due to compilation errors.
##

##

#! /usr/bin/perl -w
# a script to get fragments of a genome based on restriction enzyme
# retrieves the fragments greater than XX bp and less than XX bp
# whole genome in one gzip file (fasta format)!
# calculates the number of fragments you will get per genome for RADseq
# perl genomeFragmentor_doubleDigest.pl genome.gz restriction_site1 restriction_site2 organism
# perl genomeFragmentor_doubleDigest.pl Anolisgenome.gz CCTGCAGG GGATCC Anolis

use strict;

my %genome = fasta_read_gzip_alt($ARGV[0]);
#my %genome = fasta_read_alt($ARGV[0]);
my $rsite1 = $ARGV[1];
my $rsite2 = $ARGV[2];
my $rsite3 = $ARGV[3];

my $totalCount = 0;			# count of all fragments in the genome
my $totalBP = 0;			# the total number of base pairs in the selected fragments

my $organism = $ARGV[4];

my $radtagfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3.".fasta";
my $sizesfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_sizes.txt";
open (OUT, ">$radtagfile");
open (SIZE, ">$sizesfile");
print SIZE "length	organism\n";

# prepare a summary file
my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);
$Month++;
$Year += 1900;
my $date = $Month."_".$Day."_".$Year;
my $summaryfile = $organism."_summary_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_".$date.".txt";
open (RESULTS, ">$summaryfile");

my %final_fragments;

my @all_size_fragments;

# start creating the fragments
foreach my $seqname (keys %genome) {
	print "Working on sequence $seqname.\n";
	
	my @first_fragments = split("$rsite1", $genome{$seqname});			# split the fragments up based on the enzyme motif
	
	# add the restriction site motif back onto the fragments
	$first_fragments[0] .= $rsite1;	
	$first_fragments[scalar @first_fragments - 1] = $rsite1.$first_fragments[scalar @first_fragments - 1];	 
	
	for (my $i = 0; $i < scalar @first_fragments; ++$i) {
		if ($i != 0 or $i != (scalar @first_fragments - 1)) {
			$first_fragments[$i] = $rsite1.$first_fragments[$i].$rsite1;
		}
		
		# now split the fragment using $rsite2
		# repair the first and last fragments to include $rsite2
		# these are the only fragments to contain both restriction sites, so keep them in @final_fragments
		my @second_fragments = split($rsite2, $first_fragments[$i]);
		
		$second_fragments[0] .= $rsite2;
		$second_fragments[scalar @second_fragments - 1] = $rsite2.$second_fragments[scalar @second_fragments - 1];
		foreach my $fragment (@second_fragments) {
			push(@all_size_fragments, length($fragment));
		}
		my @third_fragments = grep -v ($rsite3), $second_fragments[$i]
	
		my @final_fragment1 = $seqname."_".$i."_1";	
		my @final_fragment2 = $seqname."_".$i."_2";	
		$final_fragments{$final_fragment1} = $third_fragments[0];
		$final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1];
	}
	
}

# keep a score of how many fragments fall within a particular size range

my $size_651_750 = 0;
my $size_551_650 = 0;
my $size_501_550 = 0;
my $size_401_500 = 0;
my $size_301_400 = 0;
my $size_small = 0;
my $size_large = 0;	
	
foreach my $fragment (keys %final_fragments) {		
	# add on $rsite1 to both sides of the fragment
	my $fragmentLength = length($final_fragments{$fragment});
	print OUT ">$fragment", "_", "1\n";
	print OUT substr($final_fragments{$fragment}, 0, 96), "\n";
	print OUT ">$fragment", "_", "2rc\n";
	print OUT revcom(substr($final_fragments{$fragment}, $fragmentLength - 96, 96)), "\n";
	$totalBP += $fragmentLength;
	
	if ($fragmentLength >= 300 and $fragmentLength <= 400) {
		++$size_301_400;
	} elsif ($fragmentLength > 400 and $fragmentLength <= 500) {
		++$size_401_500;
	} elsif ($fragmentLength > 500 and $fragmentLength <= 550) {
		++$size_501_550;
	} elsif ($fragmentLength > 550 and $fragmentLength <= 650) {
		++$size_551_650;
	} elsif ($fragmentLength > 650 and $fragmentLength <= 750) {
		++$size_651_750;
	} elsif ($fragmentLength < 300) {
		++$size_small;
	} elsif ($fragmentLength > 750) {
		++$size_large;
	}
}
	
$totalCount = scalar keys %final_fragments;			# count of all fragments in the genome
print RESULTS "The restriction sites used were:\n";
print RESULTS $ARGV[1], "\n";
print RESULTS $ARGV[2], "\n\n";
print RESULTS $ARGV[3], "\n\n\n";

print RESULTS "There were ", $totalCount, " fragments from the whole genome.\n\n";

print RESULTS "For MiSeq v3 there are ", $size_651_750, " fragments\n";
print RESULTS "For MiSeq v2 there are ", $size_551_650, " fragments\n";
print RESULTS "For HiSeq 2X150 there are ", $size_401_500, " fragments\n";
print RESULTS "For HiSeq 2X100 there are ", $size_301_400, " fragments\n\n";

print RESULTS "There were ", $size_small, " fragments smaller than 100 bp.\n";
print RESULTS "There were ", $size_large, " fragments larger than 800 bp.\n\n";

print RESULTS "There are ", $totalBP, " base pairs in the fragments.\n";

print RESULTS "\nJust some notes:\n";
print RESULTS "3RAD adapters are ~140bp, select below +/- 50bp\n"; 
print RESULTS "MiSeq v3 2X300 you will want 700bp fragments.\n"; 
print RESULTS "MiSeq v2 2X250 you will want 600bp fragments.\n"; 
print RESULTS "HiSeq Fast 2X150 you will want 450bp fragments.\n"; 
print RESULTS "HiSeq 2X100 you will want 350bp fragments.\n"; 

close OUT;
close RESULTS;

foreach my $length (@all_size_fragments) {
	print SIZE $length, "\t", $organism, "\n";
}
exit;

sub fasta_read_gzip_alt {
	
	# reads in a gzip fasta file and pases it into a hash
	# version 1.0
	
	(my $filename) = @_;		# be sure to include the path
	my %fasta;
	open(FASTA, "gunzip -c $filename |") || die "can't open pipe to $filename";
	my $fastaData;
	my $sequence = '';
	my $name = '';
	while() {
		$fastaData = $_;
		$fastaData =~ s/\n//gms;
		if ($fastaData =~ />/) {
			if ($sequence) {				# if there is a sequence, then the sequence belongs to the last name
				$fasta{$name} = $sequence;
			}
			
			# reinitialize everything
			$sequence = '';				# start over!
			$name = $fastaData;
			$name =~ s/>//gms;
		} elsif (eof FASTA) {
			$fasta{$name} = $sequence;
		} else {
			$sequence .= $fastaData;
		}
	}
	close FASTA;
	
	return %fasta;

}

sub revcom {
	(my $sequence) = @_;
	$sequence = reverse($sequence);
	$sequence =~ tr/AGCTRYMKSWHBVDNagctrymkswhbvdn/TCGAYRKMSWDVBHNtcgayrkmswdvbhn/;
	return $sequence;
}

sub fasta_read_alt {
	
	# reads in a fasta file and pases it into a hash
	# version 1.0
	
	(my $filename) = @_;		# be sure to include the path
	my %fasta;
	open(FASTA, $filename);
	my $fastaData;
	my $sequence = '';
	my $name = '';
	while() {
		$fastaData = $_;
		$fastaData =~ s/\n//gms;
		if ($fastaData =~ />/) {
			if ($sequence) {				# if there is a sequence, then the sequence belongs to the last name
				$fasta{$name} = $sequence;
			}
			
			# reinitialize everything
			$sequence = '';				# start over!
			$name = $fastaData;
			$name =~ s/>//gms;
		} elsif (eof FASTA) {
			$fasta{$name} = $sequence;
		} else {
			$sequence .= $fastaData;
		}
	}
	close FASTA;
	return %fasta;

}

##

##

my @third_fragments = grep !/$rsite3/, $second_fragments[$i];