my @third_fragments = grep -v ($rsite3), $second_fragments[$i]
my @final_fragment1 = $seqname."_".$i."_1";
my @final_fragment2 = $seqname."_".$i."_2";
$final_fragments{$final_fragment1} = $third_fragments[0];
$final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1];
####
syntax error at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 68, near "my "
Global symbol "@final_fragment1" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 68.
Global symbol "$final_fragment1" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 70.
Global symbol "$final_fragment2" requires explicit package name at /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl line 71.
Execution of /Users/smithcabinets26/Desktop/RAD/Digester/Improving/MyTripleDigester-3.pl aborted due to compilation errors.
####
#! /usr/bin/perl -w
# a script to get fragments of a genome based on restriction enzyme
# retrieves the fragments greater than XX bp and less than XX bp
# whole genome in one gzip file (fasta format)!
# calculates the number of fragments you will get per genome for RADseq
# perl genomeFragmentor_doubleDigest.pl genome.gz restriction_site1 restriction_site2 organism
# perl genomeFragmentor_doubleDigest.pl Anolisgenome.gz CCTGCAGG GGATCC Anolis
use strict;
my %genome = fasta_read_gzip_alt($ARGV[0]);
#my %genome = fasta_read_alt($ARGV[0]);
my $rsite1 = $ARGV[1];
my $rsite2 = $ARGV[2];
my $rsite3 = $ARGV[3];
my $totalCount = 0; # count of all fragments in the genome
my $totalBP = 0; # the total number of base pairs in the selected fragments
my $organism = $ARGV[4];
my $radtagfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3.".fasta";
my $sizesfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_sizes.txt";
open (OUT, ">$radtagfile");
open (SIZE, ">$sizesfile");
print SIZE "length organism\n";
# prepare a summary file
my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time);
$Month++;
$Year += 1900;
my $date = $Month."_".$Day."_".$Year;
my $summaryfile = $organism."_summary_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_".$date.".txt";
open (RESULTS, ">$summaryfile");
my %final_fragments;
my @all_size_fragments;
# start creating the fragments
foreach my $seqname (keys %genome) {
print "Working on sequence $seqname.\n";
my @first_fragments = split("$rsite1", $genome{$seqname}); # split the fragments up based on the enzyme motif
# add the restriction site motif back onto the fragments
$first_fragments[0] .= $rsite1;
$first_fragments[scalar @first_fragments - 1] = $rsite1.$first_fragments[scalar @first_fragments - 1];
for (my $i = 0; $i < scalar @first_fragments; ++$i) {
if ($i != 0 or $i != (scalar @first_fragments - 1)) {
$first_fragments[$i] = $rsite1.$first_fragments[$i].$rsite1;
}
# now split the fragment using $rsite2
# repair the first and last fragments to include $rsite2
# these are the only fragments to contain both restriction sites, so keep them in @final_fragments
my @second_fragments = split($rsite2, $first_fragments[$i]);
$second_fragments[0] .= $rsite2;
$second_fragments[scalar @second_fragments - 1] = $rsite2.$second_fragments[scalar @second_fragments - 1];
foreach my $fragment (@second_fragments) {
push(@all_size_fragments, length($fragment));
}
my @third_fragments = grep -v ($rsite3), $second_fragments[$i]
my @final_fragment1 = $seqname."_".$i."_1";
my @final_fragment2 = $seqname."_".$i."_2";
$final_fragments{$final_fragment1} = $third_fragments[0];
$final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1];
}
}
# keep a score of how many fragments fall within a particular size range
my $size_651_750 = 0;
my $size_551_650 = 0;
my $size_501_550 = 0;
my $size_401_500 = 0;
my $size_301_400 = 0;
my $size_small = 0;
my $size_large = 0;
foreach my $fragment (keys %final_fragments) {
# add on $rsite1 to both sides of the fragment
my $fragmentLength = length($final_fragments{$fragment});
print OUT ">$fragment", "_", "1\n";
print OUT substr($final_fragments{$fragment}, 0, 96), "\n";
print OUT ">$fragment", "_", "2rc\n";
print OUT revcom(substr($final_fragments{$fragment}, $fragmentLength - 96, 96)), "\n";
$totalBP += $fragmentLength;
if ($fragmentLength >= 300 and $fragmentLength <= 400) {
++$size_301_400;
} elsif ($fragmentLength > 400 and $fragmentLength <= 500) {
++$size_401_500;
} elsif ($fragmentLength > 500 and $fragmentLength <= 550) {
++$size_501_550;
} elsif ($fragmentLength > 550 and $fragmentLength <= 650) {
++$size_551_650;
} elsif ($fragmentLength > 650 and $fragmentLength <= 750) {
++$size_651_750;
} elsif ($fragmentLength < 300) {
++$size_small;
} elsif ($fragmentLength > 750) {
++$size_large;
}
}
$totalCount = scalar keys %final_fragments; # count of all fragments in the genome
print RESULTS "The restriction sites used were:\n";
print RESULTS $ARGV[1], "\n";
print RESULTS $ARGV[2], "\n\n";
print RESULTS $ARGV[3], "\n\n\n";
print RESULTS "There were ", $totalCount, " fragments from the whole genome.\n\n";
print RESULTS "For MiSeq v3 there are ", $size_651_750, " fragments\n";
print RESULTS "For MiSeq v2 there are ", $size_551_650, " fragments\n";
print RESULTS "For HiSeq 2X150 there are ", $size_401_500, " fragments\n";
print RESULTS "For HiSeq 2X100 there are ", $size_301_400, " fragments\n\n";
print RESULTS "There were ", $size_small, " fragments smaller than 100 bp.\n";
print RESULTS "There were ", $size_large, " fragments larger than 800 bp.\n\n";
print RESULTS "There are ", $totalBP, " base pairs in the fragments.\n";
print RESULTS "\nJust some notes:\n";
print RESULTS "3RAD adapters are ~140bp, select below +/- 50bp\n";
print RESULTS "MiSeq v3 2X300 you will want 700bp fragments.\n";
print RESULTS "MiSeq v2 2X250 you will want 600bp fragments.\n";
print RESULTS "HiSeq Fast 2X150 you will want 450bp fragments.\n";
print RESULTS "HiSeq 2X100 you will want 350bp fragments.\n";
close OUT;
close RESULTS;
foreach my $length (@all_size_fragments) {
print SIZE $length, "\t", $organism, "\n";
}
exit;
sub fasta_read_gzip_alt {
# reads in a gzip fasta file and pases it into a hash
# version 1.0
(my $filename) = @_; # be sure to include the path
my %fasta;
open(FASTA, "gunzip -c $filename |") || die "can't open pipe to $filename";
my $fastaData;
my $sequence = '';
my $name = '';
while() {
$fastaData = $_;
$fastaData =~ s/\n//gms;
if ($fastaData =~ />/) {
if ($sequence) { # if there is a sequence, then the sequence belongs to the last name
$fasta{$name} = $sequence;
}
# reinitialize everything
$sequence = ''; # start over!
$name = $fastaData;
$name =~ s/>//gms;
} elsif (eof FASTA) {
$fasta{$name} = $sequence;
} else {
$sequence .= $fastaData;
}
}
close FASTA;
return %fasta;
}
sub revcom {
(my $sequence) = @_;
$sequence = reverse($sequence);
$sequence =~ tr/AGCTRYMKSWHBVDNagctrymkswhbvdn/TCGAYRKMSWDVBHNtcgayrkmswdvbhn/;
return $sequence;
}
sub fasta_read_alt {
# reads in a fasta file and pases it into a hash
# version 1.0
(my $filename) = @_; # be sure to include the path
my %fasta;
open(FASTA, $filename);
my $fastaData;
my $sequence = '';
my $name = '';
while() {
$fastaData = $_;
$fastaData =~ s/\n//gms;
if ($fastaData =~ />/) {
if ($sequence) { # if there is a sequence, then the sequence belongs to the last name
$fasta{$name} = $sequence;
}
# reinitialize everything
$sequence = ''; # start over!
$name = $fastaData;
$name =~ s/>//gms;
} elsif (eof FASTA) {
$fasta{$name} = $sequence;
} else {
$sequence .= $fastaData;
}
}
close FASTA;
return %fasta;
}
####
my @third_fragments = grep !/$rsite3/, $second_fragments[$i];