my @third_fragments = grep !/$rsite3/, $second_fragments[$i]; #### #! /usr/bin/perl -w # a script to get fragments of a genome based on restriction enzyme # retrieves the fragments greater than XX bp and less than XX bp # whole genome in one gzip file (fasta format)! # calculates the number of fragments you will get per genome for RADseq # perl genomeFragmentor_doubleDigest.pl genome.gz restriction_site1 restriction_site2 organism # perl genomeFragmentor_doubleDigest.pl Anolisgenome.gz CCTGCAGG GGATCC Anolis use strict; my %genome = fasta_read_gzip_alt($ARGV[0]); #my %genome = fasta_read_alt($ARGV[0]); my $rsite1 = $ARGV[1]; my $rsite2 = $ARGV[2]; my $rsite3 = $ARGV[3]; my $totalCount = 0; # count of all fragments in the genome my $totalBP = 0; # the total number of base pairs in the selected fragments my $organism = $ARGV[4]; my $radtagfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3.".fasta"; my $sizesfile = $organism."_Radseq_fragments_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_sizes.txt"; open (OUT, ">$radtagfile"); open (SIZE, ">$sizesfile"); print SIZE "length organism\n"; # prepare a summary file my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time); $Month++; $Year += 1900; my $date = $Month."_".$Day."_".$Year; my $summaryfile = $organism."_summary_doubleDigest_".$rsite1."_".$rsite2."_".$rsite3."_".$date.".txt"; open (RESULTS, ">$summaryfile"); my %final_fragments; my @all_size_fragments; # start creating the fragments foreach my $seqname (keys %genome) { print "Working on sequence $seqname.\n"; my @first_fragments = split("$rsite1", $genome{$seqname}); # split the fragments up based on the enzyme motif # add the restriction site motif back onto the fragments $first_fragments[0] .= $rsite1; $first_fragments[scalar @first_fragments - 1] = $rsite1.$first_fragments[scalar @first_fragments - 1]; for (my $i = 0; $i < scalar @first_fragments; ++$i) { if ($i != 0 or $i != (scalar @first_fragments - 1)) { $first_fragments[$i] = $rsite1.$first_fragments[$i].$rsite1; } # now split the fragment using $rsite2 # repair the first and last fragments to include $rsite2 # these are the only fragments to contain both restriction sites, so keep them in @final_fragments my @second_fragments = split($rsite2, $first_fragments[$i]); $second_fragments[0] .= $rsite2; $second_fragments[scalar @second_fragments - 1] = $rsite2.$second_fragments[scalar @second_fragments - 1]; foreach my $fragment (@second_fragments) { push(@all_size_fragments, length($fragment)); } my @third_fragments = grep !/$rsite3/, $second_fragments[$i]; $third_fragments[0] .= $rsite3; $third_fragments[scalar @third_fragments - 1] = $rsite3.$third_fragments[scalar @third_fragments - 1]; foreach my $fragment (@third_fragments) { push(@all_size_fragments, length($fragment)); } my $final_fragment1 = $seqname."_".$i."_1"; my $final_fragment2 = $seqname."_".$i."_2"; $final_fragments{$final_fragment1} = $third_fragments[0]; $final_fragments{$final_fragment2} = $third_fragments[scalar @third_fragments - 1]; } } # keep a score of how many fragments fall within a particular size range my $size_100_150 = 0; my $size_151_200 = 0; my $size_201_250 = 0; my $size_251_300 = 0; my $size_301_350 = 0; my $size_351_400 = 0; my $size_401_450 = 0; my $size_451_500 = 0; my $size_501_550 = 0; my $size_551_600 = 0; my $size_small = 0; my $size_large = 0; foreach my $fragment (keys %final_fragments) { # add on $rsite1 to both sides of the fragment my $fragmentLength = length($final_fragments{$fragment}); print OUT ">$fragment", "_", "1\n"; print OUT substr($final_fragments{$fragment}, 0, 96), "\n"; print OUT ">$fragment", "_", "2rc\n"; print OUT revcom(substr($final_fragments{$fragment}, $fragmentLength - 96, 96)), "\n"; $totalBP += $fragmentLength; if ($fragmentLength >= 100 and $fragmentLength <= 150) { ++$size_100_150; } elsif ($fragmentLength > 150 and $fragmentLength <= 200) { ++$size_151_200; } elsif ($fragmentLength > 200 and $fragmentLength <= 250) { ++$size_201_250; } elsif ($fragmentLength > 250 and $fragmentLength <= 300) { ++$size_251_300; } elsif ($fragmentLength > 300 and $fragmentLength <= 350) { ++$size_301_350; } elsif ($fragmentLength > 350 and $fragmentLength <= 400) { ++$size_351_400; } elsif ($fragmentLength > 400 and $fragmentLength <= 450) { ++$size_401_450; } elsif ($fragmentLength > 450 and $fragmentLength <= 500) { ++$size_451_500; } elsif ($fragmentLength > 500 and $fragmentLength <= 550) { ++$size_501_550; } elsif ($fragmentLength > 550 and $fragmentLength <= 600) { ++$size_551_600; } elsif ($fragmentLength < 100) { ++$size_small; } elsif ($fragmentLength > 600) { ++$size_large; } } $totalCount = scalar keys %final_fragments; # count of all fragments in the genome print RESULTS "The restriction sites used were:\n"; print RESULTS $ARGV[1], "\n"; print RESULTS $ARGV[2], "\n\n"; print RESULTS "There were ", $totalCount, " fragments from the whole genome.\n"; print RESULTS "There were ", $size_100_150, " fragments between 100 and 150 bp.\n"; print RESULTS "There were ", $size_151_200, " fragments between 151 and 200 bp.\n"; print RESULTS "There were ", $size_201_250, " fragments between 201 and 250 bp.\n"; print RESULTS "There were ", $size_251_300, " fragments between 251 and 300 bp.\n"; print RESULTS "There were ", $size_301_350, " fragments between 301 and 350 bp.\n"; print RESULTS "There were ", $size_351_400, " fragments between 351 and 400 bp.\n"; print RESULTS "There were ", $size_401_450, " fragments between 401 and 450 bp.\n"; print RESULTS "There were ", $size_451_500, " fragments between 451 and 500 bp.\n"; print RESULTS "There were ", $size_501_550, " fragments between 501 and 550 bp.\n"; print RESULTS "There were ", $size_551_600, " fragments between 551 and 600 bp.\n"; print RESULTS "There were ", $size_small, " fragments smaller than 100 bp.\n"; print RESULTS "There were ", $size_large, " fragments larger than 600 bp.\n\n"; print RESULTS "There are ", $totalBP, " base pairs in the fragments.\n"; print RESULTS "\nJust some notes of reference:\n"; print RESULTS "HiSeq 2000 gives 375,000,000 reads.\n"; print RESULTS "HiSeq 2500 gives 742,000,000 reads.\n"; close OUT; close RESULTS; foreach my $length (@all_size_fragments) { print SIZE $length, "\t", $organism, "\n"; } exit; sub fasta_read_gzip_alt { # reads in a gzip fasta file and pases it into a hash # version 1.0 (my $filename) = @_; # be sure to include the path my %fasta; open(FASTA, "gunzip -c $filename |") || die "can't open pipe to $filename"; my $fastaData; my $sequence = ''; my $name = ''; while() { $fastaData = $_; $fastaData =~ s/\n//gms; if ($fastaData =~ />/) { if ($sequence) { # if there is a sequence, then the sequence belongs to the last name $fasta{$name} = $sequence; } # reinitialize everything $sequence = ''; # start over! $name = $fastaData; $name =~ s/>//gms; } elsif (eof FASTA) { $fasta{$name} = $sequence; } else { $sequence .= $fastaData; } } close FASTA; return %fasta; } sub revcom { (my $sequence) = @_; $sequence = reverse($sequence); $sequence =~ tr/AGCTRYMKSWHBVDNagctrymkswhbvdn/TCGAYRKMSWDVBHNtcgayrkmswdvbhn/; return $sequence; } sub fasta_read_alt { # reads in a fasta file and pases it into a hash # version 1.0 (my $filename) = @_; # be sure to include the path my %fasta; open(FASTA, $filename); my $fastaData; my $sequence = ''; my $name = ''; while() { $fastaData = $_; $fastaData =~ s/\n//gms; if ($fastaData =~ />/) { if ($sequence) { # if there is a sequence, then the sequence belongs to the last name $fasta{$name} = $sequence; } # reinitialize everything $sequence = ''; # start over! $name = $fastaData; $name =~ s/>//gms; } elsif (eof FASTA) { $fasta{$name} = $sequence; } else { $sequence .= $fastaData; } } close FASTA; return %fasta; }