use strict; use warnings; my ($fastaFile, $rsite1, $rsite2, $rsite3, $organism) = @ARGV; my %genome = fastaReadData(); my $radtagfile = join '_', $organism, "Radseq_fragments_doubleDigest", $rsite1, $rsite2, "${rsite3}fasta"; my $sizesfile = join '_', $organism, "Radseq_fragments_doubleDigest", $rsite1, $rsite2, $rsite3, "sizes.txt"; open my $fOut, '>', $radtagfile or die "Can't create $radtagfile: $!\n"; open my $fSize, '>', $sizesfile or die "Can't create $sizesfile: $!\n"; print $fSize "length\torganism\n"; # prepare a summary file my ($Second, $Minute, $Hour, $Day, $Month, $Year) = localtime(time); $Month++; $Year += 1900; my $date = join '_', $Month, $Day, $Year; my $summaryfile = join '_', $organism, "summary_doubleDigest", $rsite1, $rsite2, $rsite3, "${date}txt"; open my $fRes, '>', $summaryfile or die "Can't create $summaryfile: $!\n"; my %final_fragments; my @all_size_fragments; # start creating the fragments foreach my $seqname (keys %genome) { print "Working on sequence $seqname.\n"; # split the fragments up based on the enzyme motif my @first_fragments = split $rsite1, $genome{$seqname}; # add the restriction site motif back onto the fragments $first_fragments[0] .= $rsite1; $first_fragments[-1] = "$rsite1$first_fragments[-1]"; for my $i (0 .. $#first_fragments) { if ($i or $i != $#first_fragments) { $first_fragments[$i] = "$rsite1$first_fragments[$i]$rsite1"; } # now split the fragment using $rsite2 # repair the first and last fragments to include $rsite2 # these are the only fragments to contain both restriction sites, so # keep them in @final_fragments my @second_fragments = split $rsite2, $first_fragments[$i]; $second_fragments[0] .= $rsite2; $second_fragments[-1] = "$rsite2$second_fragments[-1]"; push @all_size_fragments, length $_ for @second_fragments; my @third_fragments = grep !/$rsite3/, @second_fragments; $third_fragments[0] .= $rsite3; $third_fragments[-1] = "$rsite3$third_fragments[-1]"; push @all_size_fragments, length $_ for @third_fragments; $final_fragments{join '_', $seqname, $i, 1} = $third_fragments[0]; $final_fragments{join '_', $seqname, $i, 2} = $third_fragments[-1]; } } # keep a score of how many fragments fall within a particular size range my %counts; my @bins = (('smaller than 100') x 2, 'between 100 and 150'); push @bins, map {'between ' . ($_ * 50 + 1) . ' and ' . ($_ + 1) * 50} 3 .. 12; push @bins, 'larger than 600'; $counts{$_} = 0 for @bins; my $totalBP = 0; # the total number of base pairs in the selected fragments foreach my $fragment (keys %final_fragments) { # add on $rsite1 to both sides of the fragment my $fragmentLength = length($final_fragments{$fragment}); print $fOut ">${fragment}_1\n"; print $fOut substr($final_fragments{$fragment}, 0, 96), "\n"; print $fOut ">${fragment}_2rc\n"; print $fOut revcom( substr($final_fragments{$fragment}, $fragmentLength - 96, 96) ), "\n"; $totalBP += $fragmentLength; my $binnedSize = $fragmentLength / 50; $binnedSize = 2 if $binnedSize < 2 && $fragmentLength >= 100; if ($binnedSize >= $#bins) { ++$counts{$bins[-1]}; } else { ++$counts{$bins[$binnedSize]}; } } my $totalCount = keys %final_fragments; print $fRes "The restriction sites used were:\n"; print $fRes "$rsite1\n"; print $fRes "$rsite2\n\n"; print $fRes "There were ", $totalCount, " fragments from the whole genome.\n"; print $fRes "There were $counts{$_} fragments $_ bp.\n" for @bins[1 .. $#bins]; print $fRes "There are ", $totalBP, " base pairs in the fragments.\n"; print $fRes "\nJust some notes of reference:\n"; print $fRes "HiSeq 2000 gives 375,000,000 reads.\n"; print $fRes "HiSeq 2500 gives 742,000,000 reads.\n"; close $fOut; close $fRes; print $fSize "$_\t$organism\n" for @all_size_fragments; sub fastaReadData { return parseFile(*DATA); } sub fasta_read_gzip_alt { my ($filename) = @_; # be sure to include the path my %fasta; open my $fIn, "gunzip -c $filename |" or die "can't open pipe to $filename: $!"; return parseFile($fIn); } sub fasta_read_alt { my ($filename) = @_; # be sure to include the path open my $fIn, '<', $filename or die "can't open $filename: $!\n"; return parseFile($fIn); } sub parseFile { my ($fIn) = @_; my $fastaData; my $sequence = ''; my $name = ''; my %fasta; while (<$fIn>) { $fastaData = $_; $fastaData =~ s/\n//gms; if ($fastaData =~ />/) { if ($sequence) { # if there is a sequence, then the sequence belongs to the last # name $fasta{$name} = $sequence; } # reinitialize everything $sequence = ''; # start over! $name = $fastaData; $name =~ s/>//gms; } elsif (eof $fIn) { $fasta{$name} = $sequence; } else { $sequence .= $fastaData; } } return %fasta; } sub revcom { (my $sequence) = @_; $sequence = reverse($sequence); $sequence =~ tr/AGCTRYMKSWHBVDNagctrymkswhbvdn/TCGAYRKMSWDVBHNtcgayrkmswdvbhn/; return $sequence; } __DATA__ >24.6jsd1.Tut TTGGAGAGTTTGATCCTGGCTCAGGATGAACGCTGGCGGCGTGCCTAATA CATGCAAGTCGAGCGAATGGATTAAGAGCTTGCTCTTATGAAGTTAGCGG CGGACGGGTGAGTAACACGTGGGTAACCTGCCCATAAGACTGGGATAACT CCGGGAAACCGGGGCTAATACCGGATAACATTTTGAACCGCATGGTTCGA AATTGAAAGGCGGCTTCGGTCGTCACTTATGGATGGACCCGCGTCGCATT AGCTAGTTGGTGAGGTAACGGCTCACCAAGGCAACGATGCGTAGCCGACC TGAGAGGGTGATCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTAC GGGAGGCAGCAGTAGGGAATCTTCCGCAATGGACGAAAGTCTGACGGAGC AACGCCGCGTGAGTGATGAAGGCTTTCGGGTCGTAAAACTCTGTTGTTAG GGAAGAACAAGTGCTAGTTGAATAAGCTGGCACCTTGACGGTACCTAACC AGAAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGG CAAGCGTTATCCGGAATTATTGGGCGTAAAGAACGCGCAGGTGGTTTCTT AAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACT GGGAGACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGA AATGCGTAGAGATATGGAGGAACACCAGTGGCCCAGGCGACTTTCTGGTC TGTAACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATA CCCTGGTAGTCCACGCCGTAAACGATGAGTGCTAAGTGTTAGAGGGTTTC CGCCCTTTAGTGCTGAAGTTAAAGCATTAAGCACTCCGCGTGTGGAGTAC GGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGT GGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTG ACATCCTCTGACAACCCTAGAGATAGGGCTTCTCCTTCGGGAGCAGAGTG ACAGGTGGTGCATGGTTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAA GTCCCGCAACGAGCGCAACCCTTGATCTTAGTTGCCATCATTWAGTTGGG CACTCTAAGGTGACTGCCGGTGACAAACCGGAGGAAGGTGGGGATGACGT CAAATCATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGAC GGTACAAAGAGCTGCAAGACCGCGAGGTGGAGCTAATCTCATAAAACCGT TCTCAGTTCGGATTGTAGGCTGCAACTCGCCTACATGAAGCTGGAATCGC TAGTAATCGCGGATCAGCATGCCGCGGTGAATACGTTCCCGGGCCTTGTA CACACCGCCCGTCACACCACGAGAGTTTGTAACACCCGAAGTCGGTGGGG TAACCTTTTTGGAGCCAGCCGCCTAAGGTGGGACAGATGATTGGGGTGAA GTCGTAACAAGGTAGCCGTATCGGAAGGTGCGGCTGGATCACCTCCTTTC T #### The restriction sites used were: TTGG CTAG There were 18 fragments from the whole genome. There were 14 fragments smaller than 100 bp. There were 3 fragments between 100 and 150 bp. There were 0 fragments between 151 and 200 bp. There were 1 fragments between 201 and 250 bp. There were 0 fragments between 251 and 300 bp. There were 0 fragments between 301 and 350 bp. There were 0 fragments between 351 and 400 bp. There were 0 fragments between 401 and 450 bp. There were 0 fragments between 451 and 500 bp. There were 0 fragments between 501 and 550 bp. There were 0 fragments between 551 and 600 bp. There were 0 fragments between 601 and 650 bp. There were 0 fragments larger than 600 bp. There are 1121 base pairs in the fragments. Just some notes of reference: HiSeq 2000 gives 375,000,000 reads. HiSeq 2500 gives 742,000,000 reads.