use strict; use warnings; open my $inFH, q{<}, \ < >2L type=chromosome_arm; loc=2L:1..23011544; ID=2L; dbxref=REFSEQ:NT_033779,GB:AE014134; MD5=bfdfb99d39fa5174dae1e2ecd8a231cd; length=23011544; release=r5.54; species=Dmel; CGACAATGCACGACAGAGGAAGCAGAACAGATATTTAGATTGCCTCTCAT TTTCTCTCCCATATTATAGGGAGAAATATGATCGCGTATGCGAGAGTAGT GCCAACATATTGTGCTCTTTGATTTTTTGGCAACCCAAAATGGTGGCGGA TGAACGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAATAAATTCA TTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGAGAGTAGTG CCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTATATTACCG CAAACCCAAAAAGACAATACACGACAGAGAGAGAGAGCAGCGGAGATATT TAGATTGCCTATTAAATATGATCGCGTATGCGAGAGTAGTGCCAACATAT TGTGCTCTCTATATAATGACTGCCTCTCATTCTGTCTTATTTTACCGCAA ACCCAAATCGACAATGCACGACAGAGGAAGCAGAACAGATATTTAGATTG CCTCTCATTTTCTCTCCCATATTATAGGGAGAAATATGATCGCGTATGCG AGAGTAGTGCCAACATATTGTGCTCTTTGATTTTTTGGCAACCCAAAATG GTGGCGGATGAACGAGATGATAATATATTCAAGTTGCCGCTAATCAGAAA TAAATTCATTGCAACGTTAAATACAGCACAATATATGATCGCGTATGCGA GAGTAGTGCCAACATATTGTGCTAATGAGTGCCTCTCGTTCTCTGTCTTA TATTACCGCAAACCCAAAAAGACAATACACGACAGAGAGAGAGAGCAGCG GAGATATTTAGATTGCCTATTAAATATGATCGCGTATGCGAGAGTAGTGC CAACATATTGTGCTCTCTATATAATGACTGCCTCTCATTCTGTCTTATTT TACCGCAAACCCAAATCGACAATGCACGACAGAGGAAGCAGAACAGATAT EOF do { my $discard = <$inFH> }; my $seq = join q{}, map { chomp; $_ } <$inFH>; close $inFH or die $!; my $ct = 0; my @kmersGG = do { my %seen; grep { not $seen{ $_ } ++ } grep { q{GG} eq substr $_, 21 } map { substr $seq, $_, 23 } 0 .. length( $seq ) - 23; }; printf qq{>crispr_%d\n%s\n}, ++ $ct, $_ for @kmersGG > 1000 ? @kmersGG[ 0 .. 999 ] : @kmersGG; #### >crispr_1 CATTTTCTCTCCCATATTATAGG >crispr_2 ATTTTCTCTCCCATATTATAGGG >crispr_3 TATTGTGCTCTTTGATTTTTTGG >crispr_4 GATTTTTTGGCAACCCAAAATGG >crispr_5 TTTTTGGCAACCCAAAATGGTGG >crispr_6 TTGGCAACCCAAAATGGTGGCGG >crispr_7 ACGACAGAGAGAGAGAGCAGCGG >crispr_8 AAATCGACAATGCACGACAGAGG