Using up my quota of guesses for the day...
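This reports every run of three or more capital letters that occurs at least twice without overlapping, so it will also get all valid substrings of longer valid strings (e.g. AGC and GCT as well as AGCT).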
#!/usr/bin/perl
use strict; # https://perlmonks.org/?node_id=11137405
use warnings;
use Data::Dump 'dd';

# Build one throw-away 50-character test sequence from the four base letters.
my @bases  = qw(A C G T);
my $random = join '', map { $bases[rand @bases] } 1 .. 50;

# Fixed sample sequences, followed by the random one.
my @sequences = (
    'AGCAGC',
    'AATGCAATCGCAGCAGCA',
    'AGCTACCCAGCTAGGGAGCTA',
    'AAA_x_AAA_x_BBB_x_AAA_x_AAA_x_BBB',
    $random,
);

for my $seq (@sequences) {

    # Discovery pass: capture a run of 3+ capital letters, require the same
    # text again somewhere after it (so the two copies cannot overlap),
    # record the capture in the embedded code block, then force a failure so
    # the engine backtracks into every other start position and length.
    # Only the keys of %found are used afterwards.
    my %found;
    $seq =~ /([A-Z]{3,}) .* \1 (?{ $found{$1}++ }) (*FAIL)/x;

    # Counting pass: a /g match in list context returns every left-to-right,
    # non-overlapping occurrence, so the list length is the occurrence count.
    my %counts;
    for ( keys %found ) {
        my @hits = $seq =~ /$_/g;
        $counts{$_} = scalar @hits;
    }

    dd "for sequence $seq", \%counts;
}
Outputs (the last sequence is randomly generated, so its result differs on every run):
("for sequence AGCAGC", { AGC => 2 })
(
"for sequence AATGCAATCGCAGCAGCA",
{ AAT => 2, AGC => 2, CAG => 2, GCA => 4 },
)
(
"for sequence AGCTACCCAGCTAGGGAGCTA",
{ AGC => 3, AGCT => 3, AGCTA => 3, CTA => 3, GCT => 3, GCTA => 3 },
)
(
"for sequence AAA_x_AAA_x_BBB_x_AAA_x_AAA_x_BBB",
{ AAA => 4, BBB => 2 },
)
(
"for sequence ATGGACTGCCTGGAAGAATCATCCATCCTGGGGCCCGGATCTTTGTACCC",
{
ATC => 4,
ATCC => 2,
CAT => 2,
CATC => 2,
CCC => 2,
CCT => 2,
CCTG => 2,
CCTGG => 2,
CTG => 3,
CTGG => 2,
GAA => 2,
GCC => 2,
GGA => 3,
TCC => 2,
TGG => 3,
TGGA => 2,
},
)