Ok. Here's a better version. While I haven't
benchmarked it, my feeling are that it's a hog, but I
bullet proofed several areas. It's less than a hog
than my earlier post. I'm posting the new version
for an easier compare.
#!/usr/bin/perl
# string, min_len of pattern, min_num of patterns
use strict;
use warnings;
my $string = "bookhelloworldhellohellohihellohiworldhihelloworldhihe
+llobookpenbookpenworld";
get_pattern($string, 2, 2);
exit;
sub get_pattern {
my ($string, $min_len, $min_num) = @_;
my $str_len = length($string);
my $srch_max = int($str_len/2);
my %patterns;
# First we find all patterns that are up to 1/2 the length of the stri
+ng
print "length : $str_len\n";
my %tmp_hash;
foreach my $len ($min_len..$srch_max) {
my $eol = $str_len - $len;
foreach my $ind1 (0..$eol) {
my $pat = substr($string, $ind1, $len);
unless ( defined($tmp_hash{$pat}) ) {
$tmp_hash{$pat} = 0;
$tmp_hash{$pat}++ while ($string =~ /\Q$pat\E/g);
$patterns{$pat} = $tmp_hash{$pat} if ($tmp_hash{$pat} >= $min_
+num);
}
}
}
undef %tmp_hash;
print "Patterns: ", scalar (keys %patterns), "\n";
# We then go through the patterns by order and remove those
# that are invalidated by better patterns
# Longer strings that occur more often are considered better
my $mod_str = $string;
foreach my $key (sort { $patterns{$b} * (length($b)-1) <=>
$patterns{$a} * (length($a)-1)
or length($b) <=> length($a) }
keys %patterns) {
my $tstr = $mod_str;
# We null out any area with pattern and count
$patterns{$key} = ($tstr =~ s/\Q$key\E/\000/g);
if ($patterns{$key} >= $min_num) {
# If it hits threshold we keep
$mod_str = $tstr;
}
else {
# If not we toss pattern
delete $patterns{$key};
}
}
print "Valid : ", scalar (keys %patterns), "\n";
# We finally print results
foreach my $key
(sort { $patterns{$b} * (length($b)-1) <=>
$patterns{$a} * (length($a)-1)
or length($b) <=> length($a)
or $a cmp $b } keys %patterns) {
(my $pat = $key) =~ s/\n/\\n/g;
printf "%3d: (%s)\n", $patterns{$key}, $pat;
}
}