#!/usr/bin/env -S perl -w
##!/usr/bin/env -S perl -wd

use v5.30.0;
use strict;
use List::AllUtils qw( reduce );

# Pull the whole DATA section (everything after __END__) into one string.
my ($slurpee) = do {
    local $/;    # no input record separator inside this block => slurp mode
    <DATA>;
};
my $length = length $slurpee;    # total character count, reported further down
my $sum;                         # grand total of the per-class counts, set later

# Character-class table.  Each entry is [ pattern, label, counter ]; the
# counter slot is appended here and starts at zero for every class.
my @regexes = map { [ @{$_}, 0 ] } (
    [ qr/[A-Z]/, "uppercase characters"  ],
    [ qr/[a-z]/, "lowercase characters"  ],
    [ qr/\d/,    "digits"                ],
    [ qr/\s/,    "whitespace characters" ],

    # Inside the bracketed class below, $ has to be written \$ and the
    # hyphen has to come first so that it is not read as a range operator.
    [ qr/[-~`!@#\$%^&*()_+={}\[\]|\\:;"'<>,.?\/]/, "punctuation characters" ],
);

# Debugging aid: uncomment to echo the slurped text one character at a time.
#for my $c (split //, $slurpee) { print $c; }

# Tally, for every entry in @regexes, how many characters of $slurpee match
# its pattern.  The tally deliberately exercises Perl's empty-pattern rule: a
# bare // match reuses the last regex that matched successfully, so each
# pattern is first "seeded" by matching it against a short string that
# contains one character of every class.
#
# Splitting the text is loop-invariant, so it is done once.  (As the pattern
# of split, // is the special "between every character" case, not the
# last-successful-pattern case.)
my @chars = split //, $slurpee;

for my $case (@regexes) {
    say "seeding // with: $case->[0]";

    # Judge the seed by the match's own result, not by the truth of $&: $&
    # is false when the matched text is "0", and after a failed seed the //
    # below would fall back to whichever pattern last succeeded (or match
    # everything if none has), so the count would be for the wrong class.
    unless ("Aa5: " =~ $case->[0]) {
        warn "seed string does not match $case->[0]; not counting $case->[1]\n";
        next;
    }
    say "matched: '$&'";

    for (@chars) {
        // and $case->[2]++;    # // stands for the pattern seeded above
    }
}

# Report one line per class: count, then label.
for my $case (@regexes) {
    my (undef, $label, $count) = @{$case};
    printf("%4d %s\n", $count, $label);
}

# If every character belongs to exactly one class, the grand total equals
# the text length; both are printed so they can be compared.  The leading 0
# keeps the total defined even when the table is empty.
$sum = reduce { $a + $b } 0, map { $_->[2] } @regexes;
printf(" sum and length: %3d and %3d\n", $sum, $length);

say "\nNow extract the string between HTML tags with //...";
my $str = "Before tag<i>between tags</i>after tag";
say "\n$str";

# Put a space at every boundary between a word character and a tag (in
# either order) so that tags and words can be split apart on single spaces.
$str =~ s{ (?: (?<= \w) (?= <) | (?<= >) (?= \w) ) }{ }xg;    # insert whitespace
say $str;

my @tokens = split / /, $str;
say "Tokens...\n";
say for @tokens;

# Collect the tokens lying between an opening and a closing tag.
#
# The flip-flop (..) turns true on the token that matches the opening-tag
# regex and stays true up to and including the token that matches the
# closing-tag regex.  Inside that range a bare // re-applies the last
# pattern that matched successfully; on the two tokens that open and close
# the range that is the tag regex which has just matched, so the tags
# themselves are left out.  Ordinary words do not match and are kept.
#
# Tag names may be longer than one letter (\w+), in line with the regex used
# for the single-match parse later in this script.
my @inside;
for (@tokens) {
    if (/<\w+>/ .. /<\/\w+>/) {
        push @inside, $_ unless //;
    }
}

# join yields '' when nothing was collected, so $between is always defined
# and no trailing separator has to be chopped off.
my $between = join ' ', @inside;
say "'$between'";

$str = "\n'Before tag<i>between tags</i>after tag'";
say $str;
say "Parse it again with...";

# One regex, three capture groups: opening tag, enclosed text, closing tag.
# NOTE: (.*) is greedy, so on a line holding several tag pairs the middle
# group would run from the first opening tag to the last closing tag.
my $regex = qr/ (<\w+>) (.*) (<\/\w+>) /x;
say $regex;

# Take the groups from the match's list-context return value and print them
# only when the match succeeded; after a failed match $1..$3 would be
# undefined or left over from an earlier match.
if (my ($open_tag, $text, $close_tag) = $str =~ $regex) {
    say "\$1: '$open_tag'";
    say "\$2: '$text'";
    say "\$3: '$close_tag'";
}
else {
    warn "no tagged text found in the string\n";
}

exit(0);
__END__
Last night I dreamt I went to Manderley again. This will come as a surprise to
Daphne since she did not write these lines.  Here is a line containing stuff
   ,?- ! : that should/must be deleted/// ; : ! before using it as a one-time-pad.
A one-time-pad should contain only characters, no  punctuation, no parentheticals like (this is bogus) or [(this is bogus, too)], or {also this}; no contractions, such as
I'll or it's or digits such as 0, 123, -75 or 8 P.M., and no numbers, such as $1,234.69.  If
you want to use numbers in your message, spell them out; one-hundred dollars and sixty-nine cents, or theeepm.  These non-alpha characters in the one-time-pad will be discarded, but they must be entered eactly as represented in the book used as the pad.  Let the encoding program decide what to use and what to skip.

Some of the text is from "Rebecca", an out-of copyright but not out-of-print fictional
work that can be freely downloaded as an eBook from Project Gutenberg. I use it as the
raw source for one-time pads in a cryptologic research study; i.e., extract potential
pad bits from somewhere in the text, randomly chosen with seek from EOF. Munge the
characters, encrypt the message and delete the characters used for the pad. Since both
encoder and decoder use the same seek expression, both pads are guaranteed to be
identical, and since the characters used to create the pad are deleted, never to be seen
again, the pad is guaranteed to be used exactly once. Does not scale for large
organizations but works flawlessly for a small group of conspirators.