CUFP
thundergnat
<p>My daughter has been showing some interest in learning Perl recently and I've been encouraging it. One thing she has been wrestling with especially is regexes. Remembering the syntax, keeping straight what is in a particular capture group, it's all been a struggle.</p>
<p>I wrote a little perl/Tk utility that lets her test out regexes and shows her what the matches will be in real time with feedback for syntax errors. It has pop-up help balloons and extensive drop down menus of various regex assertions classes and syntactic elements. It uses perl to parse the regex, so it will automatically adjust its error reporting depending on what features are present in the perl it is running on. It ended up working out pretty well, so I polished it up a bit and submitted it here in case anyone else might find it useful.</p>
<p>It has some limitations; it doesn't handle more than 9 capture groups in any one regex, (if you need more than that, write your own damn program :-) ), doesn't handle named Unicode character assertions, ( \N{whatever} ), can't really handle the x modifier flag effectively, (the regex entry is limited to one line), and doesn't treat the input as a stream. (It is just one big string with embedded newlines.) Within those limitations it works pretty well though.</p>
<p>Tested on Windows and Linux. Given some sane defaults but they are fairly easy to override should you so desire.</p>
<p>Give it a try. Comments and suggestions welcome.</p>
<p><b>Update:</b> fixed some typos.</p>
<p>Included suggestions from [zentara] and [Anonymous Monk]. And did a little more tweaking and polishing.</p>
<p><b>Update 2:</b> I have an updated and more featureful version of this hosted on Github now. See [https://github.com/thundergnat/VRegExp|Visual Regex Explorer]</p>
<readmore>
<c>
#/usr/bin/perl
use warnings;
use strict;
use charnames ':full';
use 5.10.0;
use Tk;
use Tk::TextUndo;
use Tk::ROText;
use Tk::Entry;
use Tk::Pane;
use Tk::Balloon;
use Tk::FontDialog;
use Tk::Dialog;
use YAML qw(DumpFile LoadFile);
my $settings_file = $0 . '.settings';
sub DEBUG () { return 0 }
my $OS;
given ($^O) {
when (/Win32/) { $OS = 'Win32' };
default { $OS = 'Linux' };
};
my %settings = (
regex => '\b((\w)\w*\2)\b',
text => 'Put some text in this window to match against.',
saved => [ '\b((\w)\w*\2)\b', '\w+' ],
font => { # 0: entry, text boxes, 1: menus
Win32 => [ '{Courier New} 10', '{Courier New} 8' ],
Linux => [ '{Monospace} 10', '{Monospace} 8' ], # Not Win32
},
geometry => '650x480',
);
if ( -e $settings_file ) {
%settings = LoadFile($settings_file);
}
my %bg = ( # error label background
err => 'orange',
ok => 'default',
active => 'yellow',
inactive => 'gray',
highlight => 'yellow',
);
my %flag = ( # regex flags
case => '',
multiple => '',
single => '',
global => 1
);
my $update;
my $error_text;
my $matches = 'Matches: ';
my $cap_disp = '';
my @show = ( undef, 1 ); # Array of capture display flags
# Show first capture by default
my %w; # Hash to hold tk widgets;
$w{mw} = MainWindow->new;
$w{fd} = $w{mw}->FontDialog(
-nicefont => 0,
-title => 'Select Font',
-applycmd => \&apply_font,
-familylabel => 'Font Family',
-fixedfontsbutton => 1,
-nicefontsbutton => 1,
-initfont => $settings{font}{$OS}[0],
);
$w{reg_img} = $w{mw}->Photo(
-format => 'gif',
-data =>
'R0lGODlhBwAEAIAAAAAAAP///yH5BAEAAAEALAAAAAAHAAQAAAIIhA+BGWoNWSgAOw=='
);
$w{help} = $w{mw}->Balloon( -initwait => 1000 );
$w{paned_window} = $w{mw}->Panedwindow( -orient => 'vertical' )->pack(
-side => 'top',
-expand => 'yes',
-fill => 'both',
-pady => 2,
-padx => '2m',
);
$w{regex_frame} = $w{paned_window}->Frame;
$w{result_frame} = $w{paned_window}->Frame;
$w{top_frame} = $w{regex_frame}->Frame->pack(
-anchor => 'nw',
-fill => 'x',
-expand => 1
);
$w{top_frame}->Label( -text => 'Regex string: ' )->grid(
-row => 1,
-column => 0
);
$w{top_frame}->Label( -text => 'Modifiers:' )->grid(
-row => 1,
-column => 1
);
$w{ck_bt_case} = $w{top_frame}->Checkbutton(
-text => 'i',
-onvalue => 'i',
-offvalue => '',
-variable => \$flag{case}
)->grid(
-row => 1,
-column => 2
);
$w{help}->attach( $w{ck_bt_case}, -balloonmsg => 'Case insensitive' );
$w{ck_bt_single} = $w{top_frame}->Checkbutton(
-text => 's',
-onvalue => 's',
-offvalue => '',
-variable => \$flag{multiple}
)->grid(
-row => 1,
-column => 3
);
$w{help}
->attach( $w{ck_bt_single}, -balloonmsg => 'Single string ( . matches \n )' );
$w{ck_bt_multiple} = $w{top_frame}->Checkbutton(
-text => 'm',
-onvalue => 'm',
-offvalue => '',
-variable => \$flag{single}
)->grid(
-row => 1,
-column => 4
);
$w{help}->attach( $w{ck_bt_multiple},
-balloonmsg => 'Multiple lines ( ^ and $ apply to each line )' );
$w{ck_bt_global} = $w{top_frame}->Checkbutton(
-text => 'g',
-onvalue => 1,
-offvalue => 0,
-variable => \$flag{global}
)->grid(
-row => 1,
-column => 5
);
$w{help}->attach( $w{ck_bt_global}, -balloonmsg => 'Global search' );
$w{entry_frame} = $w{regex_frame}->Frame( -height => 1 )->pack(
-anchor => 'nw',
-fill => 'x',
-expand => 1
);
$w{mlabel} =
$w{entry_frame}->Label( -text => 'm/', -font => $settings{font}{$OS}[0], )
->pack( -side => 'left', -anchor => 'n' );
$w{reg_recall} = $w{entry_frame}->Button(
-activebackground => $bg{active},
-command => sub { reg_saved( $w{reg_entry}, $settings{saved} ); },
-image => $w{reg_img},
-height => 10,
)->pack( -side => 'left', -anchor => 'n', );
$w{reg_entry} = $w{entry_frame}->Entry(
-font => $settings{font}{$OS}[0],
-background => 'white',
-text => $settings{regex},
)->pack(
-side => 'left',
-anchor => 'n',
-fill => 'x',
-expand => 1
);
$w{labelm} =
$w{entry_frame}->Label( -text => '/', -font => $settings{font}{$OS}[0], )
->pack( -side => 'right', -anchor => 'n' );
$w{reg_error} = $w{regex_frame}->Label( -textvariable => \$error_text, )->pack(
-anchor => 'nw',
-fill => 'x',
-expand => 1
);
$bg{ok} = $w{reg_error}->cget( -background );
$w{regex_frame}->Label( -text => 'Text to match against.' )->pack();
$w{reg_text} = $w{regex_frame}->Scrolled(
'TextUndo',
-exportselection => 'true',
-scrollbars => 'e',
-background => 'white',
-font => $settings{font}{$OS}[0],
)->pack(
-anchor => 'nw',
-fill => 'both',
-expand => 1
);
$w{reg_text}->tagConfigure( 'highlight', -background => $bg{highlight} );
$w{reg_text}->tagLower('highlight');
$w{output_frame} = $w{result_frame}->Frame()->pack(
-anchor => 'nw',
-fill => 'x',
-expand => 1
);
$w{output_frame}->Label( -textvariable => \$matches )->pack( -side => 'left' );
$w{output_frame}->Label( -textvariable => \$cap_disp )->pack( -side => 'left' );
for ( 1 .. 9 ) {
$w{"cap$_"} = $w{output_frame}->Checkbutton(
-text => "\$$_",
-onvalue => 1,
-offvalue => 0,
-variable => \$show[$_],
);
}
$w{rst_text} = $w{result_frame}->Scrolled(
'ROText',
-scrollbars => 'e',
-background => 'white',
-font => $settings{font}{$OS}[0],
)->pack(
-side => 'top',
-fill => 'both',
-expand => 1,
-anchor => 'n',
);
$w{paned_window}->add( $w{regex_frame}, $w{result_frame} );
$w{mw}->geometry( $settings{geometry} );
match_height();
$w{paned_window}->sashPlace( 0, 2, 240 );
$w{mw}->repeat( 500, \&update );
$w{reg_text}->Contents( $settings{text} );
$w{menu} = $w{mw}->Menu( -type => 'menubar' );
$w{mw}->configure( -menu => $w{menu} );
buildmenu();
$w{sure} = $w{mw}->Dialog(
-text => 'Are you sure?',
-bitmap => 'warning',
-title => 'Really?',
-default_button => 'Cancel',
-buttons => [qw/Cancel Yes/]
);
$w{mw}->update;
$w{mw}->bind(
'<Configure>' => sub { # Detect geometry changes
$settings{geometry} = $w{mw}->geometry;
save_settings();
}
);
MainLoop;
sub update { # Check term and matches periodically.
return if $update;
$update = 1;
# Some errors are runtime, not compile time, so trap STDERR
open( STDERR, '>', ( $OS eq 'Win32' ) ? 'NULL' : '/dev/null' ) unless DEBUG;
my $term = $w{reg_entry}->get;
my $flags = $flag{case} . $flag{multiple} . $flag{single};
if ( my $whoopsie = invalid($term) ) { # Check the regex.
whine($whoopsie); # Uh oh, There's a compile time regex error.
$w{reg_text}->tagRemove( 'highlight', '1.0', 'end' );
$update = 0;
return; # Notify, remove any highlighting and bail.
}
$error_text = 'Ok'; # Yay. No errors.
$w{reg_error}->configure( -background => $bg{ok} );
my $text = $w{reg_text}->Contents;
my ( @results, $i, $cap_count );
my @caps = $text =~ /(?$flags)$term/; # Get a count of captures.
my $l = defined $1;
show_caps( my $caps = scalar @caps, $l );
my @match_index;
if ( $caps > 1 ) { # More than 1 capture in regex.
my @allcaps;
if ( $flag{global} ) {
@allcaps = $text =~ /(?$flags)$term/g; # global regex.
while ( $text =~ /(?$flags)$term/g ) {
push @match_index,
[ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
}
else {
@allcaps = $text =~ /(?$flags)$term/; # single
push @match_index,
[ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
$cap_count = 'Matches: ' . scalar @allcaps / $caps;
for ( 0 .. $#allcaps ) {
my $index = $_ % $caps;
$i++ unless $index;
next unless $show[ $index + 1 ]; # Only save desired captures
push @results,
( $i . '($' . ( 1 + $index ) . "):\t" . $allcaps[$_] );
}
}
elsif ( $flag{global} ) { # global regex.
given (1) {
when ( $show[1] and $l ) { # has captures
@results =
map { ++$i . "(\$1):\t" . $_ } $text =~ /(?$flags)$term/g;
while ( $text =~ /(?$flags)$term/g ) {
push @match_index,
[ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
};
when ( !$show[1] and $l ) { # no show
@results = map { '' } $text =~ /(?$flags)$term/g;
while ( $text =~ /(?$flags)$term/gs ) {
push @match_index,
[ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
};
default { # no captures
@results =
map { ++$i . ":\t" . $_ } $text =~ /(?$flags)$term/g;
while ( $text =~ /(?$flags)$term/g ) {
push @match_index,
[ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
};
}
}
else {
@results = $text =~ /(?$flags)$term/; # single term no captures
push @match_index, [ $-[0], ( $+[0] - $-[0] ) ]; # get match indicies
}
my $results = join "\n", @results;
$matches = $cap_count ? $cap_count : 'Matches: ' . scalar @results;
if ( $text eq "\n" or $term eq '' ) {
$matches = 'Matches: 0';
$w{rst_text}->Contents("");
$update = 0;
return; # Don't update if no term or text.
} # Don't update if nothing has changed.
if ($w{rst_text}->Contents eq $results . "\n"){
$update = 0;
return;
}
$w{rst_text}->Contents($results);
$w{reg_text}->tagRemove( 'highlight', '1.0', 'end' );
# remove highlighting from text.
$w{mw}->Busy;
my ( $lineidx, $matchacc ) = ( 1, 0 );
for my $match (@match_index)
{ # highlight the match indicies previously captured.
while (1) {
my $linelen =
length( $w{reg_text}->get( "$lineidx.0", "$lineidx.end" ) ) + 1;
last if ( ( $matchacc + $linelen ) > $match->[0] );
$matchacc += $linelen;
$lineidx++;
}
my $offset = $match->[0] - $matchacc;
$w{reg_text}->tagAdd( 'highlight', "$lineidx.$offset",
"$lineidx.$offset +" . ( $match->[1] ) . 'c' );
}
$w{mw}->Unbusy;
$update = 0;
}
sub invalid { # Check to see if a regex is valid.
my $term = shift; # Don't bother trying to parse it,
eval { '' =~ m/$term/; }; # let perl do it for us.
return $@;
}
sub whine {
my $error = shift;
$error =~ s/ at .+?$//; # Massage error text a bit.
$error =~ s/[\cM\cJ]//g;
$error =~ s/marked by <-- HERE in //;
$error_text = $error; # And display it.
$w{reg_error}->configure( -background => $bg{err} );
$w{rst_text}->Contents('');
$matches = 'Matches: 0';
$w{reg_text}->tagRemove( 'highlight', '1.0', 'end' );
}
sub Tk::Error { # Trap runtime errors.
my ( $w, $error, @msgs ) = @_;
whine($error)
if $error =~ /Unicode property/; # report unicode property errors
say $error if DEBUG;
return;
}
sub show_caps { # show or hide capture checkboxes
my ( $show, $cap1 ) = @_;
if ($cap1) {
$cap_disp = ' -- Display captures: ';
for ( 1 .. $show ) { $w{"cap$_"}->pack( -side => 'left' ); }
for ( $show + 1 .. 9 ) {
$w{"cap$_"}->packForget;
$show[$_] = 0;
}
}
else {
$cap_disp = '';
$w{"cap$_"}->packForget for 1 .. 9;
}
}
sub apply_font {
my $font = shift;
if ( defined $font ) {
$settings{font}{$OS}[0] = $w{mw}->GetDescriptiveFontName($font);
save_settings();
for ( $w{reg_entry}, $w{reg_text}, $w{rst_text}, $w{mlabel},
$w{labelm} )
{
$_->RefontTree( -font => $font );
}
}
match_height();
}
sub match_height {
$w{mw}->update;
$w{reg_recall}->configure( -height => $w{reg_entry}->height - 6 );
}
sub buildmenu { # build menus
$w{menu}->Cascade(
-label => 'Metachars & Assertions',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '\\', ' Quote the next metacharacter' ],
[ '^', ' Match the beginning of a line' ],
[ '.', ' Match any character (except newline)' ],
[ '$', ' Match the end of a line' ],
[ '|', ' Alternation' ],
[ '( )', ' Grouping' ],
[ '[ ]', ' Character class' ],
['sep'],
[ '\b', ' Match a word boundary' ],
[ '\B', ' Match except at a word boundary' ],
[ '\A', ' Match only at beginning of string' ],
[ '\Z', ' Match only at end, or before newline at the end' ],
[ '\z', ' Match only at end of string' ],
[ '\G', ' Match only at pos()' ]
)
]
);
$w{menu}->Cascade(
-label => 'Quantifiers',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '*', ' Match 0 or more times' ],
[ '+', ' Match 1 or more times' ],
[ '?', ' Match 1 or 0 times' ],
[ '{n}', ' Match exactly n times' ],
[ '{n,}', ' Match at least n times' ],
[ '{n,m}', ' Match at least n but not more than m times' ],
[ '*?', ' Match 0 or more times, not greedily' ],
[ '+?', ' Match 1 or more times, not greedily' ],
[ '??', ' Match 0 or 1 time, not greedily' ],
[ '{n}?', ' Match exactly n times, not greedily' ],
[ '{n,}?', ' Match at least n times, not greedily' ],
[ '{n,m}?', ' Match between n and m times, not greedily' ],
[ '*+', ' Match 0 or more times and give nothing back' ],
[ '++', ' Match 1 or more times and give nothing back' ],
[ '?+', ' Match 0 or 1 time and give nothing back' ],
[ '{n}+', ' Match exactly n times and give nothing back' ],
[ '{n,}+', ' Match at least n times and give nothing back' ],
[ '{n,m}+', ' Match from n to m times and give nothing back' ]
)
]
);
$w{menu}->Cascade(
-label => 'Grouping',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '(?#text)', ' A comment' ],
[ '(?pimsx-imsx)', ' Enable / Disable modifier flags' ],
[ '(?:pattern)', ' Non-capturing cluster' ],
[ '(?|pattern)', ' Branch reset' ],
[ '(?=pattern)', ' Zero-width positive look-ahead' ],
[ '(?!pattern)', ' Zero-width negative look-ahead' ],
[ '(?<=pattern)', ' Zero-width positive look-behind' ],
[ '(?<!pattern)', ' Zero-width negative look-behind' ],
[ '(?\'NAME\'pattern)', ' A named capture buffer' ],
[ '(?<NAME>pattern)', ' A named capture buffer' ],
[ '\k\'NAME\'', ' Named backreference' ],
[ '\k<NAME>', ' Named backreference' ]
)
]
);
$w{menu}->Cascade(
-label => 'Escapes',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '\t', ' Tab' ],
[ '\n', ' Newline' ],
[ '\r', ' Return' ],
[ '\f', ' Form feed' ],
[ '\a', ' Alarm (bell)' ],
[ '\e', ' Escape (think troff)' ],
[ '\l', ' Lowercase next char (think vi)' ],
[ '\u', ' Uppercase next char (think vi)' ],
[ '\L', ' Lowercase till \E (think vi)' ],
[ '\U', ' Uppercase till \E (think vi)' ],
[ '\E', ' End case modification (think vi)' ],
[ '\Q', ' Quote metacharacters till \E' ],
)
]
);
$w{menu}->Cascade(
-label => 'Classes',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '\w', ' Match a word character (alphanumeric or _)' ],
[ '\W', ' Match a non-"word" character' ],
[ '\s', ' Match a whitespace character' ],
[ '\S', ' Match a non-whitespace character' ],
[ '\d', ' Match a digit character' ],
[ '\D', ' Match a non-digit character' ],
[ '\pP', ' Match P, named property (short form).' ],
[ '\p{Prop}', ' Match named property.' ],
[ '\PP', ' Match non-P' ],
[ '\P{Prop}', ' Match not named property.' ],
[ '\X', ' Match eXtended Unicode sequence' ],
[ '\C', ' Match a single C char, even under Unicode.' ],
[ '\1', ' Reference to a capture group' ],
[ '\g1', ' Reference to a specific group,' ],
[
'\g{-1}',
' Negative means a previous buffer, use brackets for safer parsing.'
],
[ '\g{name}', ' Named backreference' ],
[ '\k<name>', ' Named backreference' ],
[
'\K',
' Keep the stuff left of \K, don\'t include in $&'
],
[ '\v', ' Vertical whitespace' ],
[ '\V', ' Not vertical whitespace' ],
[ '\h', ' Horizontal whitespace' ],
[ '\H', ' Not horizontal whitespace' ],
[ '\R', ' Linebreak' ],
[ '\0**', ' Octal char' ],
[ '\x**', ' Hex char' ],
[ '\x{****}', ' Long hex char' ],
[ '\c*', ' Control char' ],
[ '\N{name}', ' Named Unicode character' ]
)
]
);
$w{menu}->Cascade(
-label => 'POSIX',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '[[:alpha:]]', ' Unicode alphabetic character' ],
[ '[[:alnum:]]', ' Unicode alphanumeric character' ],
[ '[[:ascii:]]', ' ASCII character' ],
[ '[[:blank:]]', ' \s + vertical tab \cK' ],
[ '[[:cntrl:]]', ' Control character' ],
[ '[[:digit:]]', ' Unicode digit' ],
[ '[[:graph:]]', ' Any Alphanumeric or punctuation' ],
[ '[[:lower:]]', ' Any lowercase character' ],
[ '[[:print:]]', ' Any printable character' ],
[ '[[:punct:]]', ' Any punctuation (special) character.' ],
[ '[[:space:]]', ' Any space character ([[:blank:]])' ],
[ '[[:upper:]]', ' Any uppercase character' ],
[ '[[:word:]]', ' Alphabetic character or underscore' ],
[ '[[:xdigit:]]', ' A hex digit' ]
)
]
);
$w{menu}->Cascade(
-label => 'Named Properties',
-tearoff => 1,
-menuitems => [
map { item($_) } (
[ '', "Too many to list. See perldoc perlunicode." ],
[ '\p{Alpha}', ' Unicode alphabetic character' ],
[ '\p{Alnum}', ' Unicode alphanumeric character' ],
[ '\p{Punct}', ' Punctuation' ],
[ '\p{ASCII}', ' \x00 through \x7f' ],
[ '\p{HexDigit}', ' Any hex digit' ],
[ '\p{L}', ' Letter' ],
[ '\p{Lu}', ' Upper case letter' ],
[ '\p{Ll}', ' Lower case letter' ],
[ '\p{P}', ' Punctuation' ],
[ '\p{S}', ' Symbol' ],
[ '\p{Sm}', ' Math symbol' ],
[ '\p{Latin}', ' Is a Latin character' ],
[ '\p{Greek}', ' Is a Greek character' ],
[ '\p{InBasicLatin}', ' In the Basic Latin code block' ]
)
]
);
$w{menu}->Cascade(
-label => 'Options',
-tearoff => 0,
-menuitems => [
[
Button => 'Choose Font',
-command => sub {
my $font = $w{fd}->Show;
apply_font($font);
}
]
]
);
}
sub item { # build a menu item
my $itemref = shift;
my ($item) = @$itemref;
return undef if $item eq 'sep';
return [
Button => "@$itemref",
-font => $settings{font}{$OS}[1],
-command => [ sub { $w{reg_entry}->insert( 'insert', $_[0] ) }, $item ]
];
}
sub reg_saved {
my ( $entry, $array_ref ) = @_;
my $menu = $entry->Menu( -title => 'Stored Regexes', -tearoff => 0 );
$menu->command(
-label => 'Store Regex',
-command =>
sub { add_reg_saved( $w{reg_entry}->get, $settings{saved} ) },
);
$menu->command(
-label => 'Delete All Stored Regexes',
-command => sub {
my $ans = $w{sure}->Show;
return unless $ans eq 'Yes';
@$array_ref = ();
save_settings();
},
);
$menu->cascade(
-label => 'Remove a Stored Regex',
-tearoff => 0,
-menuitems => [
map ( [
Button => $_,
-font => $settings{font}{$OS}[1],
-command =>
[ sub { del_reg_saved( $_[0], $array_ref ) }, $_ ],
],
@$array_ref )
]
);
$menu->separator;
for my $item (@$array_ref) {
$menu->command(
-label => $item,
-font => $settings{font}{$OS}[1],
-command => [
sub {
load_saved_term( $entry, $_[0] );
add_reg_saved( $_[0], $array_ref );
},
$item
],
);
}
$menu->post( $entry->pointerx, $entry->pointery + 10 );
}
sub add_reg_saved {
my ( $term, $array_ref ) = @_;
@$array_ref = grep { $_ ne $term } @$array_ref;
unshift @$array_ref, $term;
save_settings();
}
sub del_reg_saved {
my ( $term, $array_ref ) = @_;
@$array_ref = grep { $_ ne $term } @$array_ref;
save_settings();
}
sub load_saved_term {
my ( $entry, $term ) = @_;
$entry->delete( '0', 'end' );
$entry->insert( 'end', $term );
}
sub save_settings {
DumpFile( $settings_file, %settings );
}
</c>
</readmore>