package Regexp::NamedCaptures;
use warnings;
use strict;
use 5.007_01;
use Text::Balanced qw( extract_bracketed extract_quotelike );
use Carp qw( croak carp );
use Params::Validate qw( validate_pos SCALAR UNDEF CODEREF );
use subs
    qw( convert _convert_foo_expr _convert_chevron_expr _convert_quote_expr );
use vars qw( $VERSION );

=head1 NAME

Regexp::NamedCaptures - Saves capture results to your own variables

=head1 VERSION

Version 0.01

=cut

$VERSION = '0.01';

=head1 SYNOPSIS

  use Regexp::NamedCaptures;
  my ( $name, $title, $first, $last );
  /(?<\$name>(?<\$title>Mr\.|Ms\.) (?<\$first>\w+) (?<\$last>\w+))/;

  # is the same as
  my ( $name, $title, $first, $last ) = /((Mr\.|Ms\.) (\w+) (\w+))/;

  # use re 'eval' when interpolating
  use Regexp::NamedCaptures;
  use re 'eval';
  /(?<\$something>$pattern)/

=head1 DESCRIPTION

This module implements named captures. When your regular expression
captures something, you can have it automatically copied out to the
right location. This is an improvement over normal perl because now
you don't have to deal with positional captures. When your expression
is complex and there are multiple or nested captures it really helps
to not have to track what number you're supposed to find your data in.

=head1 NAMED CAPTURE SYNTAX

I have borrowed the syntax from .Net. I'm told that each of the
following forms are equivalent so I've treated them identically.

  (?< name >pattern)
  (?' name 'pattern)

C<name> should be a piece of valid perl code. In a normal,
interpolating regular expression, you would write
C<< (?<\$something>...) >> if you wanted to have the result copied to
the C<$something> variable. That is, perl will interpolate your
variables just like it always does. The value of name may be arbitrary
perl code. It must be a valid lvalue.

C<pattern> is a normal pattern.

The entire expression is rewritten as:

  (?:(pattern)(?{ name = $^N })|(?{ name = undef }))

That is, a plain capture followed by a code block that copies the
capture into C<name>, with an alternative that assigns C<undef> to
C<name> when the pattern does not match at that point.

=head1 FUNCTIONS

=head2 $rewritten_regexp = convert( $original_regexp )

This function does all the work of converting a regular expression
containing named capture expressions into an expression that can be
used by perl. You only need this if you're going to be creating
regular expressions at runtime.

  use re 'eval';
  $re = Regexp::NamedCaptures::convert '(?<$var>...)';
  $re = qr/$re/;

  use re 'eval';
  $re = Regexp::NamedCaptures::convert "(?'\$var'...)";

An undefined C<$original_regexp> converts to the empty string. The
function croaks if a named capture group has unbalanced parentheses or
if its name cannot be extracted.

=cut

sub convert {
    my ( $in ) = validate_pos( @_, { type => SCALAR | UNDEF } );

    # validate_pos() lets undef through; there is nothing to rewrite.
    return '' if not defined $in;

    my $out = '';

    while ( length $in ) {

        # Seek $in forward until a (?< or (?' is found. Exclude matches
        # for (?<= and (?<! because those are already taken by
        # look-behind assertions, and (?<> because an empty name
        # doesn't make sense.
        if ( $in !~ m/ \( (?: (?=\?< [^>=!] ) | (?=\?' ))/x ) {

            # Nothing was found - copy the rest of $in to $out and
            # empty $in.
            $out .= $in;
            $in = '';
        }
        else {

            # Move any leading text directly to the output. $-[0] is
            # the offset of the opening parenthesis just matched.
            $out .= substr $in, 0, $-[0], '';

            # Pull the whole parenthesised group off the front of $in.
            # On failure Text::Balanced hands back an undefined
            # extraction, so report that instead of carrying on.
            my ( $expr, $rest ) = extract_bracketed( $in, '()' );
            if ( not defined $expr ) {
                croak "Unbalanced parentheses in named capture at: $in";
            }
            $in = $rest;

            my $lead = substr $expr, 0, 3;
            $out .=
                  $lead eq '(?<'  ? _convert_chevron_expr( $expr )
                : $lead eq q{(?'} ? _convert_quote_expr( $expr )
                :   croak "Invalid escape sequence in $expr";
        }
    }

    return $out;
}

=head1 C<use re 'eval'> AND SECURITY

This module functions by inserting (?{ code }) blocks into your
expression. As a security feature, perl does not allow new (?{ ... })
blocks to be compiled once BEGIN-time has passed unless the programmer
specifically lifts that restriction by including the C<use re 'eval'>
pragma. If you trust all of the expressions that you're interpolating,
you can use this safely. If you are accepting regular expressions from
sources you might not trust, you should not use C<use re 'eval'>.

If you still want to use this module, see if you can push your regular
expression compilation earlier. Consider these two examples:

  use re 'eval';
  $rx = qr/(?<\$name>$expr)/;

  BEGIN { $rx = qr/(?<\$name>$expr)/; }

The first one requires the C<use re 'eval'> pragma because the
interpolation and compilation occurs at runtime. The second does not
because it interpolated and compiled the pattern at BEGIN-time. It
suffers the obvious drawback that you must have the value for $expr at
BEGIN-time instead of runtime.

=head1 AUTHOR

Joshua ben Jore

=head1 BUGS

\Q escapes are completely ignored. If you try to use one to prevent
something that looks like a named capture from being parsed as one, it
won't work.

Please report any bugs or feature requests through the CPAN request
tracker (rt.cpan.org) queue for Regexp-NamedCaptures. I will be
notified, and then you'll automatically be notified of progress on
your bug as I make changes.

=head1 ACKNOWLEDGEMENTS

=head1 COPYRIGHT & LICENSE

Copyright 2005 Joshua ben Jore, all rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

# Rewrite a single (?'NAME'PATTERN) group. The name is split off with
# extract_quotelike(); the rest of the work is shared with the chevron
# form in _convert_foo_expr().
sub _convert_quote_expr {
    my ( $re ) = validate_pos( @_,
        { type => SCALAR, regex => qr<\A\(\?\'.+\'.*\)\z>s } );

    return _convert_foo_expr( sub { extract_quotelike( $_[0] ) }, $re );
}

# Rewrite a single (?<NAME>PATTERN) group. The name is split off with
# extract_bracketed() using <> as the bracket pair.
sub _convert_chevron_expr {
    my ( $re ) = validate_pos( @_,
        { type => SCALAR, regex => qr<\A\(\?\<.+\>.*\)\z>s } );

    return _convert_foo_expr( sub { extract_bracketed( $_[0], '<>' ) },
        $re );
}

# Shared implementation for both named capture spellings.
#
#   $extract - code ref called in list context with the text
#              _NAME_PATTERN; must return the delimited name followed by
#              the remaining pattern text.
#   $in      - the complete group, (?_NAME_PATTERN).
#
# Returns the rewritten group as a string. Croaks when $extract cannot
# find the delimited name.
sub _convert_foo_expr {
    my ( $extract, $in ) = validate_pos(
        @_,
        { type => CODEREF },
        { type => SCALAR, regex => qr<\A\(\?..+..*\)\z>s }
    );

    # Zap the (? and ) parts of (?_..._...) away.
    substr $in, 0,  2, '';
    substr $in, -1, 1, '';

    # Split the _NAME_ part from the EXPR part of _NAME_EXPR
    my ( $name, $expr ) = $extract->( $in );
    if ( not defined $name or not length $name ) {
        croak "Unable to extract the capture name from (?$in)";
    }

    # Possibly transform the contents of $expr if it contained some
    # (?<...>...) expressions.
    $expr = convert( $expr );

    # Zap the delimiters on _NAME_
    substr $name, 0,  1, '';
    substr $name, -1, 1, '';

    # Unescape stuff in $name
    $name =~ s/\\(.)/$1/gs;

    # Rewrite the expression so it's a plain capture followed by a
    # code block.
    #
    # NOTE(review): the second alternative matches the empty string, so
    # the rewritten group can succeed without matching PATTERN at all.
    # That differs from a plain (PATTERN) capture - confirm this is
    # intended before relying on it.
    return "(?:($expr)(?{$name=\$^N})|(?{$name=undef}))";
}

#####################################################################
#####################################################################
# Overload magic follows

use overload(
    '.'  => \&_concat,
    '""' => \&_finalize
);

# Called by "use Regexp::NamedCaptures". Installs a qr constant handler
# so the fragments of regular expression literals are passed to
# _postpone().
sub import {
    overload::constant qr => \&_postpone;
}

# Wrap one regular expression fragment in an object of this class so
# conversion can be put off until the pattern is stringified. An
# undefined fragment is reported with carp and treated as ''.
sub _postpone {
    my ( $re ) = @_;

    if ( not defined $re ) {
        carp "Use of uninitialized value in concatenation (.) or string";
        $re = '';
    }

    return bless \$re, __PACKAGE__;
}

# Handler for the overloaded "." operator. It happens any time
# something is interpolated and re-postpones the joined text as a new
# object. Receives the usual overload arguments: the object, the other
# operand, and a flag that is true when the operands were swapped.
sub _concat {
    my ( $left, $right, $inverted ) = @_;
    ( $left, $right ) = ( $right, $left ) if $inverted;

    for ( $left, $right ) {

        # Unwrap our own objects to get at the raw pattern text.
        $_ = $$_ if ref eq __PACKAGE__;

        if ( not defined ) {
            carp
                "Use of uninitialized value in concatenation (.) or string";
            $_ = '';
        }
    }

    my $re = "$left$right";
    return bless \$re, __PACKAGE__;
}

# Handler for the overloaded stringification operator. It happens when
# the regex is due to be compiled and simply hands the accumulated
# pattern text to the user-accessible function convert().
sub _finalize {
    return convert( ${ $_[0] } );
}

1;    # End of Regexp::NamedCaptures