(.*) # what's between tags, even newline
Are you sure you want that greedy, and with no further restrictions? Maybe it's appropriate in your case, but in the general case it's not good when parsing XML ;-)
Shortly after the release of perl 5.10.0 I wrote an "XML" regex, mostly as an exercise for the cool new regex features. It parses only a small subset of XML, but maybe it's of use to you:
#!/usr/bin/env perl
#
# A recursive regex that recognises a small subset of XML, written as an
# exercise for the regex features introduced in perl 5.10 (named recursion
# via (?&name), possessive quantifiers, atomic groups). It handles nested
# tags, double-quoted attributes, plain text and character entities only.
# The Test::More assertions at the bottom document the expected behaviour.
use strict;
use warnings;
use re 'eval';    # the patterns interpolate variables AND contain (??{ })
use Test::More qw(no_plan);
use 5.010;
use Data::Dumper;

my $xml;
my $nested_tags;

# Character data: text that may appear between tags or inside an attribute
# value. The atomic group (?> ) keeps the engine from backtracking into a
# run of text once it has been consumed.
my $cdata = qr{
    (?>
        [^<>&"]+    # any amount of "normal" text
      | \&\w+;      # named chars
      | \&\#\d+;    # numbered codepoints
    )
}x;

# A sequence of elements and/or character data. $nested_tags is assigned
# further down; (??{ }) defers the lookup until match time.
$xml = qr{ (?> (??{$nested_tags}) | $cdata)+ }x;

#$xml = qr/ $single_xml+/;

# Tag or attribute name: word characters, optionally joined by ':' or '-'.
my $name = qr{ (?>\w+(?: [:-]\w+)*) }x;

# name="value" -- only double-quoted values are accepted.
my $attribute = qr{ (?>$name="$cdata*+") }x;

# One complete element: either self-closing, or an opening tag followed by
# arbitrary nested content and a closing tag carrying the same name.
# Group 1 is the named group itself and group 2 is the tag name, so
# (??{$2}) re-matches the name captured by the current opening tag.
#
# Debugging aids, deliberately kept outside the pattern because text inside
# qr{} is still subject to interpolation even when it looks like a comment.
# To use one, paste it right after "< ($name)":
#     (?{print "after <$^N: \n"})
#     (?{print "match: [$&] (\$2:$2) \n"})
$nested_tags = qr{
    (?<nested_tags>
        < ($name)
        (?>\s+$attribute)*\s*
        (?:
            /\s*>                               # either an empty tag end ...
          |
            >                                   # or end-of-tag and
            (?> (?&nested_tags) | $cdata)*+     # arbitrary XML
            </\s* (??{$2})\s*>                  # and a closing tag containing
                                                # the current name
        )
    )
}x;

like   "foo bar baz", qr/^$cdata$/, "cdata";
unlike "<bla>",       qr/^$cdata$/, "cdata";

like   'blerk="foo"', qr/^$attribute$/, "simple attribute";
unlike 'blerk=bar',   qr/^$attribute$/, "non-quoted attribute";

like   '<bla />',     qr/^$nested_tags$/, "single, empty XML tag";
unlike '<bla>',       qr/^$nested_tags$/, "single, non-empty XML tag";
like   '<bla></bla>', qr/^$nested_tags$/, "single, closed XML tag";

like   '<bla><blubb/></bla>',        qr/^$nested_tags$/, "nested tags 1";
like   '<bla>foo</bla>',             qr/^$nested_tags$/, "nested tags 2";
unlike '<bla><blubb></bla>',         qr/^$nested_tags$/, "nested tags 3";
like   '<bla><blubb></blubb></bla>', qr/^$nested_tags$/, "nested tags 4";

like   '<moep><blubb></blubb></moep><foo/><bar></bar>',
       qr/^$xml+$/, 'multiple nested tags';
unlike '<bla><blubb></foo></bla>', qr/^$xml+$/, "wrongly nested tags";
like   '<bla>foo</bla>',           qr/^$xml+$/, "nested tags with cdata";

like   '<bla foo="bar" />', qr/^$nested_tags$/, 'Tag with attribute';

# The entity must be spelled out in the source: a well-formed reference ends
# in ';', the malformed one below is missing it.
like   '<bla>foo &auml;blubb</bla>', qr/^$nested_tags$/,
       'Tags with named entities';
unlike '<bla>foo &aumlblubb</bla>',  qr/^$nested_tags$/,
       'Tags with malformed named entities';

#print Dumper \@names;
In reply to Re: Regex optimization: Can (?> ) and minimal match help here?
by moritz
in thread Regex optimization: Can (?> ) and minimal match help here?
by Anonymous Monk
| For: | Use:   |
|------|--------|
| &    | &amp;amp;  |
| <    | &amp;lt;   |
| >    | &amp;gt;   |
| [    | &amp;#91;  |
| ]    | &amp;#93;  |