D:\Temp\file.ext.bak
E:\Code\junk.stuff.ext
####
# Sample inputs for the URL splitter.
# The list deliberately mixes well-formed and malformed URLs.  Not every
# entry is supposed to match: some malformed ones SHOULD still match (their
# invalidity is dealt with at a later stage), while others should not.
my @url = (
    # no query string: bare host, file, directory, file.ext, bare .ext
    qw(
        http://www.foobar.com
        http://www.foobar.com/foo
        http://www.foobar.com/foo/
        http://www.foobar.com/foo.pl
        http://www.foobar.com/.extension
    ),
    # the same shapes followed by a (possibly empty) query string
    qw(
        http://www.foobar.com?test
        http://www.foobar.com/foo?test
        http://www.foobar.com/foo/?test
        http://www.foobar.com/foo.pl?
        http://www.foobar.com/.extension?
    ),
    # nested directories, including a filename that itself contains a dot
    qw(
        http://www.foobar.com/foo/bar/foobar.html
        http://www.foobar.com/foo/bar/foo.bar.html
    ),
    # script with parameters, once with a doubled question mark
    qw(
        http://perlmonks.com/index.pl?node_id=68135
        http://perlmonks.com/index.pl??node_id=68135
    ),
    # empty host part
    qw(
        http:///file.ext?
        http:///.ext?
        http:///file.ext
        http:///.ext
    ),
);
####
# Split an http:// URL into five pieces and return them as a list:
#   (site, path, filename, extension, query)
# Pieces that are absent from the URL come back as undef.  An empty list is
# returned when the URL does not fit the pattern at all (for example when
# the host part is empty).  Call in list context.
#
# Two details matter for correctness:
#   * the filename may be a single character, so everything after its first
#     character is optional;
#   * the extension may not contain a slash.  If it could, a URL whose
#     intended parse fails would still "match" by backtracking into the
#     host and treating ".com/whatever" as the extension.
sub _split_url {
    my ($url) = @_;

    my @parts = $url =~ m{
        ^http://            # scheme is fixed
        (                   # 1: site
            [^/?]+          #    one or more chars, none of them / or ?
        )
        (                   # 2: directory path (optional)
            /               #    leading slash
            (?:
                [^/?]+ /    #    then any number of "segment/" pieces
            )*
        )?
        (                   # 3: filename (optional)
            [^./?]          #    first char is not . or / or ?
            [^/?]*?         #    lazily take more, so the final
                            #    dot-suffix is left for group 4
        )?
        (                   # 4: extension (optional)
            \.              #    the dot is part of the capture
            [^./?]*         #    runs up to the next . or / or ?
        )?
        (                   # 5: query string (optional)
            \?              #    the ? is part of the capture
            .*              #    and so is everything after it
        )?
        $                   # nothing else may follow
    }x;

    return @parts;
}

# Try every sample URL and report either its pieces or the failure.
foreach my $url (@url) {
    my @parts = _split_url($url);
    if (@parts) {
        # Groups that did not take part in the match are undef; print them
        # as empty fields without tripping "uninitialized value" warnings.
        print "$url\t" . join(',', map { defined $_ ? $_ : '' } @parts) . "\n";
    } else {
        print "NOMATCH:$url\n";
    }
}
# :)