use strict;
use warnings;

# Stray text from the original paste (two Windows file paths; they are not
# Perl and are not used by the code below). Kept as a comment so the script
# parses:
#   D:\Temp\file.ext.bak E:\Code\junk.stuff.ext

####
# Sample inputs.
# Not all of these are valid URLs, nor do I want all of them to match.
# Some invalid ones I DO want to match, and handle their invalidity later.
# The four "http:///..." entries have an empty site and are reported as
# NOMATCH, because the site group requires at least one character.
my @url = qw(
    http://www.foobar.com
    http://www.foobar.com/foo
    http://www.foobar.com/foo/
    http://www.foobar.com/foo.pl
    http://www.foobar.com/.extension
    http://www.foobar.com?test
    http://www.foobar.com/foo?test
    http://www.foobar.com/foo/?test
    http://www.foobar.com/foo.pl?
    http://www.foobar.com/.extension?
    http://www.foobar.com/foo/bar/foobar.html
    http://www.foobar.com/foo/bar/foo.bar.html
    http://perlmonks.com/index.pl?node_id=68135
    http://perlmonks.com/index.pl??node_id=68135
    http:///file.ext?
    http:///.ext?
    http:///file.ext
    http:///.ext
);

####
# split_url($url)
#
# Splits an http:// URL into five pieces and returns them as a list:
#   (site, path, filename, extension, parameter string)
# Only the site is mandatory; every piece that is absent from the URL is
# returned as undef, so a successful match always yields five elements.
# Returns the empty list when the URL does not match at all.
# Call it in list context (in scalar context it yields the element count).
#
# Two fixes compared with the earlier pattern:
#   * the filename may now be a single character ("/a", "/a.pl"); it used
#     to need at least two, so "/a" failed and "/a.pl" lost its extension
#     to the filename;
#   * the extension may no longer contain "/", so it cannot swallow
#     later path segments.
sub split_url {
    my ($url) = @_;

    my @parts = $url =~ m!^http://  # must begin with http://
        (                   # capture the site
            [^/?]+          #   the site has no / or ? in it
        )                   # the site is mandatory
        (                   # capture the path
            /               #   it starts with a /
            (?:             #   group but do not capture
                [^/?]+      #     anything but / or ?
                /           #     followed by a /
            )*              #   zero or more times
        )?                  # the path is optional
        (                   # capture the filename
            [^./?]          #   does not start with a . or ? or /
            [^/?]*?         #   then as few non-/ non-? chars as possible,
                            #   leaving the rest for the groups below
        )?                  # we do not have to have a filename
        (                   # capture the extension
            \.              #   it starts with a dot
            [^./?]*         #   then any chars that are not . or / or ?
        )?                  # an extension is not required either
        (                   # capture the parameter string
            \?              #   it starts with a ?
            .*              #   and has any chars following
        )?                  # optional too
        $                   # and that is the end of the URL
    !x;                     # /x: ignore whitespace and comments in the regex

    return @parts;
}

# Try every sample URL and report how it was split.
foreach my $url (@url) {
    if (my @parts = split_url($url)) {
        # Unmatched optional groups are undef; print them as empty fields
        # (same output as before, but without "uninitialized" warnings).
        print "$url\t"
            . join(',', map { defined $_ ? $_ : '' } @parts)
            . "\n";
    }
    else {
        print "NOMATCH:$url\n";    # oops, is this ok?
    }
}    # on to the next URL, to see if we do better :)