#!/usr/bin/perl -w # # The challenge - to write a function which given a list of non # negative integers returns a regexp which will match those and only # those numbers. # # Try the test program like this :- # #-- Simple numeric list # ./numeric-list-to-regexp.pl 0..255 # \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5] #-- Sparse numeric list # ./numeric-list-to-regexp.pl 0..11,15,21..33 # \d|1[015]|2[1-9]|3[0-3] #-- Numbers divisble by 3 # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33' # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369] #-- All prime numbers < 100 # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)' # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97 use strict; die "Pass some perl in please, eg 1..20 or 1, 4, 5" unless @ARGV; my @list = eval "@ARGV"; my $re = numeric_list_to_regexp( @list ); check_numeric_list_to_regexp($re, \@list); exit; ############################################################ # Converts a list of numbers into a regexp which will # match those numbers and those numbers only. # # It does this by constructing a regexp and then progressively # simplifying it - recursively if necessary. It uses regexp's to # transform the regexp of course! This is almost a general purpose # regexp optimiser. # # We assume that the caller will bound the regexp with ^( and )$ or # \D(?: and )\D or whatever takes their fancy # # Set $DEBUG to 1 if you want to print lots of info and check the # regexp works after each transformation. # # Warning: code contains heavy regexps - lift with care ;-) # Caution: Code may use exponential time and space ;-( ############################################################ sub numeric_list_to_regexp { my (@list) = @_; my $DEBUG = 0; # The basic regexp with |'s on the start and end to make our life # easier # Should uniq here too... my $re = "|" . join("|", sort { $a <=> $b } @list) . "|"; # Transform the regexp in stages, making sure at all time the # regexp is correct if $DEBUG is set check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 1) Concatenate all the single characters a|b|c into [abc]'s $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) } { my ( $string ) = ( $1 ); print "string = '$string'\n" if $DEBUG; "|[" . join("", split m{\|}, $string) . "]" }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)] $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) } { my ( $string, $prefix ) = ( $1, $2 ); print "prefix = '$prefix', string = '$string'\n" if $DEBUG; "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")" }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 3) Find all the aX|bX|cX and change to (a|b|c)X] $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) } { my ( $string, $postfix ) = ( $1, $3 ); print "postfix = '$postfix', string = '$string'\n" if $DEBUG; $string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx; print "...string = '$string'\n" if $DEBUG; "|(?:$string)$postfix" }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 4) Change (?:a|b|c) into [abc] $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) } { my ( $string ) = ( $1 ); print "string = '$string'\n" if $DEBUG; "[" . join("", split m{\|}, $string) . "]" }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 5) Optimise [abc] into [a-c] or \d # This doesn't optimise all the cases only the complete continuous # range in the [ ... ] $re =~ s{ \[ ( \d{3,} ) \] } { my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) ); print "match ['$string']...range [$start-$end]\n" if $DEBUG; if ($end - $start + 1 == length $string) { $start == 0 && $end == 9 ? '\d' : "[$start-$end]"; } else { "[$string]"; } }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 6) recurse on any digit sequences left (?:ab|cd|ef) $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) } { my ( $string ) = ( $1 ); print "**** Recursing on '$string'\n" if $DEBUG; "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")"; }gex; check_numeric_list_to_regexp($re, \@list) if $DEBUG; # 7) fix the | on each end $re =~ s{^\|}{}; $re =~ s{\|$}{}; print "**** Returning '$re'\n" if $DEBUG; return $re; } ############################################################ # Test subroutine to check the regexp performs as advertised # # Call with a regexp and a reference to a list of numbers # it will check that the regexp matches all the list and # doesn't match some others (obviously it can't check them # all can it!) die-ing on any failures. ############################################################ sub check_numeric_list_to_regexp { my ($re, $list) = @_; my %list = map { $_ => 1 } @$list; print "Re: $re\n"; # Put some other test cases in $list{$_} += 0 for (0..999); $list{int(rand()*1000)} += 0 for (0..99); $list{int(rand()*10000)} += 0 for (0..99); $list{int(rand()*100000)} += 0 for (0..99); # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n"; $re =~ s{^\|}{}; # fix | on start and end $re =~ s{\|$}{}; $re = "^(?:$re)\$"; # put in ^(?: ... )$ $re = qr{$re}; # compile the regexp for speed # Check all the keys in list against the regexp - some should pass # and some should fail for my $item (keys %list) { if ($list{$item} xor ($item =~ /$re/)) { die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n"; } else { # print "OK '$re' for '$item'\n"; } } }