package InvList; use strict; use warnings; #use DDS; use constant UNI_INF => 0x10FFFF+1; =pod Inversion Lists This code implements a form of inversion list for handling charclasses (or any numbered and orderable set) The structure of an inversion list is an ordered array, where the even elements indicate the minimum value that is in the charclass, and odd elements indicate the minimum value that is not in the set. Lookup is by scanning the list to find the pair which straddle the value being looked up. The list (64,66,68,70) represents the char class [ABEF], the list (0,64,66,68,70,0x110000) represents the char class [^ABEF]. =cut our $DEBUG=0; # note decode() is bogus code, but its a proof of concept, so its fine. # the code to parse escapes properly is already in the perl core. sub decode { wantarray ? map { ord substr $_,-1 } @_ : ord substr $_[0],-1 } sub make_inv_list { my ($class) = shift; my @list; while ($class=~s/^(?:((\\?.)-(\\?.))|(\\.)|(.))//) { if (defined $1) { print "Range: $1\n" if $DEBUG; my ($l,$r)=decode($2,$3); die "Bad range $1\n" if $l>$r; push @list, [$l,$r+1]; } elsif (defined $4) { print "Escape: $4\n" if $DEBUG; my ($n) = decode($4); push @list, [$n,$n+1]; } elsif (defined $5) { print "Char: $5\n" if $DEBUG; my ($n) = decode($5); push @list, [$n,$n+1]; } else { Carp::confess("Failed to parse inv_list specifier: $class\n"); } } # make sure the elements in the list are ordered correctly @list=sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @list; #Dump(\@list) if $DEBUG; # merge any overlapping elements in the list, they could have written # something like [A-ZBBBBBBB] for instance. # Also flatten the list so its an array of numbers and not an AoAoN my @ret; my ($in,$out)=@{$list[0]}; for my $idx (1..$#list) { my $rec = $list[$idx]; if ($rec->[0]<=$out && $in <= $rec->[1] ) { $in= $rec->[0] if $in > $rec->[0]; $out= $rec->[1] if $out < $rec->[1]; } else { push @ret,$in,$out; ($in,$out)=@$rec; } } push @ret,$in,$out; #Dump(\@ret) if $DEBUG; return bless \@ret; } # "not" a class sub invert { my $l=shift; my @ret=@$l; if (@ret && $ret[0]==0) { shift @ret; } else { unshift @ret,0; } if ($ret[-1]==UNI_INF) { pop @ret; } else { push @ret,UNI_INF; } return bless \@ret; } # "or" two classes together sub union { my ($l,$r)=@_; my @ret; # make sure $l has the leftmost element in it. ( $l, $r ) = ( $r, $l ) if (($l->[0] <=> $r->[0]) || ($l->[1] <=> $r->[1]))>0; my $lp=2; my $rp=0; my ($in,$out)=@{$l}[0,1]; # while both lists still have elements merge them together as needed while ($lp < @$l && $rp < @$r) { warn "w: $in,$out\n" if $DEBUG; if ( $l->[$lp] <= $out && $in <= $l->[$lp+1] ) { $in= $l->[$lp] if $in > $l->[$lp]; $out= $l->[$lp+1] if $out < $l->[$lp+1]; $lp += 2; } elsif ( $r->[$rp] <= $out && $in <= $r->[$rp+1] ) { $in= $r->[$rp] if $in > $r->[$rp]; $out= $r->[$rp+1] if $out < $r->[$rp+1]; $rp += 2; } else { push @ret,$in,$out; if ((($l->[$lp] <=> $r->[$rp]) || ($l->[$lp+1] <=> $r->[$rp+1]))<1) { ($in,$out)=@{$l}[$lp,$lp+1]; $lp += 2; } else { ($in,$out)=@{$r}[$rp,$rp+1]; $rp += 2; } } } # make sure any remaining elements in @$l are processed for ( ; $lp < @$l ; $lp +=2 ) { warn "$lp: l: ($in,$out) ($l->[$lp],$l->[$lp+1])\n" if $DEBUG; if ( $l->[$lp] <= $out && $in <= $l->[$lp+1] ) { $in= $l->[$lp] if $in > $l->[$lp]; $out= $l->[$lp+1] if $out < $l->[$lp+1]; } else { push @ret,$in,$out; ($in,$out)=@{$l}[$lp,$lp+1]; } } # make sure any remaining elements in @$r are processed for ( ; $rp < @$r ; $rp +=2 ) { warn "$rp: r: ($in,$out) ($r->[$rp],$r->[$rp+1])\n" if $DEBUG; if ( $r->[$rp] <= $out && $in <= $r->[$rp+1] ) { $in= $r->[$rp] if $in > $r->[$rp]; $out= $r->[$rp+1] if $out < $r->[$rp+1]; } else { push @ret,$in,$out; ($in,$out)=@{$r}[$rp,$rp+1]; } } push @ret,$in,$out; return bless \@ret; } # "and" two char classes together sub intersection { my ($l,$r)=@_; my @ret; my ($lp,$rp)=(0,0); while ($lp < @$l && $rp < @$r) { if ( $l->[$lp] < $r->[$rp+1] && $r->[$rp] < $l->[$lp+1]) { my $in = ($l->[$lp] < $r->[$rp]) ? $r->[$rp] : $l->[$lp]; my $out = ($l->[$lp+1] < $r->[$rp+1]) ? $l->[$lp+1] : $r->[$rp+1]; push @ret,$in,$out; } if ( $l->[$lp+1] < $r->[$rp+1] ) { $lp += 2; } else { $rp += 2; } } return bless \@ret; } # "subtract" one set from the other sub difference { my ($l,$r)=@_; my $inv = invert($r); return intersection($l,$inv); } # "xor" two sets sub symmetric_difference { my ($l,$r)=@_; return intersection(invert(intersection($l,$r)),union($l,$r)) } # check if a char is in the set sub has { my ($l,$char)=@_; my $v=decode($char); return 0 if $l->[0]>$v || $l->[-1] < $v; for ( my $i=0; $i<@$l; $i+=2 ) { if ( $l->[$i]<=$v && $v<$l->[$i+1] ) { return 1; } } return 0; } my %protect=map { ord($_)=>1 } qw( - \\ [ ] + & | ^ ); sub d2c { map { $_<32||$protect{$_} ? sprintf "\\%o",$_ : $_>127 ? sprintf "\\x{%X}",$_ : chr $_ } @_ } # stringify an invlsit as a charclass specification sub as_string { my $l = shift; my ($i,@c); #warn "@$l\n"; for ( $i = 0 ; $i < @$l ; $i += 2 ) { my ($in,$out) = @{$l}[$i,$i+1]; if ($in == $out-1) { push @c, d2c($in); } else { push @c, join "-",d2c($in, $out-1); } } #warn "@c\n"; return "[".join("",@c)."]"; } # test the code #die make_inv_list('L-NJ-O')->as_string; use Test::More 'no_plan'; while () { next if /^\s*#/ || !/\S/; my ($c1,$c2,$u,$i,$d) = split /\s+/,$_; my $l1 = make_inv_list($c1); my $l2 = make_inv_list($c2); my $ul = union($l1,$l2); my $il = intersection($l1,$l2); my $dl = difference($l1,$l2); for ( ['|',$u,$ul->as_string], ['&',$i,$il->as_string], ['-',$d,$dl->as_string] ) { is($_->[2],$_->[1],"[$c1] $_->[0] [$c2]"); } } my $l = make_inv_list("\2AEIOU"); my $str=""; for ("A".."Z") { $str.=$_ if $l->has($_); } is($str,'AEIOU','has()'); my $li=invert($l); is($li->as_string,'[\\0-\\1\\3-@B-DF-HJ-NP-TV-\\x{10FFFF}]',"invert([\\2AEIOU]) works"); my $lii=invert($li); my ($liii)=invert($lii); is($l->as_string,$lii->as_string,"Double invert works:".$l->as_string); is($li->as_string,$liii->as_string,"Tripple invert works:".$li->as_string); __DATA__ ##C1 C2 | & - A-C C-E [A-E] [C] [A-B] ABEF B-E [A-F] [BE] [AF] A-C G-I [A-CG-I] [] [A-C] A-Z AEIOU [A-Z] [AEIOU] [B-DF-HJ-NP-TV-Z] ACE BDF [A-F] [] [ACE] YVES EVE [ESVY] [EV] [SY] A-F D-I [A-I] [D-F] [A-C] A-C D-G [A-G] [] [A-C] A-D D-G [A-G] [D] [A-C] A-D C-G [A-G] [C-D] [A-B] D-G A-C [A-G] [] [D-G] D-G A-D [A-G] [D] [E-G] C-G A-D [A-G] [C-D] [E-G] ABC B [A-C] [B] [AC] A-D BC [A-D] [B-C] [AD] A-DX-Z ONML [A-DL-OX-Z] [] [A-DX-Z] A-DE-FG-H A-Z [A-Z] [A-H] [] E-J A-C [A-CE-J] [] [E-J] E-J B-D [B-J] [] [E-J] E-J C-E [C-J] [E] [F-J] E-J D-F [D-J] [E-F] [G-J] E-J E-G [E-J] [E-G] [H-J] E-J F-H [E-J] [F-H] [EI-J] E-J G-I [E-J] [G-I] [E-FJ] E-J H-J [E-J] [H-J] [E-G] E-J I-K [E-K] [I-J] [E-H] E-J J-L [E-L] [J] [E-I] E-J K-M [E-M] [] [E-J] E-J L-N [E-JL-N] [] [E-J] A-CU-Z U-WA-F [A-FU-Z] [A-CU-W] [X-Z] B-DT-Y S-UC-H [B-HS-Y] [C-DT-U] [BV-Y] C-ES-X Q-SE-J [C-JQ-X] [ES] [C-DT-X] D-FR-W O-QG-L [D-LO-W] [] [D-FR-W] E-GQ-V M-OI-N [E-GI-OQ-V] [] [E-GQ-V] F-HP-U K-MK-P [F-HK-U] [P] [F-HQ-U] G-IO-T I-KM-R [G-KM-T] [IO-R] [G-HS-T] H-JN-S G-IO-T [G-JN-T] [H-IO-S] [JN] I-KM-R E-GQ-V [E-GI-KM-V] [Q-R] [I-KM-P] J-LL-Q C-ES-X [C-EJ-QS-X] [] [J-Q] K-MK-P A-CU-Z [A-CK-PU-Z] [] [K-P] L-NJ-O B-DT-Y [B-DJ-OT-Y] [] [J-O] M-OI-N D-FR-W [D-FI-OR-W] [] [I-O] N-PH-M F-HP-U [F-U] [HP] [I-O] O-QG-L H-JN-S [G-LN-S] [H-JO-Q] [GK-L] P-RF-K J-LL-Q [F-R] [J-KP-Q] [F-IR] Q-SE-J L-NJ-O [E-OQ-S] [J] [E-IQ-S] R-TD-I N-PH-M [D-PR-T] [H-I] [D-GR-T] S-UC-H P-RF-K [C-KP-U] [F-H] [C-ES-U] T-VB-G R-TD-I [B-IR-V] [D-GT] [B-CU-V] U-WA-F T-VB-G [A-GT-W] [B-FU-V] [AW]