#!/usr/bin/perl use strict; use warnings; my @test_numbers; my $region; while () { # read all region codes # remove the leading zero as it is mandatory # and also an indicator for a region code next unless /^0(\d+)\s+(.*)/g; # get the code my $rc= $1; # and it's city (the city isn't really used. Could be any string) my $city= $2; # store each region code for testing later push(@test_numbers, "x0${rc}x"); # Build up the subtree $region= build_up($region, $rc, $city); } # testcode # use Data::Dumper; # print Dumper $region; # create the regular expression my $re= "(0" . rc2re($region) . ")"; print "$re\n"; # now check all region codes foreach (@test_numbers) { s/$re/-$1-/o; # print $_,"\n"; die "wrong! $_ " unless /^x-/ and /-x$/; } print "success!\n"; exit; sub build_up { my($subtree, $rc, $city)= @_; # if the region code is empty if ( $rc eq '' ) { # we return the city if we don't have a subtree yet return $city unless $subtree; # if we have a subtree, we already have region codes # starting with the same sequence if (ref $subtree) { # So we add a "new digit" (-) for this city $subtree->{'-'}= $city; return $subtree; } # otherwise we had the same region code for a different # city (which is legal) and we simply extend the city's name return "$subtree, $city"; } # If the region code isn't empty we create a new subtee # if we don't already have one if (not defined $subtree) { $subtree={}; } # otherwise we might have a city name elsif (not ref $subtree) { # this is the same case as above but now # we already had a complete region code # starting with the same digits $subtree= { '-' => $subtree }; } # We get the first digit my $f= substr($rc,0,1); # And build it's subtree. $subtree->{$f}= build_up($subtree->{$f}, substr($rc,1), $city); # done return $subtree; } sub rc2re { my($region)= @_; # Collect all end digits my $ep= join '', sort grep { $_ ne '-' and not ref $region->{$_} } keys %$region; # If it's more than 1 make it a character class $ep= "[$ep]" if length $ep>1; # If we had the special digit "-", make the end digits optional $ep= "$ep?" if $region->{'-'}; # Join all regular expressions for all subtrees my $mr= join '|', map { $_ . rc2re($region->{$_}) } sort grep { ref $region->{$_} } keys %$region; # If we had subtrees if (length $mr>0) { # join them with the end digits (if any) $ep.='|' if $ep ne ''; $ep= "(?:$ep$mr)"; } return $ep; } __DATA__ 0201 Essen, Ruhr 0202 Wuppertal 0203 Duisburg 02041 Bottrop 02043 Gladbeck Westf 0208 Muelheim a.d. Ruhr 0208 Oberhausen Rheinl 0212 Solingen 02129 Haan Rheinl 030 Berlin 03301 Oranienburg 03302 Hennigsdorf 03303 Birkenwerder 03304 Velten 033051 Nassenheide 033053 Zehlendorf Kr Oberhav 033054 Liebenwalde 033055 Kremmen 033056 Muehlenbeck Kr Oberhav 03306 Gransee 03307 Zehdenick 040 Hamburg 04101 Pinneberg 04102 Ahrensburg 04103 Wedel Holst 04104 Aumuehle b Hamburg 04105 Seevetal 04106 Quickborn Kr Pinneber 04107 Siek Kr Stormarn 04108 Rosengarten Kr Harbur 04109 Tangstedt Bz Hamburg 04131 Lueneburg 04132 Amelinghausen 04133 Wittorf Kr Lueneburg 04134 Embsen Kr Lueneburg 04135 Kirchgellersen 04136 Scharnebeck 04137 Barendorf 04138 Betzendorf Kr Lünebur 04139 Hohnstorf