package PhoneParse; # Attempts to parse international phone numbers found "in the # wild" where operaters entered the numbers with no attempt at # prior format enforcement. Only handles International Dial # Direct codes, Country Codes, extensions, and the numbers # themselves. No attempt is made to identify "area codes" if # present in the number. use strict; use vars qw( @EXPORT $DEBUG ); use base qw(Exporter); @EXPORT = qw( parse_phone ); use Carp; # Home grown use PhoneNumber; use CountryCodes qw( is_country_code pull_country_code ); sub parse_phone { # Attempt to normalize feral phone numbers Lots of sequential # calls here, where the remnants of the last call are passed # along to the next routine. my $entry = shift; return () if $entry =~ /^\s*$/; # Normalize dividing cues that look like attempts to indicate # alternat phone listings $entry = normalize_alts($entry); # Using those cues, crack into multiple phone numbers my @raw_numbers = split_nums($entry); my @numbers; # storage bin. my @first; # cache for 1st country code and number foreach my $raw_number (@raw_numbers) { # Normalize oddball extension indicators $raw_number = normalize_exts($raw_number); # It helps to strip parens, as opposed to other noise, at # this point. Sequential cohesion, anyone? Bleah. $raw_number = zap_parens($raw_number); # Grab the extensions. my @exts; ($raw_number, @exts) = extract_exts($raw_number); # Trim the fat from the ends $raw_number = zap_border_noise($raw_number); # Yank any International Dial Direct codes, that is, codes # used to dial *out* of various countries. These are of no # interest, although they can serve as an indicator of an # upcoming country code. my $idd; ($raw_number, $idd) = extract_idd($raw_number); # Extensions have been clipped and IDD's plucked. Now look # for stealth dash alternates -- that is to say, a single # dash (normally ubiquitous) that is actually indicating # alternate numbers. This will tank if the lone dash was # meant to indicate an extension. Oh well. $raw_number = normalize_dashslash($raw_number); # Now we can interpolate over alternate suffixes, if any, and # crack each number. my @raws = interpolate($raw_number); foreach my $raw (@raws) { my($num, $cc) = extract_country_code($raw, $first[1]); # Cache first found country code and number @first = ($num, $cc) if !@first && $cc; # Propogate first country code if appropriate. If the root # numbers don't have the same digit count then we do not # propogate the cc. my $native_cc = 1; if (!$cc && $first[1]) { if ($num =~ tr/0-9// == $first[0] =~ tr/0-9//) { $cc = $first[1]; $native_cc = 0; }; } $native_cc = 0 if @raws > 1 && $raw ne $raws[0]; push(@numbers, PhoneNumber->new( num => $num, cc => $cc, ext => \@exts, idd => $idd, )); $numbers[-1]->_native_cc($native_cc); } } # All done @numbers; } sub normalize_dashslash { # Normally dashes cannot tell us much, but if they are towards # the end and the *only* use of a dash on a relatively long # number, it's reasonabe to infer that that the dash is # indicating alternative suffixes for a number. In this case # just replace the dash with something more obvious: a slash. my $raw_number = shift; my $dash_count = $raw_number =~ tr/\-//; return $raw_number unless $dash_count == 1; my $total_dcount = $raw_number =~ tr/0-9//; my($left, $right) = split(/\s*\-\s*/, $raw_number); my($pre, $left) = $left =~ /(.*?)(\d+)$/; my $pre_dcount = $pre =~ tr/0-9//; return $raw_number unless $pre_dcount; my $left_dcount = $left =~ tr/0-9//; my $right_dcount = $right =~ tr/0-9//; my $r_pct = $right_dcount/($pre_dcount + $left_dcount); # If there are lots of digits, proceed. Also proceed on smaller # digit streams if the righthand chunk is "big enough", guessed # and gollied here at around 32%. This avoids simple numbers # such as +dd dddd-dddd and +ddd ddd ddd-ddd if ($total_dcount > 12 || $r_pct <= 0.32) { # We have an "interesting" number if ($right_dcount == $left_dcount || $right_dcount == 1) { # Balanced lengths around a lone dash; probably an # alternate ending. Otherwise an rlength of 1, which is an # alt or an ext but we'll presume alt. return join('/', "$pre$left", $right); } } # No suspicious dashes $raw_number; } sub zap_border_noise { # Zap leading and trailing non-numerics (but not +) my $raw_number = shift; $raw_number =~ s/^[^\+\d]+//; $raw_number =~ s/\D+$//; $raw_number; } sub zap_parens { # Zap parentheses my $raw_number = shift; $raw_number =~ s/[()]//g; $raw_number; } sub extract_idd { # Clip International Direct Dial Codes if they were entered # instead of country codes. Specifically we go after # combinations of leading zeros followed by ones: 00, 011, # 0011, 010, etc. This does not cover all IDDs, but gets many # of them. Oftentimes the country code will remain next in # line. Note that we take special care *not* to clip a mere # leading '1' or '001', the CC for the USA. my $raw_number = shift; my $original_number = $raw_number; # Remove start noise, such as quotes and whitespace $raw_number =~ s/^[^\d\+]+//; # Isolate a '+' if present with dashes, additional pluses, or # any other non-numeric cruft following it. $raw_number =~ s/^\+\D*/\+/; my $idd; if ($raw_number =~ s/^\+(0+1{2,})/\+/) { # Look for a + followed by zeros and at least two 1's, and # replace with a '+'. By far the most common occurrence of # this is '+ 011'. $idd = $1; } elsif ($raw_number =~ s/^\+(0+)1/\+/) { # Check for a + followed by zeros and a single 1. Here we # also check the remaining digit count in order to guess # whether we're dealing with the Country Code of the USA (1) # or an IDD from within somewhere else. my $digit_count = $raw_number =~ tr/[0-9]//; if ($digit_count == 10) { # CC for U.S.A. $idd = $1; $raw_number =~ s/^\+/\+1/; } else { # IDD from within somewhere else? $idd = "${1}1"; } } elsif ($raw_number =~ s/^\+(0+[01]*)/\+/) { # Wrap-up for mandatory '+' sightings: Replace a plus # followed by zero followed by any combination of 1's and 0's # with just a '+'. $idd = $1; } elsif ($raw_number =~ s/^(0+[01]*)/\+/) { # Infer '+' for remaining 01 combinations with no '+', in # particular '00'. $idd = $1; } else { # No idd found return $original_number; } # Booty $raw_number =~ /\d/ ? ($raw_number, $idd) : $original_number; } sub normalize_exts { # attempt to normalize odd ext indicators to a single 'x' my $raw_number = shift; $raw_number =~ s/[\#\*]+\s*(\d+.*)$/x$1/; $raw_number; } sub extract_exts { # Extract extensions. Multiple extensions are assumed to be # indicated with slashes of some sort (hence the earlier # normalizing attempt) my $raw_number = shift; return $raw_number unless $raw_number =~ /\D/; my @exts; if ($raw_number =~ s/[xX]+\D*(\d+.*)$//) { my $ext = $1; $ext =~ s/[^\d,\/\\\|]+//g; @exts = split(/\D+/, $ext); } # clean up non-numeric extension debris $raw_number =~ s/\D+$//; ($raw_number, @exts); } sub normalize_alts { # attempt to normalize delimeters for alternate numbers or # extensions my $entry = shift; return $entry unless $entry =~ /\D/; $entry = lc($entry); $entry =~ s/\s+or\s+/\//g; $entry =~ s/\s*[,;\|]\s*/\//g; $entry; } sub extract_country_code { # Yank or infer country codes if possible my($raw, $cc_known) = @_; my($num, $cc); if ($raw !~ /^\+/) { # No '+', see if the first number group looks like a country # code. If so make sure there are enough digits in the number # to make sense with a country code. if (!$cc_known && ($raw =~ /^(\d+)[\s\-]+\d+/ && is_country_code($1)) || $raw =~ tr/0-9// > 10) { ($num, $cc) = pull_cc_smart($raw); } else { # No country code to pull. Just strip non nums. $num = $raw; $num =~ s/\D+//g; } } else { # There was a leading '+' so we'll have a go at pulling a # country code, even if there is no valid one present. ($num, $cc) = pull_cc_smart($raw); } # Booty ($num, $cc); } sub pull_cc_smart { # Yank country codes by scanning for valid country codes my $raw_number = shift; $raw_number =~ s/\D+//g; my($num, $ccode) = pull_country_code($raw_number); ($num, $ccode); } sub pull_cc_guess { # Attempt to mechanically yank country codes without any # information on what represents a valid cc. my $raw_number = shift; my $pat = qr/^\s*\++[\s\+\-]*(\d+)/; my($ccode) = $raw_number =~ /$pat/; $raw_number =~ s/$pat// if defined $ccode; $raw_number =~ s/\D+//g; ($raw_number, $ccode); } sub split_nums { # Attempt to detect and split multiple numbers. my $raw_number = shift; my @numbers; if ($raw_number =~ tr/0-9// > 18) { if ($raw_number =~ /\+[^\+]+([^\+\s]\s*\+)/) { # Attempt to split on '+' in cases where there # are multiple country codes. @numbers = split("\Q$1\E", $raw_number); map($numbers[$_] = "+$numbers[$_]", 1..$#numbers); } else { # Otherwise go for slashes and length ratios. We # guess/golly chunk lengths of 9 digits or larger. my @chunks; ($numbers[0], @chunks) = split(/\s*[\\\/,]+\s*/, $raw_number); foreach my $chunk (@chunks) { my($ext_guard) = $chunk =~ /^([^x]+)/i; my $chunk_digits = $ext_guard =~ tr/0-9//; if ($chunk_digits >= 9 && $numbers[-1] =~ tr/0-9// >= 9) { push(@numbers, $chunk); } else { $numbers[-1] .= "/$chunk"; } } } # Check for numbers such as +1 555 555 5555 1 444 444 4444 # This is some hard-coded US-centric whack for sure. if (@numbers <= 1) { my @chunks = split(/[\s\+\-]+1[\s\+\-]+/, $raw_number); shift @chunks if $chunks[0] =~ /^\s*$/; if (@chunks >= 2) { @numbers = (); foreach (@chunks) { if (tr/0-9// >= 8) { push(@numbers, "+1 $_"); } else { $numbers[$#numbers] .= " +1 $_"; } } } } } # Booty @numbers ? @numbers : $raw_number; } sub interpolate { # Many times multiple suffixes, rather than whole numbers, are # indicated by slashes, etc. We take these suffixes and join # them to their common prefix. my $raw_num = shift; # Split on our chosen delimeters ( '/' or '\') my($base, @frags) = split(/\s*[\\\/]+\s*/, $raw_num); return $raw_num unless @frags; # Pull the digits from the first alternate in order to capture # the digit count my($nchunk) = $frags[0] =~ /(\d+)/; return $raw_num unless defined $nchunk; # Check to make sure our prefix isn't a shorter stub. If it is, # interpolation makes no sense. my $base_dcount = $base =~ tr/0-9//; my $chunk_dcount = $nchunk =~ tr/0-9//; return $raw_num if $base_dcount <= $chunk_dcount; # Using that length, pull the root number from the string # containing that root plus the *first* alternative. my($prefix) = $base =~ /(.*)\d{$chunk_dcount}$/; # Entirely separate numbers if no prefix. return $raw_num unless defined $prefix; # Apply the prefix to the remaining alternatives. We drop # duplicates. In the real world this probably meant that the # alternative presented was an extension rather than an # alternative. Oh well. my @interpolated; my %seen; foreach ($base, map("$prefix$_", @frags)) { next if $seen{$_}; push(@interpolated, $_); ++$seen{$_}; } # Booty @interpolated; } 1;