use strict; use warnings; use utf8; use Test::More; use open OUT => ':encoding(utf8)'; use open ':std'; my $pos=1; while ( my $t = ) { chomp $t; #printf "orig: %-30s translated: %s\n", $t, translate_yeli2($t); my @data = split/\s\*\*\s/, $t; my $res = translate_eily2($data[0]); # die $data[1]; ok ($res eq $data[1], "*" . $data[0]. "* -> *" . $res . "* / *". $data[1]. "* " . $pos++); } done_testing; sub translate { my $str = shift; $str =~ tr/-/ /; #replace - with a space $str =~ tr/a-zA-Z/ /cs; #replace non letter with a space my @words = split( /\s+/, $str ); foreach my $w (@words) { #insert a space when a upper case is inside a word if ( $w =~ /\p{isLower}\p{isUpper}/ ) { my @all; while ( $w =~ m/\G(\p{isUpper}*\p{isLower}+)/g ) { push @all, $1; } $w = join( " ", @all ); } else { $w = ucfirst( lc($w) ); # we are using side effect of foreach loop } } return join( ' ', @words ); } sub translate_eily { my $name = shift; $name =~ s/ [^a-zA-Z]+ # Non letter chars | # (?<= [a-zA-Z] ) # Something that comes after a letter (?= [A-Z] ) # and comes before an uppercase letter / /xg; # \u is short for ucfirst and \L for lc $name =~ s/(\w+)/\u\L$1/g; $name =~s/\s+$//g; return $name; } sub translate_hippo { my $str = shift; $str =~ tr/a-zA-Z/ /cs; #replace non letter with a space my @words = split( /\s+/, $str ); foreach my $w (@words) { #insert a space when a upper case is inside a word $w =~ s/(\p{isLower})(\p{isUpper})/$1 $2/g or $w = ucfirst( lc($w) ); } return join( ' ', @words ); } sub translate_choroba { my ($str) = @_; $str =~ tr/-/ /; #$str =~ tr/a-zA-Z/ /cs; $str =~ tr/a-zA-Z/ /cs; #replace non letter with a space $str =~ s/(?<=\p{isLower})(?=\p{isUpper})/ /g; $str =~ s/(?:(?<=\s)|(?<=^))(\p{isLower})/\u$1/g; $str =~ s/\s+$//r; } sub translate_eily2 { my $name = shift; $name =~ s/ \P{isLetter}+ # Non letter chars | # or (?<= \p{isLetter} ) # Something that comes after a letter (?= \p{isUpper} ) # and comes before an uppercase letter / /xg; # print "after s: *$name*\n"; # \u is short for ucfirst and \L for lc $name =~ s/(\w+)/\u\L$1/g; $name =~s/\s+$//g; return $name; } __DATA__ Acierno James S., Jr. ** Acierno James S Jr Ackermann-Hirschi L. ** Ackermann Hirschi L Alba-Castro Jose-Luis ** Alba Castro Jose Luis Boulangère Françoise ** Boulangère Françoise AlconadaVerzini M. J. ** Alconada Verzini M J AmorDosSantos S. P ** Amor Dos Santos S P da Costa F. Barreiro Guimaraes ** Da Costa F Barreiro Guimaraes deRenstrom P. A. Bruckman ** De Renstrom P A Bruckman Fauccigiannelli M. ** Fauccigiannelli M FaucciGiannelli M. ** Faucci Giannelli M Yao W-M ** Yao W M Yao W-M. ** Yao W M Yao W. -M ** Yao W M Yao W. -M. ** Yao W M