For completeness, here is the answers above with the use of Test::More
use utf8 is needed because of the accented characters in the text in the bottom part
use strict;
use warnings;
use utf8;
use Test::More;
use open OUT => ':encoding(utf8)';
use open ':std';
my $pos=1;
while ( my $t = <DATA> ) {
chomp $t;
#printf "orig: %-30s translated: %s\n", $t, translate_yeli2($t);
my @data = split/\s\*\*\s/, $t;
my $res = translate_eily2($data[0]);
# die $data[1];
ok ($res eq $data[1], "*" . $data[0]. "* -> *" . $res . "* / *". $d
+ata[1]. "* " . $pos++);
}
done_testing;
sub translate {
my $str = shift;
$str =~ tr/-/ /; #replace - with a space
$str =~ tr/a-zA-Z/ /cs; #replace non letter with a space
my @words = split( /\s+/, $str );
foreach my $w (@words) {
#insert a space when a upper case is inside a word
if ( $w =~ /\p{isLower}\p{isUpper}/ ) {
my @all;
while ( $w =~ m/\G(\p{isUpper}*\p{isLower}+)/g ) {
push @all, $1;
}
$w = join( " ", @all );
}
else {
$w = ucfirst( lc($w) ); # we are using side effect of fore
+ach loop
}
}
return join( ' ', @words );
}
sub translate_eily {
my $name = shift;
$name =~ s/
[^a-zA-Z]+ # Non letter chars
| #
(?<= [a-zA-Z] ) # Something that comes after a let
+ter
(?= [A-Z] ) # and comes before an uppercase le
+tter
/ /xg;
# \u is short for ucfirst and \L for lc
$name =~ s/(\w+)/\u\L$1/g;
$name =~s/\s+$//g;
return $name;
}
sub translate_hippo {
my $str = shift;
$str =~ tr/a-zA-Z/ /cs; #replace non letter with a space
my @words = split( /\s+/, $str );
foreach my $w (@words) {
#insert a space when a upper case is inside a word
$w =~ s/(\p{isLower})(\p{isUpper})/$1 $2/g
or $w =
ucfirst( lc($w) );
}
return join( ' ', @words );
}
sub translate_choroba {
my ($str) = @_;
$str =~ tr/-/ /;
#$str =~ tr/a-zA-Z/ /cs;
$str =~ tr/a-zA-Z/ /cs; #replace non letter with a space
$str =~ s/(?<=\p{isLower})(?=\p{isUpper})/ /g;
$str =~ s/(?:(?<=\s)|(?<=^))(\p{isLower})/\u$1/g;
$str =~ s/\s+$//r;
}
sub translate_eily2 {
my $name = shift;
$name =~ s/
\P{isLetter}+ # Non letter chars
| # or
(?<= \p{isLetter} ) # Something that comes after a
+ letter
(?= \p{isUpper} ) # and comes before an upperc
+ase letter
/ /xg;
# print "after s: *$name*\n";
# \u is short for ucfirst and \L for lc
$name =~ s/(\w+)/\u\L$1/g;
$name =~s/\s+$//g;
return $name;
}
__DATA__
Acierno James S., Jr. ** Acierno James S Jr
Ackermann-Hirschi L. ** Ackermann Hirschi L
Alba-Castro Jose-Luis ** Alba Castro Jose Luis
Boulangère Françoise ** Boulangère Françoise
AlconadaVerzini M. J. ** Alconada Verzini M J
AmorDosSantos S. P ** Amor Dos Santos S P
da Costa F. Barreiro Guimaraes ** Da Costa F Barreiro Guimaraes
deRenstrom P. A. Bruckman ** De Renstrom P A Bruckman
Fauccigiannelli M. ** Fauccigiannelli M
FaucciGiannelli M. ** Faucci Giannelli M
Yao W-M ** Yao W M
Yao W-M. ** Yao W M
Yao W. -M ** Yao W M
Yao W. -M. ** Yao W M
|