ok 1 - stripUtf8Entities # before:blah # after: blah ok 2 - stripUtf8Entities # before:Ü -- # after: Ü -- ok 3 - stripUtf8Entities # before:blah -- ’ -- blah # after: blah -- -- blah ok 4 - stripUtf8Entities # before:Ü -- ’ -- blah # after: Ü -- -- blah ok 5 - stripUtf8EntitiesBetter # before:blah # after: blah ok 6 - stripUtf8EntitiesBetter # before:Ü -- # after: Ü -- not ok 7 - stripUtf8EntitiesBetter # before:blah -- ’ -- blah # after: blah -- -- blah # Failed test 'stripUtf8EntitiesBetter # before:blah -- ’ -- blah # after: blah -- -- blah' # at shopImporter-test.pl line 49. Wide character in print at /home/hartman/idealo_external_dependencies/current/localperl/lib/5.8.8/Test/Builder.pm line 1192. # got: 'blah -- â -- blah' # expected: 'blah -- -- blah' not ok 8 - stripUtf8EntitiesBetter # before:Ü -- ’ -- blah # after: Ü -- -- blah # Failed test 'stripUtf8EntitiesBetter # before:Ü -- ’ -- blah # after: Ü -- -- blah' # at shopImporter-test.pl line 49. Wide character in print at /home/hartman/idealo_external_dependencies/current/localperl/lib/5.8.8/Test/Builder.pm line 1192. # got: 'Ã -- â -- blah' # expected: 'Ã -- -- blah' 1..8 # Looks like you failed 2 tests of 8. 
#### $ cat utf8-and-html-entities.pl
#!/usr/angebote/perlroot/bin/perl

# Test script comparing two ways of removing characters that do not survive
# a conversion out of UTF-8 (for example the right single quotation mark
# U+2019) from a string before its HTML markup is stripped.
#
# There is deliberately no "use utf8" here: the string literals below are
# therefore raw UTF-8 byte strings, and both strip functions are exercised
# with byte-string input.

use strict;
use warnings;

# use strict;
# use IO::File;
# use Text::CSV_XS;
# use DBI;
# use Time::Local;
# use Time::HiRes;
# use Compress::Zlib;
# use LWP::UserAgent;
#use POSIX qw(locale_h);
use Encode qw(decode encode);
use HTML::Strip;
use Test::More qw(no_plan);
use Data::Dumper;

#setlocale(LC_CTYPE, "de_DE.ISO8859-1");
require "../../perl/agentFunc.pl";

# Each entry is [ input, expected result after stripping ].
# NOTE(review): deleting the quote leaves both of the spaces that surrounded
# it, so the expected strings of the last two cases contain a double space.
# The copy of this script that was reviewed had its whitespace collapsed;
# confirm the spacing against the original file.
my $stringsBeforeAfter = [
    [ 'blah',              'blah' ],
    [ 'Ü --',              'Ü --' ],
    [ "blah -- ’ -- blah", "blah --  -- blah" ],
    [ "Ü -- ’ -- blah",    "Ü --  -- blah" ],
];

# Functions under test as [ test name, code reference ]; every function is
# run against every case, in the order listed here.
my @strippers = (
    [ "stripUtf8Entities",       \&stripUtf8Entities ],
    [ "stripUtf8EntitiesBetter", \&stripUtf8EntitiesBetter ],
);

foreach my $stripper (@strippers) {
    my ( $name, $code ) = @$stripper;

    foreach my $beforeAfter (@$stringsBeforeAfter) {
        my ( $before, $after ) = @$beforeAfter;
        my $transformed = HTML2Text( $code->($before) );

        #print "strings: " . Dumper( [ $before, $after, $transformed ] );
        is( $transformed, $after, $name );
    }
}

# HTML2Text($text)
#
# Runs $text through a fresh HTML::Strip parser and returns the result.
sub HTML2Text {
    my ($changeText) = @_;

    my $htmlStripObject = HTML::Strip->new();
    $changeText = $htmlStripObject->parse($changeText);

    # The HTML::Strip documentation asks for eof() after each document.
    $htmlStripObject->eof;

    return $changeText;
}

# stripUtf8Entities($string)
#
# Removes every occurrence of the entries in a fixed list (currently only the
# right single quotation mark) from $string and returns the result. An
# undefined argument is treated as the empty string.
#
# Works, but only for the listed special characters; anything else that does
# not translate well out of UTF-8 is left alone. See stripUtf8EntitiesBetter
# for the general version.
sub stripUtf8Entities {
    my $string = shift;

    # A plain "shift || ''" would also turn the valid input "0" into "".
    $string = "" unless defined $string;

    my $utf8Entities = ["’"];
    foreach my $utf8Entity (@$utf8Entities) {

        # \Q...\E: the entity is literal text, not a pattern.
        $string =~ s/\Q$utf8Entity\E//g;
    }

    return $string;
}

# stripUtf8EntitiesBetter($string)
#
# General replacement for stripUtf8Entities: removes every character whose
# code point is above U+00FF, i.e. everything that has no ISO-8859-1
# representation, and keeps the rest (so "Ü" survives, "’" does not).
#
# $string may be a UTF-8 encoded byte string or a Perl character string; the
# result is returned in the same form as the input. In a byte string, byte
# sequences that are not valid UTF-8 decode to U+FFFD and are therefore
# removed as well. An undefined argument is treated as the empty string.
sub stripUtf8EntitiesBetter {
    my $string = shift;
    $string = "" unless defined $string;

    # decode() must only be given bytes; a string that already carries
    # characters is filtered directly.
    my $isCharacterString = utf8::is_utf8($string);
    my $characters =
        $isCharacterString ? $string : decode( 'UTF-8', $string );

    $characters =~ s/[^\x{00}-\x{FF}]//g;

    return $isCharacterString ? $characters : encode( 'UTF-8', $characters );
}