#!/usr/bin/perl
use strict;
use warnings;

# Helpers (plus a small demo) for normalising byte strings that mix
# ISO 8859-1 (Latin-1) and UTF-8 encoded text into one single encoding.
#
# All subs operate on byte strings (octets), not on decoded Perl character
# strings.  Detection is heuristic: any run of high bytes that happens to
# form well-formed UTF-8 is taken to be UTF-8, so a Latin-1 pair such as
# "\xc3\xa4" cannot be told apart from a UTF-8 encoded U+00E4.

# A well-formed two-byte UTF-8 sequence (U+0080 .. U+07FF).
my $UTF8_2_BYTES = qr/[\xc2-\xdf][\x80-\xbf]/;

# A well-formed three- or four-byte UTF-8 sequence (U+0800 .. U+10FFFF).
# Overlong forms, UTF-16 surrogates and values above U+10FFFF are excluded
# on purpose, so such bytes fall through and are handled as Latin-1.
my $UTF8_3_OR_4_BYTES = qr/
      \xe0[\xa0-\xbf][\x80-\xbf]
    | [\xe1-\xec\xee\xef][\x80-\xbf]{2}
    | \xed[\x80-\x9f][\x80-\xbf]
    | \xf0[\x90-\xbf][\x80-\xbf]{2}
    | [\xf1-\xf3][\x80-\xbf]{3}
    | \xf4[\x80-\x8f][\x80-\xbf]{2}
/x;

# Any well-formed multi-byte UTF-8 sequence.
my $UTF8_SEQUENCE = qr/$UTF8_2_BYTES|$UTF8_3_OR_4_BYTES/;

# One unit of non-ASCII input: a UTF-8 sequence if one starts here,
# otherwise a single high byte (taken to be Latin-1).
my $HIGH_TOKEN = qr/$UTF8_SEQUENCE|[\x80-\xff]/;

# force_utf8($string)
#   Returns a copy of $string in which every high byte that is not already
#   part of a well-formed UTF-8 sequence is re-encoded from Latin-1 to
#   UTF-8.  Existing UTF-8 sequences and ASCII are left untouched.
#   An undefined argument is returned as is.
sub force_utf8 {
    my $string = shift;
    return $string unless defined $string;

    $string =~ s/($HIGH_TOKEN)/encode_char_utf8($1)/ge;
    return $string;
}

# force_latin($string)
#   Returns a copy of $string in which every well-formed UTF-8 sequence is
#   converted to its Latin-1 byte.  Characters that do not exist in
#   Latin-1 (above U+00FF) are silently removed; lone high bytes are kept.
#   An undefined argument is returned as is.
sub force_latin {
    my $string = shift;
    return $string unless defined $string;

    $string =~ s/($HIGH_TOKEN)/decode_char_utf8($1)/ge;
    return $string;
}

# encode_char_utf8($char)
#   $char is either one complete UTF-8 sequence or a single byte.
#   Returns the UTF-8 encoding of that unit: UTF-8 sequences and ASCII
#   bytes come back unchanged, a Latin-1 high byte becomes two bytes.
sub encode_char_utf8 {
    my $char = shift;

    # Already UTF-8: nothing to do.
    return $char if $char =~ /\A$UTF8_SEQUENCE\z/;

    my $value = ord($char);

    # ASCII is valid UTF-8 as is; encoding it would yield an overlong form.
    return $char if $value < 0x80;

    # U+0080 .. U+00FF always fit the two-byte layout 110xxxxx 10xxxxxx.
    return chr(0xc0 | ($value >> 6)) . chr(0x80 | ($value & 0x3f));
}

# decode_char_utf8($char)
#   $char is either one complete UTF-8 sequence or a single byte.
#   Returns the Latin-1 byte for that unit, or '' when the character has
#   no Latin-1 representation.  Anything that is not a UTF-8 sequence is
#   assumed to be Latin-1 already and returned unchanged.
sub decode_char_utf8 {
    my $char = shift;

    # Three- and four-byte sequences start at U+0800: never Latin-1.
    return '' if $char =~ /\A$UTF8_3_OR_4_BYTES\z/;

    if ($char =~ /\A([\xc2-\xdf])([\x80-\xbf])\z/) {
        my $value = ((ord($1) & 0x1f) << 6) | (ord($2) & 0x3f);
        return $value < 0x100 ? chr($value) : '';
    }

    return $char;
}

# Demo: build a string whose first half is Latin-1 and whose second half
# is UTF-8, then print it raw and normalised to each encoding.
sub main {
    # High bytes are written as \x escapes so that the first half really is
    # Latin-1 no matter which encoding this source file is saved in.
    my $test_string =
          qq{Das \xc5 (auch "bolle-\xc5" genannt, was soviel bedeutet wie }
        . qq{"Kringel-\xc5") ist mit der }
        . force_utf8(
            "d\xe4nischen Rechtschreibreform von 1948 eingef\xfchrt worden."
          );

    print "Source: $test_string\n";
    print "UTF : " . force_utf8($test_string) . "\n";
    print "ISO : " . force_latin($test_string) . "\n";
    return;
}

# Run the demo only when executed as a script, not when loaded by other code.
main() unless caller;
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Mixed ISO-8859/UTF-8 conversion
by Juerd (Abbot) on Oct 04, 2007 at 11:04 UTC | |
by olli (Initiate) on Oct 04, 2007 at 11:39 UTC | |
by Juerd (Abbot) on Oct 04, 2007 at 12:38 UTC | |
|
Re: Mixed ISO-8859/UTF-8 conversion
by zby (Vicar) on Oct 04, 2007 at 10:52 UTC | |
by Juerd (Abbot) on Oct 04, 2007 at 11:28 UTC | |
by zby (Vicar) on Oct 04, 2007 at 19:57 UTC |