Does anyone know what the performance penalty of utf8 hash keys -- even if they contain only ASCII chars -- is?
Benchmarking script:
use strict;
use warnings;
use Encode qw( _utf8_on );
use Benchmark::Timer;
use Devel::Peek;
my $timer = Benchmark::Timer->new;
my $latin1 = "foo";
my $utf8 = "foo";
_utf8_on($utf8);
my %latin1_hash = ( $latin1 => 1 );
my %utf8_hash = ( $utf8 => 1 );
warn "===================================\n";
warn "Latin-1 hash key:\n----------------------------------\n";
Dump \%latin1_hash;
warn "===================================\n";
warn "UTF-8 hash key:\n-----------------------------------\n";
Dump \%utf8_hash;
warn "===================================\n";
for my $iter ( 1 .. 10 ) {
my $total = 0;
$timer->start("Latin-1 hash key, Latin-1 probe");
$total += $latin1_hash{$latin1} for 1 .. 1_000_000;
$timer->stop("Latin-1 hash key, Latin-1 probe");
$total = 0;
$timer->start("Latin-1 hash key, UTF-8 probe");
$total += $utf8_hash{$latin1} for 1 .. 1_000_000;
$timer->stop("Latin-1 hash key, UTF-8 probe");
$total = 0;
$timer->start("UTF-8 hash key, Latin-1 probe");
$total += $latin1_hash{$utf8} for 1 .. 1_000_000;
$timer->stop("UTF-8 hash key, Latin-1 probe");
$total = 0;
$timer->start("UTF-8 hash key, UTF-8 probe");
$total += $utf8_hash{foo} for 1 .. 1_000_000;
$timer->stop("UTF-8 hash key, UTF-8 probe");
}
warn scalar $timer->reports . "\n";
Results for vanilla custom-compiled Perl 5.10.0 on Mac OS X:
marvin@smokie:~/perltest $ perl hash_key_bench.pl
===================================
Latin-1 hash key:
----------------------------------
SV = RV(0x8503ac) at 0x8503a0
REFCNT = 1
FLAGS = (TEMP,ROK)
RV = 0x84df80
SV = PVHV(0x809048) at 0x84df80
REFCNT = 2
FLAGS = (PADMY,SHAREKEYS)
ARRAY = 0x200f60 (0:7, 1:1)
hash quality = 100.0%
KEYS = 1
FILL = 1
MAX = 7
RITER = -1
EITER = 0x0
Elt "foo" HASH = 0x238678dd
SV = IV(0x85036c) at 0x850370
REFCNT = 1
FLAGS = (IOK,pIOK)
IV = 1
===================================
UTF-8 hash key:
-----------------------------------
SV = RV(0x8503ac) at 0x8503a0
REFCNT = 1
FLAGS = (TEMP,ROK)
RV = 0x84df40
SV = PVHV(0x80905c) at 0x84df40
REFCNT = 2
FLAGS = (PADMY,SHAREKEYS,HASKFLAGS)
ARRAY = 0x200f80 (0:7, 1:1)
hash quality = 100.0%
KEYS = 1
FILL = 1
MAX = 7
RITER = -1
EITER = 0x0
Elt "foo" [UTF8 "foo"] HASH = 0x238678dd
SV = IV(0x85038c) at 0x850390
REFCNT = 1
FLAGS = (IOK,pIOK)
IV = 1
===================================
Results:
-----------------------------------
10 trials of Latin-1 hash key, Latin-1 probe (2.272s total), 227.159ms
+/trial
10 trials of Latin-1 hash key, UTF-8 probe (2.248s total), 224.823ms/t
+rial
10 trials of UTF-8 hash key, Latin-1 probe (3.100s total), 309.985ms/t
+rial
10 trials of UTF-8 hash key, UTF-8 probe (1.893s total), 189.301ms/tri
+al