#!/usr/bin/perl -wl use strict; use Inline C => Config => LDDLFLAGS => '-shared ../../../popcount.o'; use Inline "C"; use Benchmark "cmpthese"; my $s = pack "C*", map { int rand 256 } 1 .. 2**18; # 256k #print unpack q/%32b*/, $s; #print count_bits($s); cmpthese (-2, { unpack => sub { my $n = unpack q/%32b*/, $s; }, popcnt => sub { my $n = count_bits($s); }, }); __END__ __C__ extern unsigned long popcount(char *, int64_t); // the asm routine unsigned long count_bits (SV* bitstr) { // XS wrapper STRLEN l; char *s = SvPV(bitstr, l); return popcount(s, l); } #### Rate unpack popcnt unpack 1252/s -- -91% popcnt 13581/s 984% -- #### $ cat /proc/cpuinfo | grep name | head -1 model name : AMD Phenom(tm) 9600 Quad-Core Processor #### bits 64 global popcount section .text ; prototype: unsigned long popcount(char*, int64_t); popcount: push rbp mov rbp, rsp mov rcx, rsi shr rcx, 3 mov rsi, rdi xor rax, rax .cnt popcnt rbx, [rsi] add rax, rbx add rsi, 8 loop .cnt mov rsp, rbp pop rbp ret