#/usr/bin/perl use warnings; use strict; use Benchmark qw(cmpthese); use Sort::Naturally; my $decimal = '.'; # decimal point indicator for "natural_sort" my $separator = ','; # thousands separator for "natural_sort" my @array = (); # load in some test data open my $fh, '>', 'test.txt'; # open a file to write to for comparison # write the data out to the file sorted by the different methods print $fh "natural sort\n", natural_sort(@array); print $fh '#' x 80, "\n"; print $fh "Natural::Sort\n", nsort(@array); print $fh '#' x 80, "\n"; print $fh "alphanum\n", sort { alphanum( $a, $b ) } @array; # scale up the data file to a reasonably large size to get a # better idea of how each algorithm scales for ( 0 .. 9 ) { push @array, @array; } # print the size of the data set print scalar @array, " items in array...\n"; my @temp; #temporary array to hold the results of the sorts. cmpthese( -60, { 'alphanum' => sub { @temp = sort { alphanum( $a, $b ) } @array; }, 'Sort::Naturally' => sub { @temp = nsort(@array) }, 'natural_sort' => sub { @temp = natural_sort(@array) }, } ); ############################################################################### # deaccent will force sorting of Latin-1 word characters above \xC0 to be # treated as their base or equivalent character. sub deaccent { my $phrase = shift; return $phrase unless ( $phrase =~ y/\xC0-\xFF// ); #short circuit if no upper chars # translterate what we can (for speed) $phrase =~ tr/ÀÁÂÃÄÅàáâãäåÇçÈÉÊËèéêëÌÍÎÏìíîïÒÓÔÕÖØòóôõöøÑñÙÚÛÜùúûüÝÿý/AAAAAAaaaaaaCcEEEEeeeeIIIIiiiiOOOOOOooooooNnUUUUuuuuYyy/; # and substitute the rest my %trans = qw(Æ AE æ ae Þ TH þ th Ð TH ð th ß ss); $phrase =~ s/([ÆæÞþÐðß])/$trans{$1}/g; return $phrase; } # no-sep will allow the sorting algorithm to ignore (mostly) the presence # of thousands separators in large numbers. It is configured by default # to be comma, but can be changed to whatever is desired. (a likely possibility is .) sub no_sep { my $phrase = shift; $phrase =~ s/\Q$separator\E//g; return $phrase; } # Very fast natural sort routine. If (not) desired, delete the no-sep and deaccent # modifiers to remove those effects. sub natural_sort { my $i; no warnings q/uninitialized/; s/((\Q$decimal\E0*)?)(\d+)/("\x0" x length $2) . (pack 'aNa*', 0, length $3, $3)/eg, $_ .= ' ' . $i++ for ( my @x = map { lc deaccent no_sep $_} @_ ); @_[ map { (split)[-1] } sort @x ]; } ############################################################################### # from http://www.davekoelle.com/alphanum.html sub alphanum { # split strings into chunks my @a = chunkify( $_[0] ); my @b = chunkify( $_[1] ); # while we have chunks to compare. while ( @a && @b ) { my $a_chunk = shift @a; my $b_chunk = shift @b; my $test = ( ( $a_chunk =~ /\d/ ) && ( $b_chunk =~ /\d/ ) ) ? # if both are numeric $a_chunk <=> $b_chunk : # compare as numbers $a_chunk cmp $b_chunk; # else compare as strings # return comparison if not equal. return $test if $test != 0; } # return longer string. return @a <=> @b; } # split on numeric/non-numeric transitions sub chunkify { my @chunks = split m{ # split on (?= # zero width (?<=\D)\d | # digit preceded by a non-digit OR (?<=\d)\D # non-digit preceded by a digit ) }x, $_[0]; return @chunks; } ############################################################################### __DATA__ 1 2.3 7 .7 .07 .009 .007 0.008 0.08 0.0008 1000 100.0 1,100.0 1,100.2 1100.95 1100.0 1100 08057 07011 90210 19105 2 20 20A 20X 20.1 200 2000 10000000000 1,000,000,001.00 1000000001.00 1000000001 1000000001.60 12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678902 12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901 1st 2nd 3rd 33rd 144th apple Ball bald car Card Æon aether niño nina e-mail évian evoke foo fooa fool foo1 foo11 foo2 p4 p5 P6 p10 z1.doc z10.doc z100.doc z101.doc z102.doc z11.doc z12.doc z13.doc z2.doc z20.doc z3.doc z7.doc z8.doc z9.doc