in reply to Re^5: How can I do a numeric sort on a substring? (context matters)
in thread How can I do a numeric sort on a substring?

++ Many thanks for tracking down the problem. Much appreciated.

The results are now more in line with what I would have expected. I see that Perl's string handling function, substr, outstrips the regex solutions: I have been recommending, for a very long time, that string functions be chosen over regexes (where they provide equivalent functionality).

I should probably add some ST routines (e.g. STss, STmcs) to see how they fare; for instance, would GRTpe be faster than STss. I'm currently at $work, so I can't do that now; I'll look into it this evening (i.e. ~8-10hrs hence).

— Ken

Replies are listed 'Best First'.
Re^7: How can I do a numeric sort on a substring? [Benchmark: reworked and extended]
by kcott (Archbishop) on Jun 28, 2021 at 08:21 UTC

    I wrapped all of the routines in @{[...]} to provide the list context; that was what I'd used in the preamble tests.

    I added an STss as I had indicated this morning. I decided that STmcs was going to be pretty much the same as STss, so I skipped that one. I did add an mcse which was mcs with map BLOCK replaced by map EXPR.

    sub st_sort_substr { @{[ map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, substr $_, 2], @unordered ]}; } sub map_cat_substr_expr { @{[ map "a-$_", sort { $a <=> $b } map substr($_, 2), @unordered ]}; }

    I saw ++swl's post. There wasn't any code there, so I guessed.

    use Sort::Key 'ikeysort'; use Sort::Key::Natural 'natsort'; ... sub sort_key_integer { @{[ ikeysort { substr $_, 2 } @unordered ]}; } sub sort_key_natural { @{[ natsort @unordered ]}; }

    I ran the benchmark several times; there were no major differences between runs. Here's a sample output, in the spoiler; it's getting very wide (18 subroutines now) and this post is "Re^7", so probably best viewed via the "download" link.

    Perl & OS: v5.34.0 on cygwin Unordered data (for preamble tests): a-10 a-01 a-22 a-2 a-0 a-3 a-000 a-1 a-12345 a-1 Preamble tests: grt_pack_expr: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 grt_pack_expr_q: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_key_integer: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_key_natural: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex_anchored: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex_anch_expr_ni: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex_anch_ni: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex_expr_ni: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_regex_no_index: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 st_sort_substr: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 map_cat_substr: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 map_cat_substr_expr: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 map_cat_substr_len: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_pack: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_regex: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_regex_anchored: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 sort_substr: a-0 a-000 a-01 a-1 a-1 a-2 a-3 a-10 a-22 a- +12345 Legend: GRTpe: grt_pack_expr GRTpeq: grt_pack_expr_q SKi: sort_key_integer SKn: sort_key_natural STr: st_regex STra: st_regex_anchored STraen: st_regex_anch_expr_ni STran: st_regex_anch_ni STren: st_regex_expr_ni STrn: st_regex_no_index STss: st_sort_substr mcs: map_cat_substr mcse: map_cat_substr_expr mcsl: map_cat_substr_len sp: sort_pack sr: sort_regex sra: sort_regex_anchored ss: sort_substr Benchmarks: Note: Unordered data extended with 'map "a-$_", shuffle 0..10000' Rate sra sr sp SKn ss STrn STran STr STra STren ST +raen STss GRTpe GRTpeq mcsl mcs mcse SKi sra 12.0/s -- -1% -59% -61% -76% -81% -81% -81% -81% -81% +-81% -84% -89% -89% -90% -90% -90% -94% sr 12.0/s 1% -- -59% -60% -76% -81% -81% -81% -81% -81% +-81% -84% -88% -89% -90% -90% -90% -94% sp 29.4/s 145% 144% -- -3% -42% -53% -53% -53% -53% -53% +-54% -60% -72% -72% -75% -76% -76% -86% SKn 30.4/s 154% 153% 3% -- -40% -51% -52% -52% -52% -52% +-52% -59% -71% -71% -74% -75% -75% -86% ss 50.4/s 321% 319% 71% 66% -- -19% -20% -20% -20% -20% +-20% -32% -52% -52% -57% -58% -58% -77% STrn 62.4/s 421% 419% 112% 105% 24% -- -0% -0% -0% -0% + -1% -16% -40% -41% -47% -48% -48% -71% STran 62.7/s 424% 421% 113% 106% 24% 0% -- -0% -0% -0% + -1% -15% -40% -40% -47% -48% -48% -71% STr 62.8/s 424% 421% 113% 106% 25% 0% 0% -- 0% -0% + -1% -15% -40% -40% -47% -48% -48% -71% STra 62.8/s 424% 421% 113% 106% 25% 0% 0% 0% -- -0% + -1% -15% -40% -40% -47% -48% -48% -71% STren 62.8/s 424% 421% 113% 106% 25% 0% 0% 0% 0% -- + -1% -15% -40% -40% -47% -48% -48% -71% STraen 63.4/s 429% 426% 116% 108% 26% 1% 1% 1% 1% 1% + -- -15% -39% -40% -46% -47% -47% -71% STss 74.2/s 519% 516% 152% 144% 47% 19% 18% 18% 18% 18% + 17% -- -29% -30% -37% -38% -38% -66% GRTpe 105/s 773% 768% 256% 244% 107% 67% 67% 67% 67% 67% + 65% 41% -- -1% -12% -13% -13% -52% GRTpeq 105/s 779% 774% 258% 246% 109% 69% 68% 68% 68% 68% + 66% 42% 1% -- -11% -13% -13% -51% mcsl 118/s 886% 881% 302% 289% 134% 89% 88% 88% 88% 88% + 86% 59% 13% 12% -- -2% -2% -45% mcs 120/s 905% 900% 310% 296% 139% 93% 92% 92% 92% 92% + 90% 62% 15% 14% 2% -- -0% -44% mcse 120/s 906% 901% 310% 296% 139% 93% 92% 92% 92% 92% + 90% 62% 15% 14% 2% 0% -- -44% SKi 217/s 1708% 1699% 637% 612% 330% 247% 245% 245% 245% 245% +242% 192% 107% 106% 83% 80% 80% --

    And here's the code:

    #!/usr/bin/env perl use strict; use warnings; use namespace::autoclean; use Benchmark 'cmpthese'; use List::Util 'shuffle'; use Sort::Key 'ikeysort'; use Sort::Key::Natural 'natsort'; my @unordered = qw{a-10 a-01 a-22 a-2 a-0 a-3 a-000 a-1 a-12345 a-1}; my %expanded_abbrev_for = ( sr => 'sort_regex', STr => 'st_regex', STrn => 'st_regex_no_index', STren => 'st_regex_expr_ni', sra => 'sort_regex_anchored', STra => 'st_regex_anchored', STran => 'st_regex_anch_ni', STraen => 'st_regex_anch_expr_ni', ss => 'sort_substr', mcs => 'map_cat_substr', mcsl => 'map_cat_substr_len', sp => 'sort_pack', GRTpe => 'grt_pack_expr', GRTpeq => 'grt_pack_expr_q', STss => 'st_sort_substr', mcse => 'map_cat_substr_expr', SKi => 'sort_key_integer', SKn => 'sort_key_natural', ); my %coderef_for = ( sr => \&sort_regex, STr => \&st_regex, STrn => \&st_regex_no_index, STren => \&st_regex_expr_ni, sra => \&sort_regex_anchored, STra => \&st_regex_anchored, STran => \&st_regex_anch_ni, STraen => \&st_regex_anch_expr_ni, ss => \&sort_substr, mcs => \&map_cat_substr, mcsl => \&map_cat_substr_len, sp => \&sort_pack, GRTpe => \&grt_pack_expr, GRTpeq => \&grt_pack_expr_q, STss => \&st_sort_substr, mcse => \&map_cat_substr_expr, SKi => \&sort_key_integer, SKn => \&sort_key_natural, ); print "Perl & OS:\n $^V on $^O\n"; print "Unordered data (for preamble tests):\n @unordered\n"; print "Preamble tests:\n"; my $tests_fmt = " %-22s %s\n"; for my $name (sort keys %coderef_for) { printf $tests_fmt, "$expanded_abbrev_for{$name}:", "@{[$coderef_for{$name}->()]}"; } exit if @ARGV && $ARGV[0] eq '--dry_run'; print "Legend:\n"; my $legend_fmt = " %-7s %s\n"; for my $abbrev (sort keys %expanded_abbrev_for) { printf $legend_fmt, "$abbrev:", $expanded_abbrev_for{$abbrev}; } # Extend @unordered for improved benchmarking push @unordered, map "a-$_", shuffle 0..10000; print "Benchmarks:\n"; print " Note: Unordered data extended with 'map \"a-\$_\", shuffle +0..10000'\n"; my $count = 0; cmpthese $count => \%coderef_for; sub sort_regex { @{[ sort { ($a =~ /(\d+)/)[0] <=> ($b =~ /(\d+)/)[0] } @unordered ]}; } sub st_regex { @{[ map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, (/(\d+)/)[0]] } @unordered ]}; } sub st_regex_no_index { @{[ map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, /(\d+)/] } @unordered ]}; } sub st_regex_expr_ni { @{[ map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, /(\d+)/], @unordered ]}; } sub sort_regex_anchored { @{[ sort { ($a =~ /(\d+)$/)[0] <=> ($b =~ /(\d+)$/)[0] } @unordered ]}; } sub st_regex_anchored { @{[ map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, (/(\d+)$/)[0]] } @unordered ]}; } sub st_regex_anch_ni { @{[ map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, /(\d+)$/] } @unordered ]}; } sub st_regex_anch_expr_ni { @{[ map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, /(\d+)$/], @unordered ]}; } sub st_sort_substr { @{[ map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, substr $_, 2], @unordered ]}; } sub sort_substr { @{[ sort { substr($a, 2) <=> substr($b, 2) } @unordered ]}; } sub map_cat_substr { @{[ map { 'a-' . $_ } sort { $a <=> $b } map { substr $_, 2 } @unordered ]}; } sub map_cat_substr_expr { @{[ map "a-$_", sort { $a <=> $b } map substr($_, 2), @unordered ]}; } sub map_cat_substr_len { @{[ map { 'a-' . $_ } sort { $a <=> $b } map { substr $_, 2, length($_) - 2 } @unordered ]}; } sub sort_pack { @{[ sort { pack(L => substr($a, 2)) cmp pack(L => substr($b, 2)) } @unordered ]}; } sub grt_pack_expr { @{[ map substr($_, 4), sort map pack(L => substr($_, 2)) . $_, @unordered ]}; } sub grt_pack_expr_q { @{[ map substr($_, 8), sort map pack(Q => substr($_, 2)) . $_, @unordered ]}; } sub sort_key_integer { @{[ ikeysort { substr $_, 2 } @unordered ]}; } sub sort_key_natural { @{[ natsort @unordered ]}; }

    — Ken

      Not providing the code is a fairly egregious oversight on my part. Apologies for that.

      Your usage is the same as mine, but my code is in the spoiler for completeness.

      #!/usr/bin/env perl use strict; use warnings; use namespace::autoclean; use Benchmark 'cmpthese'; use List::Util 'shuffle'; use Sort::Key qw /ikeysort/; use Sort::Key::Natural qw /natsort/; my @ordered; my @unordered = qw{a-10 a-01 a-22 a-2 a-0 a-3 a-000 a-1 a-12345 a-1}; my %expanded_abbrev_for = ( sr => 'sort_regex', STr => 'st_regex', STrn => 'st_regex_no_index', STren => 'st_regex_expr_ni', sra => 'sort_regex_anchored', STra => 'st_regex_anchored', STran => 'st_regex_anch_ni', STraen => 'st_regex_anch_expr_ni', ss => 'sort_substr', mcs => 'map_cat_substr', mcsl => 'map_cat_substr_len', sp => 'sort_pack', GRTpe => 'grt_pack_expr', GRTpeq => 'grt_pack_expr_q', SKn => 'sort_key_natural', SKi => 'sort_key_integer', ); my %coderef_for = ( sr => \&sort_regex, STr => \&st_regex, STrn => \&st_regex_no_index, STren => \&st_regex_expr_ni, sra => \&sort_regex_anchored, STra => \&st_regex_anchored, STran => \&st_regex_anch_ni, STraen => \&st_regex_anch_expr_ni, ss => \&sort_substr, mcs => \&map_cat_substr, mcsl => \&map_cat_substr_len, sp => \&sort_pack, GRTpe => \&grt_pack_expr, GRTpeq => \&grt_pack_expr_q, SKn => \&sort_key_natural, SKi => \&sort_key_integer, ); print "Perl & OS:\n $^V on $^O\n"; print "Unordered data (for preamble tests):\n @unordered\n"; print "Preamble tests:\n"; my $tests_fmt = " %-22s %s\n"; for my $name (sort keys %coderef_for) { printf $tests_fmt, "$expanded_abbrev_for{$name}:", "@{[$coderef_for{$name}->()]}"; } exit if @ARGV && $ARGV[0] eq '--dry_run'; print "Legend:\n"; my $legend_fmt = " %-7s %s\n"; for my $abbrev (sort keys %expanded_abbrev_for) { printf $legend_fmt, "$abbrev:", $expanded_abbrev_for{$abbrev}; } # Extend @unordered for improved benchmarking my $max = 10000; push @unordered, map "a-$_", shuffle 0..$max; print "Benchmarks:\n"; print " Note: Unordered data extended with 'map \"a-\$_\", shuffle +0..$max'\n"; my $count = 0; cmpthese $count => \%coderef_for; sub sort_regex { @ordered = sort { ($a =~ /(\d+)/)[0] <=> ($b =~ /(\d+)/)[0] } @unordered; } sub st_regex { @ordered = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, (/(\d+)/)[0]] } @unordered; } sub st_regex_no_index { @ordered = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, /(\d+)/] } @unordered; } sub st_regex_expr_ni { @ordered = map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, /(\d+)/], @unordered; } sub sort_regex_anchored { @ordered = sort { ($a =~ /(\d+)$/)[0] <=> ($b =~ /(\d+)$/)[0] } @unordered; } sub st_regex_anchored { @ordered = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, (/(\d+)$/)[0]] } @unordered; } sub st_regex_anch_ni { @ordered = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { [$_, /(\d+)$/] } @unordered; } sub st_regex_anch_expr_ni { @ordered = map $_->[0], sort { $a->[1] <=> $b->[1] } map [$_, /(\d+)$/], @unordered; } sub sort_substr { @ordered = sort { substr($a, 2) <=> substr($b, 2) } @unordered; } sub map_cat_substr { @ordered = map { 'a-' . $_ } sort { $a <=> $b } map { substr $_, 2 } @unordered; } sub map_cat_substr_len { @ordered = map { 'a-' . $_ } sort { $a <=> $b } map { substr $_, 2, length($_) - 2 } @unordered; } sub sort_pack { @ordered = sort { pack(L => substr($a, 2)) cmp pack(L => substr($b, 2)) } @unordered; } sub grt_pack_expr { @ordered = map substr($_, 4), sort map pack(L => substr($_, 2)) . $_, @unordered; } sub grt_pack_expr_q { @ordered = map substr($_, 8), sort map pack(Q => substr($_, 2)) . $_, @unordered; } sub sort_key_natural { @ordered = natsort @unordered; } sub sort_key_integer { @ordered = ikeysort {substr $_, 2} @unordered; }