in reply to Re^4: list of unique strings, also eliminating matching substrings
in thread list of unique strings, also eliminating matching substrings
Try this on that same dataset and let me know how you get on. On my generated test data it takes ~30 minutes for 200,000 strings, though I can't say how realistic that dataset is.
Use thisScript.pl inFile > outFile:
#! perl -slw
# Reads lines from the files named on the command line (or STDIN), removes
# duplicates, and prints only those strings that do not occur inside any
# other string. The elapsed time is reported on STDERR.
#
# Approach: the strings are sorted by length and joined with NUL separators.
# A string can only be contained in one that is at least as long, so each
# string is searched for only in the part of the joined buffer after itself.
use strict;

use Inline C => Config => BUILD_NOISY => 1;
use Inline C => <<'END_C', NAME => '_906020', CLEAN_AFTER_BUILD => 0;
#include <string.h>

/*
 * Search for needle in haystack, starting at byte position offset.
 *
 * The haystack is expected to be a sequence of strings separated by NUL
 * bytes; a window that ends on a separator is skipped in one step.
 *
 * Returns 1 if the needle occurs at or after offset, otherwise 0.
 * An empty needle, a negative offset, or an offset that leaves fewer
 * bytes than the needle length all yield 0.
 */
int longCmp( SV *needle, SV *haystack, SV *offset ) {
    STRLEN ln, lh, last, i;
    char *n = SvPV( needle, ln );
    char *h = SvPV( haystack, lh );
    IV o = SvIV( offset );

    /* Check the bounds before moving the pointer, so it never leaves the buffer. */
    if( ln == 0 || o < 0 || (STRLEN)o > lh || lh - (STRLEN)o < ln ) return 0;

    h  += o;
    lh -= (STRLEN)o;
    last = lh - ln;                 /* last window start that still fits */

    for( i = 0; i <= last; ++i ) {
        /* The window ends on a separator, so no window overlapping it can
         * match. Advance by ln - 1 here; the loop increment then puts the
         * next window on the first byte after the separator. */
        if( h[ i + ln - 1 ] == '\0' ) { i += ln - 1; continue; }

        /* Cheap rejection on the first and last bytes before the full compare. */
        if( h[ i ] != n[ 0 ] || h[ i + ln - 1 ] != n[ ln - 1 ] ) continue;

        /* Report found as 1 rather than the position: position 0 is a
         * valid match and must not look like a miss to the caller. */
        if( memcmp( h + i, n, ln ) == 0 ) return 1;
    }
    return 0;
}
END_C

use Time::HiRes qw[ time ];

# Return the distinct values of the argument list, in no particular order.
sub uniq { my %seen; @seen{ @_ } = (); keys %seen }

my $start = time;

# Strip line endings before de-duplicating, so that a final line without a
# newline is still recognised as a duplicate of an identical earlier line.
chomp( my @lines = <> );
my @uniq = uniq @lines;
@uniq = sort { length $a <=> length $b } @uniq;

# One buffer holding every string, shortest first, separated by NUL bytes.
my $all = join chr(0), @uniq;

# $p is kept at the position in $all just past the current string and its
# separator, i.e. the start of the strings that come after it.
my $p = 0;
for my $x ( @uniq ) {
    $p += 1 + length $x;
    next if longCmp( $x, $all, $p );
    print $x;
}

printf STDERR "Took %.3f\n", time() - $start;
__END__
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^6: list of unique strings, also eliminating matching substrings
by lindsay_grey (Novice) on May 30, 2011 at 20:07 UTC | |
by BrowserUk (Patriarch) on May 30, 2011 at 20:24 UTC | |
by LanX (Saint) on Jun 02, 2011 at 15:05 UTC | |
by BrowserUk (Patriarch) on Jun 03, 2011 at 05:37 UTC | |
by LanX (Saint) on Jun 03, 2011 at 11:52 UTC | |
| |
by LanX (Saint) on Jun 02, 2011 at 16:02 UTC | |
by BrowserUk (Patriarch) on Jun 02, 2011 at 16:22 UTC |