# Three standalone scripts, separated by "####" lines, followed by a sample
# session. Each section is meant to be saved and run as its own program;
# they redeclare the same lexicals and are not intended to run as one file.

# --- Script 1 of 3: generate the test data file /tmp/PM.db -----------------
# Writes one record per line in the form "id;word;word;...". The words are a
# shuffled window of neighbouring entries from the (already shuffled)
# dictionary list. Stops once -s reports the output file above
# 32_000_000_000 bytes, or when the id range is exhausted.
use 5.014;
use strictures;
use List::Util "shuffle";

open my $words, "<", "/usr/share/dict/words" or die $!;
chomp( my @words = <$words> );
close $words;

# Upper bound for the window's end index.
my $top = @words - 40;
@words = shuffle @words;

open my $db, ">", "/tmp/PM.db" or die $!;
for my $id ( 999_999 .. 999_999_999 ) {
    use integer;    # integer arithmetic for the index computations below
    my $end   = rand($top);
    my $range = rand(35) + 5;
    my $start = $end - $range;
    $start = 0 if $start < 0;    # clamp the window to the start of the list
    say {$db} join ";", $id, shuffle @words[ $start .. $end ];
    last if -s $db > 32_000_000_000;
}
# Buffered write errors surface at close, so check it.
close $db or die $!;

####

# --- Script 2 of 3: index /tmp/PM.db into ./lucy.index ---------------------
# Schema: 'id' is a string field, 'content' is a full-text field analysed
# with the English EasyAnalyzer. Documents are committed in batches of
# 100_000, using a fresh indexer after each commit; the final indexer is
# optimized before the last commit.
use 5.014;
use strictures;
use Lucy;

my $index  = "./lucy.index";
my $schema = Lucy::Plan::Schema->new;
my $easyanalyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
my $text_type    = Lucy::Plan::FullTextType->new( analyzer => $easyanalyzer, );
my $string_type  = Lucy::Plan::StringType->new();
$schema->spec_field( name => 'id',      type => $string_type );
$schema->spec_field( name => 'content', type => $text_type );

open my $db, "<", "/tmp/PM.db" or die $!;
my $indexer = get_indexer();
my $counter = 1;
while (<$db>) {
    chomp;
    # Split only on the first ';': everything after the id is the content.
    my ( $id, $text ) = split /;/, $_, 2;
    $indexer->add_doc({ id => $id, content => $text });
    # True on every 100_000th document.
    unless ( $counter++ % 100_000 ) {
        print "committing a batch...\n";
        $indexer->commit;
        $indexer = get_indexer();
    }
}
close $db;

print "optimizing and committing...\n";
$indexer->optimize;
$indexer->commit;

# Returns a new Lucy::Index::Indexer for $index using $schema
# (constructed with create => 1).
sub get_indexer {
    return Lucy::Index::Indexer->new(
        schema => $schema,
        index  => $index,
        create => 1,
    );
}

####

# --- Script 3 of 3: interactive search against ./lucy.index ----------------
# Prints the total record count, then reads queries from STDIN until EOF or
# "q"/"quit" (case-insensitive). For each query it prints the hit count, the
# elapsed time in milliseconds and up to three matching records.
use 5.014;
use strictures;
use Lucy;
use Time::HiRes "gettimeofday", "tv_interval";
use Number::Format "format_number";

my $index    = "./lucy.index";
my $searcher = Lucy::Search::IndexSearcher->new( index => $index );
my $all      = $searcher->hits( query => Lucy::Search::MatchAllQuery->new );
print "Searching ", format_number( $all->total_hits ), " records.\n";

print "Query (q to quit): ";
# NOTE(review): the readline operand was missing in the stored copy
# ("my $q = )"); restored as <STDIN>, which matches the interactive prompt.
while ( my $q = <STDIN> ) {
    chomp $q;
    exit if $q =~ /\Aq(uit)?\z/i;

    my $t0   = [gettimeofday()];
    my $hits = $searcher->hits( query => $q, num_wanted => 3 );
    printf "\nMatched %s record%s in %1.2f milliseconds\n",
        format_number( $hits->total_hits ),
        $hits->total_hits == 1 ? "" : "s",
        1_000 * tv_interval( $t0, [gettimeofday()] );

    while ( my $hit = $hits->next ) {
        printf "%12d -> %s\n", $hit->{id}, $hit->{content};
    }
    print "\nQuery: ";
}

####

# Sample session:
#
# Searching 126,871,745 records.
# Query (q to quit): ohai
#
# Matched 0 records in 1.33 milliseconds
#
# Query: taco
#
# Matched 0 records in 0.30 milliseconds
#
# Query: dingo
#
# Matched 12,498 records in 17.69 milliseconds
#     79136688 -> incandescency;scratchiness;ungnarred;dingo;desmachymatous;verderer
#     78453332 -> dingo;verderer;incandescency;ungnarred;coinsurance;scratchiness;desmachymatous
#     78367042 -> verderer;ungnarred;incandescency;dingo;desmachymatous;scratchiness
#
# Query: 78311109
#
# Matched 1 record in 80.07 milliseconds
#     78311109 -> revealing;sulfocarbimide;Darwinize;reproclamation;intermedial;Cinclidae
#
# Query: perl
#
# Matched 12,511 records in 34.92 milliseconds
#     78437383 -> unnoticeableness;radiectomy;brogger;rumorer;oreillet;befan;perle
#     59450674 -> perle;Avery;autoxidizability;tidewaiter;radiectomy;filthily
#     59125043 -> oreillet;perle;Avery;autoxidizability;filthily;tidewaiter;radiectomy
#
# Query: pollen OR bee
#
# Matched 61,997 records in 27.14 milliseconds
#    127851379 -> sley;Phalaris;pollen;brasque;snuffle;excalate;operculigenous
#     79011524 -> rave;uliginose;gibel;pollened;uncomprised;salve;topognosia
#     78853424 -> topognosia;gibel;rave;uncomprised;pollened;uliginose;salve
#
# Query: pollen
#
# Matched 24,674 records in 1.58 milliseconds
#    127851379 -> sley;Phalaris;pollen;brasque;snuffle;excalate;operculigenous
#     79011524 -> rave;uliginose;gibel;pollened;uncomprised;salve;topognosia
#     78853424 -> topognosia;gibel;rave;uncomprised;pollened;uliginose;salve
#
# Query: pollen AND bee
#
# Matched 0 records in 21.61 milliseconds