#!/usr/bin/env perl
# Build an Apache Lucy full-text index from the records under __DATA__, then
# answer interactive queries typed on STDIN until "q"/"quit" or end of input.
#
# Each data line is expected to look like
#     id;text        or        id1;id2;text
# where the ids are runs of digits.  A line carrying two ids is indexed twice,
# once under each id, so the record can be found by either one.
use 5.014;
use strictures;
use Lucy;
use Time::HiRes "gettimeofday", "tv_interval";

my $index = "./lucy.index";

# Upper bound on how many hits are listed per query.  Lucy's hits() returns
# at most 10 documents unless num_wanted says otherwise, so the limit is
# spelled out here and reported to the user instead of being applied silently.
my $max_hits = 10;

# Schema: "id" is an exact-match string field, "content" is analyzed English
# full text.
my $schema       = Lucy::Plan::Schema->new;
my $easyanalyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
my $text_type    = Lucy::Plan::FullTextType->new( analyzer => $easyanalyzer );
my $string_type  = Lucy::Plan::StringType->new();
$schema->spec_field( name => 'id',      type => $string_type );
$schema->spec_field( name => 'content', type => $text_type );

# create + truncate: the index is rebuilt from scratch on every run.
my $indexer = Lucy::Index::Indexer->new(
    schema   => $schema,
    index    => $index,
    create   => 1,
    truncate => 1,
);

while ( my $line = <DATA> ) {
    # Lines that do not start with a numeric id are skipped.
    my ( $id1, $id2maybe, $text )
        = $line =~ /\A([0-9]+);(?:([0-9]+);)?(.+)/
        or next;

    for my $id ( grep defined, $id1, $id2maybe ) {
        $indexer->add_doc( { id => $id, content => $text } );
    }
}
$indexer->commit;

my $searcher = Lucy::Search::IndexSearcher->new( index => $index );

print "Query (q to quit): ";
while ( my $q = <STDIN> ) {
    chomp $q;
    exit if $q =~ /\Aq(uit)?\z/i;

    my $t0   = [ gettimeofday() ];
    my $hits = $searcher->hits(
        query      => $q,
        num_wanted => $max_hits,
    );

    my $shown = 0;
    while ( my $hit = $hits->next ) {
        printf "%12d -> %s\n", $hit->{id}, $hit->{content};
        $shown++;
    }

    # total_hits counts every match, which can exceed the number listed above.
    my $total = $hits->total_hits;
    printf "\nMatched %s record%s in %1.1f milliseconds\n",
        $total,
        $total == 1 ? "" : "s",
        1_000 * tv_interval( $t0, [ gettimeofday() ] );
    print "(showing only the first $shown)\n" if $shown < $total;

    print "\nQuery: ";
}

__DATA__
Your 200 lines of test data…
moo@cow[51]~>perl pm-1118102 Query (q to quit): archaea 259697659 -> root;cellular organisms;Archaea;Euryarchaeota;Thermoco +cci;Thermococcales;Thermococcaceae;Pyrococcus;Pyrococcus abyssi;Pyroc +occus abyssi GE5; 272844 -> root;cellular organisms;Archaea;Euryarchaeota;Thermoco +cci;Thermococcales;Thermococcaceae;Pyrococcus;Pyrococcus abyssi;Pyroc +occus abyssi GE5; 289191770 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc +occi;Methanococcales;Methanocaldococcaceae;Methanocaldococcus;Methano +caldococcus sp. FS406-22; 644281 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc +occi;Methanococcales;Methanocaldococcaceae;Methanocaldococcus;Methano +caldococcus sp. FS406-22; 490653205 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact +eria;Halobacteriales;Halobacteriaceae;Haloarcula;Haloarcula vallismor +tis; 28442 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact +eria;Halobacteriales;Halobacteriaceae;Haloarcula;Haloarcula vallismor +tis; 493010542 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact +eria;Halobacteriales;Halobacteriaceae;Natronorubrum;Natronorubrum tib +etense; 63128 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact +eria;Halobacteriales;Halobacteriaceae;Natronorubrum;Natronorubrum tib +etense; 500681908 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc +occi;Methanococcales;Methanococcaceae;Methanococcus;Methanococcus aeo +licus; 42879 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc +occi;Methanococcales;Methanococcaceae;Methanococcus;Methanococcus aeo +licus; Matched 12 records in 0.4 milliseconds Query: 283552125 283552125 -> root;Viruses;ssRNA viruses;ssRNA negative-strand virus +es;Orthomyxoviridae;Influenzavirus A;Influenza A virus;H5N1 subtype;I +nfluenza A virus (A/chicken/Nigeria/08RS848-4/2006(H5N1)); Matched 1 record in 0.2 milliseconds
Now… what are you getting me for my birthday? :P
Reading: Lucy (lots of reading to do). I expect this will maintain search speed of a few milliseconds with your full data set. It’s designed to handle millions of much larger and more complex documents. Initial indexing will take a while, but you only have to do it once (the script rebuilds the index on every run to keep the example short and simple). How you present or split the data content is up to you.
In reply to Re: Using indexing for faster lookup in large file
by Your Mother
in thread Using indexing for faster lookup in large file
by anli_
| For: | Use: |
| & | `&amp;` |
| < | `&lt;` |
| > | `&gt;` |
| [ | `&#91;` |
| ] | `&#93;` |