#!/usr/bin/env perl
use 5.014;
use strictures;
use Lucy;
use Time::HiRes "gettimeofday", "tv_interval";
# Directory holding the on-disk Lucy index.
my $index = "./lucy.index";

# Field types: 'id' is a StringType, 'content' is a FullTextType that is
# analyzed with the English EasyAnalyzer.
my $analyzer = Lucy::Analysis::EasyAnalyzer->new( language => 'en' );
my %type_for = (
    id      => Lucy::Plan::StringType->new(),
    content => Lucy::Plan::FullTextType->new( analyzer => $analyzer, ),
);

# Register the two fields on a fresh schema, 'id' first and then 'content'.
my $schema = Lucy::Plan::Schema->new;
for my $field (qw( id content )) {
    $schema->spec_field( name => $field, type => $type_for{$field} );
}

# The indexer is opened with both create and truncate set, so every run of
# this script starts again from an empty index.
my $indexer = Lucy::Index::Indexer->new(
    index    => $index,
    schema   => $schema,
    create   => 1,
    truncate => 1,
);
# Feed the records found after __DATA__ into the index.
#
# A record line looks like "ID;TEXT" or "ID;ID2;TEXT", where the ids are runs
# of ASCII digits. One document is added per id present, each carrying the
# same text. Lines that do not have this shape are skipped.
while ( my $line = <DATA> ) {
    my @captures = $line =~ /\A([0-9]+);(?:([0-9]+);)?(.+)/
        or next;

    # The last capture is the text; what remains are the one or two ids
    # (the optional second id is undef when the line has only one).
    my $text = pop @captures;
    for my $id (@captures) {
        next unless defined $id;
        $indexer->add_doc( { id => $id, content => $text } );
    }
}

# Finish the indexing session so the added documents can be searched.
$indexer->commit;
# Interactive query loop: read a query from STDIN, print the matching
# documents, then report the match count and the elapsed time.
# Entering "q" or "quit" (any case) exits; end of input ends the loop.
my $searcher = Lucy::Search::IndexSearcher->new( index => $index );

# Maximum number of hits listed per query. hits() only ever returns
# num_wanted documents (Lucy's default is 10) while total_hits counts every
# match, so the limit is made explicit here and truncation is reported
# instead of silently printing fewer rows than the "Matched" line claims.
my $max_shown = 10;

print "Query (q to quit): ";
while ( my $q = <STDIN> )
{
    chomp $q;
    exit if $q =~ /\Aq(?:uit)?\z/i;

    my $t0   = [gettimeofday()];
    my $hits = $searcher->hits( query => $q, num_wanted => $max_shown, );

    my $shown = 0;
    while ( my $hit = $hits->next )
    {
        # 'id' is a string field: print it as stored (%s) so that leading
        # zeros survive and oversized values cannot trigger numeric warnings.
        printf "%12s -> %s\n", $hit->{id}, $hit->{content};
        $shown++;
    }

    # As before, the timing covers the search plus fetching/printing hits.
    my $elapsed_ms = 1_000 * tv_interval( $t0, [gettimeofday()] );
    my $total      = $hits->total_hits;

    printf "\nMatched %s record%s in %1.1f milliseconds\n",
        $total,
        $total == 1 ? "" : "s",
        $elapsed_ms;
    printf "(showing the first %d)\n", $shown if $shown < $total;

    print "\nQuery: ";
}
__DATA__
Your 200 lines of test data…
moo@cow[51]~>perl pm-1118102
Query (q to quit): archaea
259697659 -> root;cellular organisms;Archaea;Euryarchaeota;Thermoco
+cci;Thermococcales;Thermococcaceae;Pyrococcus;Pyrococcus abyssi;Pyroc
+occus abyssi GE5;
272844 -> root;cellular organisms;Archaea;Euryarchaeota;Thermoco
+cci;Thermococcales;Thermococcaceae;Pyrococcus;Pyrococcus abyssi;Pyroc
+occus abyssi GE5;
289191770 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc
+occi;Methanococcales;Methanocaldococcaceae;Methanocaldococcus;Methano
+caldococcus sp. FS406-22;
644281 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc
+occi;Methanococcales;Methanocaldococcaceae;Methanocaldococcus;Methano
+caldococcus sp. FS406-22;
490653205 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact
+eria;Halobacteriales;Halobacteriaceae;Haloarcula;Haloarcula vallismor
+tis;
28442 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact
+eria;Halobacteriales;Halobacteriaceae;Haloarcula;Haloarcula vallismor
+tis;
493010542 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact
+eria;Halobacteriales;Halobacteriaceae;Natronorubrum;Natronorubrum tib
+etense;
63128 -> root;cellular organisms;Archaea;Euryarchaeota;Halobact
+eria;Halobacteriales;Halobacteriaceae;Natronorubrum;Natronorubrum tib
+etense;
500681908 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc
+occi;Methanococcales;Methanococcaceae;Methanococcus;Methanococcus aeo
+licus;
42879 -> root;cellular organisms;Archaea;Euryarchaeota;Methanoc
+occi;Methanococcales;Methanococcaceae;Methanococcus;Methanococcus aeo
+licus;
Matched 12 records in 0.4 milliseconds
Query: 283552125
283552125 -> root;Viruses;ssRNA viruses;ssRNA negative-strand virus
+es;Orthomyxoviridae;Influenzavirus A;Influenza A virus;H5N1 subtype;I
+nfluenza A virus (A/chicken/Nigeria/08RS848-4/2006(H5N1));
Matched 1 record in 0.2 milliseconds
Now… what are you getting me for my birthday? :P
Reading: Lucy (lots of reading to do). I expect this will maintain a search speed of a few milliseconds with your full data set. It’s designed to handle millions of much larger and more complex documents. Initial indexing will take a while, but you only have to do it once (the script does it on every run to keep the example short and simple). Presentation/splitting of the data content is up to you.
-
Are you posting in the right place? Check out "Where do I post X?" to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags, not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (keep them under 70 characters), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.