| Category: | Text |
| Author/Contact Info | Patrick van Venetien / patrick@fictional.net |
| Description: | Perl module for indexing text. It does not read files itself, it just indexes text you feed it. Making it a true indexer for various usage. It includes the indexing engine and a search engine. It uses Search::Dict The search engine supports a simple query language and the code includes a query parser supporting nested queries like (perl and monk (not china)) Installation: put it in (lib/)Search/ usage: use Search::Findex; Gonna try to add it to CPAN (my first public Perl contribution to the community)
* Org post: Aug 23, 2002 License: GNU |
#
# Document Index and search engine
#
# Todo
# deletemarking ipv realtime delete
# resultcache per zoekwoord
# searchresult op date
package Findex;
use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
use Search::Dict;
require Exporter;
$VERSION = '1.00';
@ISA = qw(Exporter AutoLoader);
@EXPORT = qw(
);
#
# Constructor.
#
sub new {
#
# $self is an anonymous hash.
#
my $self = {};
#
# Initialize data for this object.
#
$self->{INDEXDIR} = 'myindex';
$self->{INDEX} = undef;
$self->{INDEXSEPERATOR} = chr(149);
$self->{CHANGEDINDEX} = undef;
$self->{OPTIMIZETIME} = time; # see if index was changed
$self->{SCORE}=undef;
$self->{HITWORDS}=undef;
# Words no use to index
$self->{STOPWORDS}="\w|aan|bij|de|den|der|dit|een|en|haar|hem|hen|
+het|hij|ik|in|je|maar|of|op|over|van|voor";
$self->{SPECIALCHARS}="üéâäàçêëèïîìÄÅÉæÆôöòûùÿÖÜáíóúñÑãõØøœŒÀÃÕ".c
+hr(192).chr(193)."ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛܧßàßâãäåµçèéêëìíîïðòóôõûü
+";
$self->{LINKCHARS}='\-\'\`';
#
# Let the object know its class.
#
bless($self);
return $self;
}
#
# Destructor.
#
sub DESTROY {
# print "Argh. Life was sweet.\n";
}
#
# Data-access methods.
# You can pass a parameter, in which case
# the attribute gets set to the new value.
# If you skip the parameter, the value is
# returned.
#
sub indexdir {
#
# Extract first parameter--the object ref.
#
my $self = shift;
#
# If there are any arguments left, then set value.
#
if (@_) {
$self->{INDEXDIR} = shift;
}
mkdir $self->{INDEXDIR} if ! -e $self->{INDEXDIR};
return $self->{INDEXDIR};
}
sub getnewfragname {
my $self = shift;
my $indexname = shift;
my $fragnr = "0";
return if !$indexname;
$indexname = $self->{INDEXDIR}."/$indexname";
while(-e $indexname."-$fragnr.ndx"){
$fragnr ++;
}
return $indexname."-$fragnr.ndx";
}
sub updateindex {
my $self = shift;
my ($fragdx,$key) = "";
my $schars=$self->{SPECIALCHARS};
my $seperator=$self->{INDEXSEPERATOR};
$self->{CHANGEDINDEX}=undef;
# print "Updating index\n";
mkdir $self->{INDEXDIR} if ! -d $self->{INDEXDIR};
foreach (keys %{$self->{INDEX}}){
my $sortval = "";
my %sortthis = undef;
foreach(split(/\n/,$self->{INDEX}{$_})){
$sortthis{"$1"}.=$2 if /^([^ \?]+)\s*\?(.*)$/s;
}
open(INDEX,">".$self->getnewfragname($_));
foreach $sortval (sort keys %sortthis){
next if ($sortval=~/^\s*$/ || $sortthis{$sortval} =~/^\s*$
+/);
$sortval." " if $sortval=~/^\d+$/;
print INDEX $sortval."\?".$sortthis{$sortval}."\n";
}
close(INDEX);
$self->{CHANGEDINDEX}{"$_-$fragdx.ndx"}=1;
}
# print "Sorting id's and file\n";
$self->sortindex("idindex-all.ndx");
$self->sortindex("fileindex-all.ndx");
$self->{INDEX}=undef;
return;
}
sub updatedoc {
my $self = shift;
my $string = shift; # Unique identifier
my $date = shift;
my $data = shift;
my $docdata = shift;
$self->removedoc($string);
my $docid = $self->adddoc($string,$date,$data);
$self->indexdoc($docdata,$docid);
return $docid;
}
#
# Index a document - indexdoc ( document, docid/docstring)
#
sub indexdoc {
my $self = shift;
my $document = lc(shift);
my $docid = shift;
my $indexname = "";
my (@words,$word, %sword) = "";
my $schars=$self->{SPECIALCHARS};
my $stopwords = $self->{STOPWORDS};
my $linkchars = $self->{LINKCHARS};
$document=~s/([\w$schars]+)[$linkchars]([\w$schars]+)/$1 $2 $1$2/i
+gs; # create 3 words if linked words
foreach $word (split(/[^\w$schars]+/s,$document)){
# print $word."\n" if $word =~/[$schars]/igs;
# foreach $word (split(/[\s\.\{\}\[\]\(\)\+\,\/\\\#\!\?\"\'\:\;\&\<
+\>\`\~\*\=]+/s,$document)){
next if $sword{$word};
next if $word=~/^($stopwords|[a-z0-9])$/i;
next if ! ($indexname = $self->getindexbasename($word));
$sword{$word}++;
$self->{INDEX}{$indexname}.=$word."\?".$self->{INDEXSEPERATOR}
+.$docid."\n";
}
return $docid;
}
# Sort the given index
sub sortindex {
my $self = shift;
my $indexfile = shift;
my $seperator = $self->{INDEXSEPERATOR};
my ($sortval,@sortindex) = "";
#print "$indexfile\n";
return if $indexfile !~ /\.ndx$/i;
#print "Optimize index $indexfile\n";
open(INDEX,$self->{INDEXDIR}."/$indexfile") || return; # Read inde
+x
@sortindex=<INDEX>;
close(INDEX);
# print "Sorting $indexfile\n";
open(INDEX,">".$self->{INDEXDIR}."/$indexfile"); # Read index
foreach (sort @sortindex){
print INDEX $_;
}
close(INDEX);
}
sub optimizeall {
my $self = shift;
my ($baseindex,@indexes,@indexdata) = "";
$self->{CHANGEDINDEX}=undef;
# join fragmented indexes
opendir(INDEXES,$self->{INDEXDIR});
@indexes=grep(!/\-all/,readdir(INDEXES));
closedir(INDEXES);
foreach(@indexes){
$baseindex = "";
$baseindex=$1 if /^([\d]+)\-/;
next if !$baseindex;
# print "Joining $baseindex with $_\n";
open(INDEX,$self->{INDEXDIR}."/$_") || next;
open(TINDEX,">>".$self->{INDEXDIR}."/$baseindex\-all.ndx");
@indexdata=<INDEX>;
print TINDEX @indexdata;
close(INDEX);
unlink $self->{INDEXDIR}."/$_";
}
$self->optimizeindex;
$self->{OPTIMIZETIME}=time;
}
sub optimizeindex {
my $self = shift;
my $seperator = $self->{INDEXSEPERATOR};
my ($indexfile,$indexfile,$sortval,@sortindex, @indexfiles) = "";
@indexfiles=(keys %{$self->{CHANGEDINDEX}});
$self->{CHANGEDINDEX}=undef;
if(!@indexfiles){
opendir(INDEXDIR,$self->{INDEXDIR});
@indexfiles=readdir(INDEXDIR);
closedir(INDEXDIR);
}
foreach $indexfile (@indexfiles){
next if $indexfile !~ /\.ndx$/;
next if $indexfile!~/^([\d]+|all)\-/;
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,$mtime
+,$ctime,$blksize,$blocks) = stat($self->{INDEXDIR}."/$indexfile");
# print "$indexfile $mtime -> ".$self->{OPTIMIZETIME}."\n";
next if $mtime < ($self->{OPTIMIZETIME}-5);
# print "Optimize index $indexfile\n";
open(INDEX,$self->{INDEXDIR}."/$indexfile"); # Read index
@sortindex=<INDEX>;
close(INDEX);
chomp(@sortindex);
my %sortthis = undef;
foreach(@sortindex){
$sortthis{"$1"}.=$2 if /^([^ \?]+)\s*\?(.+)$/s;
}
open(INDEX,">".$self->{INDEXDIR}."/$indexfile");
foreach $sortval (sort keys %sortthis){
next if ($sortval=~/^\s*$/ || $sortthis{$sortval} =~/^\s*$
+/);
my %svalues = undef;
foreach ($sortthis{$sortval}=~ /($seperator[^$seperator]*)
+/gs){
$svalues{$_}=1;
}
$sortthis{$sortval}="";
foreach(keys %svalues){
$sortthis{$sortval}.=$_ if $sortthis{$sortval}!~/$_/ ;
}
print INDEX $sortval."\?".$sortthis{$sortval}."\n" if $sor
+tthis{$sortval};
}
close(INDEX);
}
$self->{OPTIMIZETIME} = time; # see if index was changed
$self->removedocs;
}
#
# remove a document from the index - removedoc ( docid )
#
sub removedoc {
my $self = shift;
my $docid = shift;
open(REMOVEDOC,">>".$self->{INDEXDIR}."/.delete");
print REMOVEDOC $self->getid($docid)."\n";
close(REMOVEDOC);
}
sub removedocs {
my $self = shift;
my $sepchar = $self->{INDEXSEPERATOR};
my ($found,@indexdata) = "";
open(DELETEDOCS,$self->{INDEXDIR}."/.delete");
my @Adeleteids= <DELETEDOCS>;
close(DELETEDOCS);
chomp(@Adeleteids);
if(scalar(@Adeleteids)){
my $deleteids = join('|',@Adeleteids);
$deleteids=~s/\.delete//igs;
# print "Removing $deleteids\n";
opendir(INDEXES,$self->{INDEXDIR});
foreach(readdir INDEXES){
next if !/\.ndx/;
open(INDEX,$self->{INDEXDIR}."/$_");
@indexdata=<INDEX>;
close(INDEX);
$found=0;
foreach(@indexdata){
my $orgline=$_;
if(/$sepchar($deleteids)[$sepchar\n\r]/){
$found=1;
$_=~s/$sepchar($deleteids)([$sepchar\n\r])/$2/g;
} elsif(/^[^\?]+\?($deleteids)\n$/){
$found=1;
$_=~s/^[^\?]+\?($deleteids)\n$//g;
} elsif(/^($deleteids)\s*\?.*$/){
$found=1;
$_=~s/^($deleteids)\s*\?.*$//g;
}
$_="" if /^[\s\n\r]*$/ || /[^\?]+\?[\s\n\r]*$/;
}
if($found){
open(INDEX,">".$self->{INDEXDIR}."/$_");
print INDEX @indexdata;
close(INDEX);
}
}
closedir(INDEXES);
unlink($self->{INDEXDIR}."/.delete");
}
return 1;
}
sub getiddata {
my $self = shift;
my $id = shift;
my $foundid = undef;
open(FILE,$self->{INDEXDIR}."/idindex-all.ndx"); # || print "Error
+ opening ID Index.\n";
my $newpos=look *FILE, $id, 0, 0;
$foundid = <FILE>;
close(FILE);
chomp($foundid);
if($foundid =~s/^$id \?//){
return split(/$self->{INDEXSEPERATOR}/,$foundid);
} else {
return 0;
}
}
# idgenerator ( string,date, data);
sub getid {
my $self = shift;
my $string = shift;
my $id = undef;
my $foundid = undef;
open(FILE,$self->{INDEXDIR}."/fileindex-all.ndx");# || print "Erro
+r opening ID Index.\n";
my $newpos=look *FILE, $string, 0, 0;
$foundid = <FILE>;
close(FILE);
# print "Searched $string : Found : $foundid\n";
if($foundid =~/^$string\?(\d+)$/){
return $1;
}
return;
}
sub adddoc {
my $self = shift;
my $string = shift;
my $date = shift;
my $data = shift;
my $id = undef;
my $id = $self->generateid;
$data=~s/[\r]//igs if $data=~/\n/;
$data=~s/[\n\r]/\\n/igs;
open(FILE,">>".$self->{INDEXDIR}."/idindex-all.ndx");# || print "E
+rror writing new ID.\n";
print FILE "$id \?$string".$self->{INDEXSEPERATOR}.$date.$self->{I
+NDEXSEPERATOR}.$data."\n";
close(FILE);
open(FILE,">>".$self->{INDEXDIR}."/fileindex-all.ndx") || print "E
+rror writing new ID.\n";
print FILE "$string\?$id\n";
close(FILE);
$self->{CHANGEDINDEX}{"idindex-all.ndx"}++;
$self->{CHANGEDINDEX}{"fileindex-all.ndx"}++;
return $id;
}
# generate uniqueid
sub generateid {
my $self = shift;
my $id=0;
open(LASTID,$self->{INDEXDIR}."/lastid") || print "Error generatin
+g id\n";
$id=<LASTID> || 1;
close(LASTID);
open(LASTID,">".$self->{INDEXDIR}."/lastid");
print LASTID ($id+1);
close(LASTID);
# $id=~s/[^\w\d]//igs;
return $id;
}
# determine indexname from word
sub getindexbasename {
my $self = shift;
my $indexname = shift;
my $schars=$self->{SPECIALCHARS};
# if($indexname=~/^([\w$schars])([\w$schars])/){
# $indexname = ord($1).ord($2);
if($indexname=~/^([\w$schars])/){
$indexname = ord($1);
return $indexname;
} else {
return 0;
}
}
# determine indexnames from word returns list
sub getindexnames {
my $self = shift;
my $indexname = shift;
my $schars=$self->{SPECIALCHARS};
my @indexes = "";
$indexname=$self->getindexbasename($indexname);
return 0 if !$indexname;
my $fragnr = "0";
while(-e $self->{INDEXDIR}."/$indexname-$fragnr.ndx"){
push(@indexes,"$indexname-$fragnr.ndx");
$fragnr++;
}
push(@indexes,"$indexname-all.ndx") if -e $self->{INDEXDIR}."/$ind
+exname-all.ndx";
return @indexes;
}
sub search {
my $self = shift;
my $query = shift;
my (@words, $seekword, $parsedseekword, $orgseekword, $foundword)
+= "";
my $wordsquery = $query;
my $indexname = "";
my $seperator = $self->{INDEXSEPERATOR};
my $schars=$self->{SPECIALCHARS};
my (@results, $fragdx,%ids, %cids, %foundwords, $id, %hitwords) =
+undef;
$wordsquery=~s/\s(and|or|not)\s/ /igs;
@words=split(/[\(\)\s]+/,$wordsquery);
foreach $seekword (@words){
%cids = undef;
$orgseekword = $seekword;
$parsedseekword= $seekword;
$parsedseekword=~s/\*.*$/\.\*/;
$seekword=~s/\*.*$//;
foreach $indexname ($self->getindexnames(lc($seekword))){
next if !$indexname;
open(FILE,$self->{INDEXDIR}."/$indexname") || print "error
+ opening Index $indexname for $seekword\n";
my $newpos=look *FILE, $seekword, 0, 0;
$foundword=<FILE>;
while(){
chomp $foundword;
last if $foundword !~/^$seekword/i;
if($foundword =~/^$parsedseekword\s*\?/i){
my $ffoundword=$1 if $foundword=~/^([^\?\s]+)\?/;
foreach ($foundword=~ /[$seperator]([^$seperator\n
+\r\s]+)/gs){
$ids{$_}++;
$foundwords{$_}.=";$orgseekword " if !$cids{$_
+}; # maak gevonden woordenlijst voor dit artikel
$hitwords{$_}.=";$ffoundword";
$cids{$_}++;
}
}
last if eof(FILE);
$foundword=<FILE>;
}
close(FILE);
}
}
$self->{SCORE}=undef;
$self->{HITWORDS}=undef;
undef @results;
for $id (reverse sort {$ids{$a} <=> $ids{$b}} keys %ids) {
next if !$self->ParseQuery($query,$foundwords{$id});
push(@results,$id);
$self->{HITWORDS}{$id}=$hitwords{$id};
$self->{SCORE}{$id}=$ids{$id};
}
return(@results);
}
# "(test and bertje) or (test or (quasi and blah) or blah)" , " Search
+ this text"
sub ParseQuery {
my $self = shift;
my $query = "(@_[0])";
my $search = ";@_[1];";
my $subq = "";
my $result = 0;
# despace
$query=~s/^\s+//g;
$query=~s/\s+$//g;
$query=~s/\(\s*/\(/g;
$query=~s/\s*\)/\)/g;
$query=~s/\s+/ /g;
# Add ()
$query =~ s/(\W)([^\s\)\(]+)\s(and|or|not)\s([^\s\)\(]+)(\W)/$1($2
+ $3 $4)$5/gs;
while($query=~/\(([^\(\)]*)\)/){
$subq=$1;
if($subq =~ /^([^\s]+)\s+(and|or|not)\s+([^\s]+)$/i){
if($2 eq 'not'){
$result="$1 and ".$self->Logics($2,"$3",$search);
} else {
$result=$self->Logics($2,"$1\;$3",$search);
}
} elsif($subq =~ /^([^\s]+)$/i){
$result=$self->Logics("AND","$1",$search);
} else {
$result = $subq;
}
$query=~s/\([^\(\)]*\)/$result/;
$query =~ s/(\W)([^\s\)\(]+)\s(and|or|not)\s([^\s\)\(]+)(\W)/$
+1($2 $3 $4)$5/gs;
}
return $query;
}
# ( "[AND/OR/NOT]" , "test;blah", "test;words;blah")
# return 1 if true, else 0
sub Logics {
my $self = shift;
my $term = uc(shift);
my @words = split(/[\s\;]+/,shift);
my $awords = ";".join(';',split(/[\s\;]+/,shift)).";";
my ($lcount, $twords) ="";
# print "Term: $term - Words: @words - Aword: $awords\n";
my $schars = $self->{SPECIALCHARS};
$awords=~s/[^\w$schars]/\;/igs;
$lcount=0;
if($term eq 'AND'){
foreach(@words){
$twords=$awords;
if($_ eq '1'){
$lcount++;
} elsif($_ eq '0'){
} elsif($twords =~/\;$_\;/igs){
$lcount++;
} else {
#print "\;$_\; ! $awords\n";
}
}
if($lcount eq scalar(@words)){
return 1;
} else {
return 0;
}
} elsif($term eq 'OR'){
$twords=$awords;
foreach(@words){
if($_ eq '1'){
$lcount++;
} elsif($_ eq '0'){
} elsif($twords =~/\;$_\;/igs){
$lcount++;
}
}
if($lcount>0){
return 1;
} else {
return 0;
}
} elsif($term eq 'NOT'){
$twords=$awords;
foreach(@words){
if($_ eq '1'){
$lcount++;
} elsif($_ eq '0'){
} elsif($twords =~/\;$_\;/igs){
$lcount++;
}
}
if($lcount>0){
return 0;
} else {
return 1;
}
}
}
1;
__END__
=head1 NAME
Search::Findex Text indexer and searcher
=head1 SYNOPSIS
use Search::Findex;
=head1 DESCRIPTION
This module creates a searchable-index. It can index any kind of t
+extual data. It's
reasonable fast, can index tenthousands of documents a hour.
Further, it searches very fast too, using Search:Dict, retrieving
+document id's within
milliseconds.
Tests with 4000 documents (avg 4kbyte size) were indexed within 10
+ minutes and multiple searches showed
results within a second, with performance of +/- 14 docs/sec on a
+Cel500 (time incl. writing index).
Multiple queries searches on this 4000 document index between 0.01
+ and 1.2 seconds depending on
the complexity of the query.
# Object creation
use Findex;
my $myindex = new Findex;
# create a directory ./myindex where the indexes will be stored
my $myindex->indexdir("myindex");
# Register a filename/url with date and description
my $docid = $myindex->getid($filename,-M $filename ,"File descript
+ion");
# Index filedata
$myindex->indexdoc($filedata,$docid);
# Register a second filename/url with date and description
my $docid2 = $myindex->getid($filename2,-M $filename2 ,"File descr
+iption2");
# Index the other filedata
$myindex->indexdoc($filedata2,$docid2);
# Write the new index, always after finishing indexing, advisable
+every x-hundred indexed document
$myindex->updateindex;
# Execute AND OR NOT query
my @results = $myindex->search("gates and (microsoft or (bill not
+clinton))");
# Print the score for result 0
print $myindex->{SCORE}{@results[0]};
# Print the words that were found for result 0
print $myindex->{HITWORDS}{@results[0]};
# Print all field data of each ID.
# ($filename,-M $filename ,"File description2") = getiddata
foreach $id (@results){
foreach $field ($myindex->getiddata($id)){
print "$field\n";
}
}
# Removed document from indexes
$myindex->removedoc($docid);
# optimize all indexes (takes a few seconds)
$myindex->optimizeall;
=head1 Methods
$number = $indexobject->getid($identifierstring, $date, $data);
Generate a unique documentID for this document. If identifierstrin
+g already is
registered it returns nothing.
$indexobject->indexdoc( $documentstring, $documentID);
Index a document and store it's results in a temporary memorybuffe
+r under documentid.
$indexobject->updateindex
Write the temporary memorybuffer with indexdata to the index files
$indexobject->optimizeall
Updateindex creates new indexfiles at each update. Optimizeall mer
+ges indexfiles to
single larger files that speed up the searching a bit and optimize
+ filesystem usage.
$indexobject->removedoc( $number)
Removes documentID from all indexes.
@array = $indexobject->search( $stringquery)
Execute a logical query on the index with possible uses of and or
+not. Default the
results are returned sorted at score.
=head1 Variables
- Usage variables
$indexobject->{INDEXDIR}
The directory where the index is or should be located. Needs to be
+ set before indexing
and/or searching.
$indexobject->{SCORE}
A hash in the form $indexobject->{SCORE}{$id} that shows the score
+ for an document id
after a search.
$indexobject->{HITWORDS}
A hash in the form $indexobject->{HITWORDS}{$id} that shows the wo
+rds that search found
for an document id after a search.
- Language dependend variables
$indexobject->{STOPWORDS}
A list of words to be ignored while indexing (default Dutch words)
$indexobject->{SPECIALCHARS}
A list of special characters that beside \w need to be indexed.
$indexobject->{LINKCHARS}
A list of characters that are used to link words. Linked words wil
+l be indexed as
3 seperate words.
Example: "foo-bar" will be indexed as foo, bar and foobar.
- Internal variables
$indexobject->{INDEX}
An internal hash with index data that still needs to be written (d
+one with updateindex)
$indexobject->{INDEXSEPERATOR}
The seperator to use within index data files.
=head1 AUTHOR
Patrick van Venetien - www.zojoh.com / www.fictional.net
=head1 COPYRIGHT
Copyright 2002 Patrick van Venetien. All rights reserved.
=head1 SEE ALSO
perl(1).
=cut
|
|
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Text indexer/search engine
by zentara (Cardinal) on Aug 23, 2002 at 15:36 UTC | |
by Vennis (Pilgrim) on Aug 26, 2002 at 10:10 UTC |