#!/usr/bin/perl -w
#
# libbuilder.pl - walk a directory tree, build an inverted word index of the
# text files found there, and store it in a MySQL database.
#
#   library table : one row per indexed file (file name, directory, type)
#   catalog table : one row per word; the value is a ':'-separated list of
#                   file ids (positions in the list produced by File::Find)
#
# NOTE(review): the column layout of both tables is not visible from this
# script; the inserts assume 3 columns for library and 2 for catalog, in the
# order used below -- confirm against the schema.

use strict;
use DBI;
use File::Find;
use Fcntl;
use Getopt::Long;
use Text::English;

use constant TYPE_DEFAULT => "article";

my (%opts, @files, $stop_words, $type);

# User input
GetOptions( \%opts,
    "dir=s",
    "cache=s",
    "stop=s",
    "ignore",
    "type=s",
    "numbers",
    "stem",
);

die usage() unless $opts{dir} && -d $opts{dir};

$opts{'type'} ||= TYPE_DEFAULT;

# Get file names and build an array of files.
find( sub { push @files, $File::Find::name }, $opts{dir} );

$stop_words = load_stopwords( $opts{stop} ) if $opts{stop};

process_files( \@files, \%opts, $stop_words );

# load_stopwords($file)
#
# Reads a stop-word file and returns a hash reference whose keys are the
# lower-cased stop words.  Lines starting with '#' are skipped; on every
# other line the first run of non-whitespace characters is taken as the word.
# Dies if the file cannot be opened.
sub load_stopwords {
    my $file  = shift;
    my $words = {};

    open my $info, '<', $file
        or die "Can't open stop file: $file: $!\n";

    while ( my $line = <$info> ) {
        next if $line =~ /^#/;
        $words->{ lc $1 } = 1 if $line =~ /(\S+)/;
    }
    close $info;

    return $words;
}

# process_files(\@files, \%opts, \%stop_words)
#
# Indexes every plain text file in @files and writes the result to the
# database.
#
#   $files      - array ref of path names; a file's position in this list is
#                 used as its file id in the index
#   $opts       - hash ref of command-line options (dir, type, ignore,
#                 numbers, stem are read here)
#   $stop_words - hash ref of words to leave out of the index (may be undef)
#
# Dies on any database error (RaiseError is enabled) or if a file cannot be
# opened.
sub process_files {
    my ( $files, $opts, $stop_words ) = @_;

    $stop_words ||= {};

    # Use the options that were passed in, not the file-scoped %opts.
    my $type = $opts->{type};
    my $dir  = $opts->{dir};
    my %index;

    # Read files one paragraph (blank-line separated chunk) at a time.
    local $/ = "\n\n";

    # Establish database connection.  The original assigned a bare list to
    # $dbh instead of calling DBI->connect.  The space after ';' in the DSN
    # was removed; it appears the driver would otherwise see the key as
    # ' database' and not recognise it -- confirm against DBD::mysql.
    my $dbh = DBI->connect(
        "DBI:mysql:host=localhost;database=blah",
        "blah", "blah",
        { PrintError => 0, RaiseError => 1 },
    );

    # Prepare each statement once and bind values with placeholders, so file
    # names and words are quoted correctly by the driver.
    my $sth_library = $dbh->prepare("insert into library values (?, ?, ?)");
    my $sth_catalog = $dbh->prepare("insert into catalog values (?, ?)");

    for ( my $file_id = 0; $file_id < @$files; $file_id++ ) {
        my $file = $files->[$file_id];
        my %seen_in_file;

        # File::Find also reports directories; index plain text files only.
        next unless -f $file && -T $file;

        #print STDERR "Indexing $file\n";

        # Step 1: create library of files.
        $sth_library->execute( $file, $dir, $type );

        open my $fh, '<', $file
            or die "Cannot open file: $file: $!\n";

        while ( my $para = <$fh> ) {
            $para =~ tr/A-Z/a-z/ if $opts->{ignore};

            # Strip markup.  Note this doesn't handle < or > in comments or js.
            $para =~ s/<.+?>//gs;

            while ( $para =~ /([a-z\d]{2,})\b/gi ) {
                my $word = $1;

                next if $stop_words->{ lc $word };

                # Purely numeric tokens are indexed only with --numbers
                # (the original tested the misspelled key 'number').
                next if $word =~ /^\d+$/ && !$opts->{numbers};

                ($word) = Text::English::stem($word) if $opts->{stem};

                # Record each file id at most once per word.
                next if $seen_in_file{$word}++;

                $index{$word} =
                    ( exists $index{$word} ? "$index{$word}:" : "" )
                    . $file_id;
            }
        }
        close $fh;
    }

    # Step 2: once every file has been read, move the index from the hash
    # table into the database.  (Doing this inside the read loop would insert
    # the whole index again for every paragraph.)
    foreach my $word ( keys %index ) {
        $sth_catalog->execute( $word, $index{$word} );
    }

    $dbh->disconnect;
    return;
}

# usage()
#
# Returns the usage message shown when --dir is missing or is not a
# directory.
# NOTE(review): the original heredoc text was lost; this message is
# reconstructed from the GetOptions specification above.
sub usage {
    my $usage = <<"EOF";
Usage: $0 --dir DIRECTORY [options]

  --dir DIRECTORY   directory tree to index (required)
  --type TYPE       document type stored with each file (default: article)
  --stop FILE       file of stop words to leave out of the index
  --ignore          ignore case (lower-case all text before indexing)
  --numbers         also index purely numeric words
  --stem            reduce words to their stems before indexing
  --cache VALUE     accepted, but not used by this script
EOF
    return $usage;
}

# Diagnostics perl reported for the earlier, broken revision of this script
# (kept for reference; the causes are addressed above):
#
#   Useless use of a constant in void context at libbuilder.pl line 77.
#   Useless use of a constant in void context at libbuilder.pl line 77.
#   Useless use of a constant in void context at libbuilder.pl line 77.
#   Global symbol "$index" requires explicit package name at libbuilder.pl line 109.
#   Global symbol "$index" requires explicit package name at libbuilder.pl line 109.
#   Global symbol "$index" requires explicit package name at libbuilder.pl line 110.
#   Missing right curly or square bracket at libbuilder.pl line 147, at end of line
#   syntax error at libbuilder.pl line 147, at EOF
#   Execution of libbuilder.pl aborted due to compilation errors.