#!/usr/bin/perl -w
#
# Build a word -> file-id index for every text file found under a directory
# and store it in the MySQL "members" database: one row per indexed file in
# the "library" table and one row per distinct word in the "catalog" table.
#
# Options (collected by Getopt::Long into %opts):
#   --dir=PATH    directory to walk (required; must be an existing directory)
#   --stop=FILE   stop-word file: first token of each line, '#' lines skipped
#   --ignore      lower-case the text before extracting words
#   --type=NAME   value stored with every library row (default TYPE_DEFAULT)
#   --numbers     also index tokens made only of digits
#   --stem        reduce each word with Text::English::stem

use strict;
use warnings;

use DBI;
use File::Find;
use Fcntl;
use Getopt::Long;
use Text::English;

use constant TYPE_DEFAULT => 'article';

my (%opts, @files, $stop_words, $type);

# User input
GetOptions( \%opts, "dir=s", "stop=s", "ignore", "type=s", "numbers", "stem" )
    or die usage();
die usage() unless $opts{dir} && -d $opts{dir};
$opts{type} ||= TYPE_DEFAULT;

# Collect every path below --dir; non-text entries (directories, binaries)
# are filtered out later by the -T test in process_files().
find( sub { push @files, $File::Find::name }, $opts{dir} );

$stop_words = load_stopwords( $opts{stop} ) if $opts{stop};
process_files( \@files, \%opts, $stop_words );

# load_stopwords($file)
#
# Reads the stop-word file and returns a hash reference whose keys are the
# lower-cased first whitespace-delimited token of every line that does not
# start with '#'. Dies if the file does not exist or cannot be opened.
sub load_stopwords {
    my $file = shift;
    my %words;

    die "Can't open stop file: $file\n" unless -e $file;
    open my $info, '<', $file
        or die "Can't open stop file: $file: $!\n";

    while ( my $line = <$info> ) {
        next if $line =~ /^#/;
        $words{ lc $1 } = 1 if $line =~ /(\S+)/;
    }
    close $info;

    return \%words;
}

# process_files(\@files, \%opts, \%stop_words)
#
# Indexes each text file in @files (position in the array is the file id):
#   1. inserts (file, dir, type) into "library" for every text file;
#   2. reads the file paragraph by paragraph, strips <...> tags, and records
#      each word of two or more alphanumerics once per file;
#   3. after all files are read, inserts one (word, "id:id:...") row per word
#      into "catalog".
# Database errors raise exceptions (RaiseError is enabled); an unreadable
# file is fatal.
sub process_files {
    my ( $files, $opts, $stop_words ) = @_;

    # Paragraph-sized records so that tags spanning line breaks are removed.
    local $/ = "\n\n";

    $stop_words ||= {};    # no --stop given: nothing is filtered

    my $type = $opts->{type};
    my $dir  = $opts->{dir};
    my %index;             # word => colon-separated list of file ids

    # NOTE(review): credentials are hard-coded here; consider moving them to
    # a configuration file or the environment.
    my $dbh = DBI->connect(
        "DBI:mysql:host=localhost;database=members",
        "gorillatrades", "kennyber",
        { PrintError => 0, RaiseError => 1 } );

    # Prepared once, executed many times; placeholders let the driver quote
    # file names and words safely.
    my $library_sth = $dbh->prepare("insert into library values (?, ?, ?)");
    my $catalog_sth = $dbh->prepare("insert into catalog values (?, ?)");

    for my $file_id ( 0 .. $#$files ) {
        my $file = $files->[$file_id];
        my %seen_in_file;

        next unless -T $file;

        # Step 1: create the library of files.
        $library_sth->execute( $file, $dir, $type );

        open my $fh, '<', $file
            or die "Cannot open file: $file: $!\n";

        while ( my $para = <$fh> ) {
            $para =~ tr/A-Z/a-z/ if $opts->{ignore};

            # Note this doesn't handle < or > in comments or js
            $para =~ s/<.+?>//gs;

            while ( $para =~ /([a-z\d]{2,})\b/gi ) {
                my $word = $1;

                next if $stop_words->{ lc $word };
                next if $word =~ /^\d+$/ && !$opts->{numbers};

                ($word) = Text::English::stem($word) if $opts->{stem};

                # Record each word only once per file.
                next if $seen_in_file{$word}++;

                $index{$word} .= ":" if exists $index{$word};
                $index{$word} .= $file_id;
            }
        }
        close $fh;
    }

    # Step 2: move the completed index out of the hash and into the database.
    # Done once, after every file is read, so each word gets a single row
    # holding its full file-id list.
    for my $word ( sort keys %index ) {
        $catalog_sth->execute( $word, $index{$word} );
    }

    $dbh->disconnect;
    return;
}

# usage() continues past the end of this chunk; its opening is left untouched.
sub usage { my $usage = <