use strict;
use warnings;
use utf8;
use IO::File;

# Builds a simple line-number index for a text file.
#
# Every line of the input is split on whitespace; each word is stripped of
# punctuation, lower-cased and, unless it is a stop word, recorded together
# with the number of the line it occurs on.  The result is written to the
# index file, one entry per line, in the form:
#
#     word=lineNum,lineNum,...
#
# Usage: perl indexer.pl [input-file [index-file]]
# With no arguments the default locations below are used.
#
# NOTE(review): both files are handled as raw bytes, so lc() only folds
# ASCII letters.  If the texts contain non-ASCII letters an :encoding(...)
# layer is probably wanted - confirm the encoding of the corpus first.

# the file to index (default)
my $defaultInFile = "c:\\temp\\texts\\T100001A.txt";

# the file to store the index information (default)
my $defaultIndexFile = "c:\\temp\\index\\t100001a.index";

# Words that are too common to be worth indexing, kept as a hash so that a
# membership test is a single lookup instead of a scan over the whole list.
my %stopWords = map { $_ => 1 } (
    "the", "a", "an", "of", "and", "on", "in", "by", "with", "at", "he",
    "after", "into", "their", "is", "that", "they", "for", "to", "it",
    "them", "which",
);

# Run the indexer only when this file is executed as a script; when it is
# loaded with require/do the subs below can be used (and tested) on their
# own without touching the file system.
main(@ARGV) unless caller;

# Reads the input file, builds the index and writes it to the index file.
#
# Parameters:
#   $inFile    - path of the text to index (optional, defaults to
#                $defaultInFile)
#   $indexFile - path of the index to write (optional, defaults to
#                $defaultIndexFile)
#
# Dies with a message naming the file if either file cannot be opened or
# the index cannot be written completely.
sub main {
    my ($inFile, $indexFile) = @_;
    $inFile    = $defaultInFile    unless defined $inFile;
    $indexFile = $defaultIndexFile unless defined $indexFile;

    # Open the input first so that a missing input does not truncate an
    # already existing index file.
    my $inFh = IO::File->new($inFile, "r")
        or die "Cannot open '$inFile' for reading: $!\n";
    my $outFh = IO::File->new($indexFile, "w")
        or die "Cannot open '$indexFile' for writing: $!\n";

    my %index;          # word => array ref of line numbers
    my $lineNum = 0;
    while (my $line = <$inFh>) {
        $lineNum++;
        chomp $line;

        # A word occurring several times on one line is recorded once per
        # occurrence, so the line number may repeat in its entry.
        foreach my $word (indexTerms($line)) {
            push @{ $index{$word} }, $lineNum;
        }
    }
    $inFh->close;

    print "done indexing\n";

    # Sorted so that the index file is reproducible from run to run.
    foreach my $key (sort keys %index) {
        print {$outFh} $key, "=", join(',', @{ $index{$key} }), "\n"
            or die "Cannot write to '$indexFile': $!\n";
    }

    # Buffered write errors only surface when the handle is closed.
    $outFh->close
        or die "Cannot close '$indexFile': $!\n";

    return;
}

# Turns one line of text into the list of words to be indexed.
#
# Parameters:
#   $line - a line of text without its trailing newline
#
# Returns the normalized words of the line in their original order, with
# empty entries and stop-listed entries removed.
sub indexTerms {
    my ($line) = @_;

    my @terms;

    # split ' ' splits on runs of whitespace and ignores leading whitespace,
    # so it does not produce empty fields the way split /\s/ does.
    foreach my $word (split ' ', $line) {

        # Strip a trailing comma or full stop, plus brackets, parentheses,
        # semicolons, colons and exclamation marks anywhere in the word.
        $word =~ s/,$|\.$|[\[\]();:!]//g;
        $word = lc $word;

        # A word made up of punctuation only is empty by now.
        next unless removeNullEntries($word);
        next if inStopList($word);

        push @terms, $word;
    }

    return @terms;
}

# Tells whether a word must be left out of the index.
#
# Parameters:
#   $word - the normalized (lower-cased) word
#
# Returns 1 if the word is in the stop list, contains "p." followed by a
# digit (e.g. "p.12"), or contains a run of five or more hyphens;
# returns 0 otherwise.
sub inStopList {
    my ($word) = @_;

    return 0 unless defined $word;
    return 1 if exists $stopWords{$word};
    return 1 if $word =~ /p\.\d+/;
    return 1 if $word =~ /-{5}/;
    return 0;
}

# Tells whether a word is a real entry, i.e. neither undef nor empty.
#
# Parameters:
#   $word - the word to check
#
# Returns 1 for any non-empty string and 0 otherwise.  The length is tested
# rather than the truth value so that the word "0" is kept.
sub removeNullEntries {
    my ($word) = @_;

    return (defined $word && length $word) ? 1 : 0;
}