#!/usr/bin/perl #This script matches words that are repeated within <num> words and ma +rks them and prints out the original text with repeated words marked. use strict; use warnings; print "Enter the name of the file to check: "; my $inputfile = <>; #Define how many words between matching a repeat. my $numwords = 40; open(TEXTFILE, $inputfile); my @importedtextfile = <TEXTFILE>; close(TEXTFILE); my $counter = 0; my @words; #Split the sentences imported. foreach (@importedtextfile) { my @tempsplit = split(/\s+/, $_); foreach(@tempsplit) { push @words, $_; } } my @filteredlist = @words; my $wordslength = scalar(@words); $counter = 0; foreach(@words) { my $i = 0; my $currentword = $_; $currentword =~ s/[\,\.]//g; #Start counting one word "right" of the one you are trying to matc +h. my $startposition = $counter + 1; #Check <num> words and make sure you don't check empty strings in +array. while ($i < $numwords and $counter + $i < $wordslength) { if ($counter + $i + 1 < $wordslength) { my $matchword = $words[$startposition]; $matchword =~ s/[\,\.]//g; #Match and replace in new array. if ($currentword =~ /\b$matchword\b/i) { $filteredlist[$startposition] = "*".$words[$startposition] +."*"; } } ++$startposition; ++$i; } ++$counter; } my $printedlist = join(" ", @filteredlist); print "$printedlist\n";
This script reads a specified text-file from input and then checks for reoccurring words within specified numbers of words.
As an example. To be or not to be. Would result in "To be or not *to* be." at first pass, and second "To be or not *to* *be*.
The printout though does not preserve the original formatting of the text with line-breaks and such. I'm wondering if that is possible somehow, i guess it's all related to the split or the join.
In reply to Preserve original text formatting. by larsb
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |