#!/usr/bin/perl-l.-l../perllib
## PERL CODE TO DETECT PER CENT MATCHES BETWEEN TWO TEXTS(Original & matching):
## Test files:
## Original text: Poet Blake had a milky white cat.
## Matching text: Poet Blake had a white cat.
use warnings;
use strict;
## Input:
my $original_text="Poet Blake had a milky white cat.";
my $new="Poet Blake had a white cat."; # Line 10
## Line 11 ????????
### Code starts here:
my $text=Text::Plagiarized->new; # Line 13
$text->original($original_text);
foreach my $comparison (my @comparison_texts) {
$text->comparison($comparison);
$text->analyze;
print $text->percent, $/; # percent of matching sentences
if ($text->percent > my $some_threshold) { # Line 19
[my $sentence,my $possible_match]
print Dumper($text->matches);
}
}
exit; # Line 24
####
package Text::Plagiarized;
$REVISION = '$Id: Plagiarized.pm,v 1.0 2003/07/13 19:15:57 ovid Exp $';
$VERSION = '0.01';
use 5.006;
use strict;
use warnings;
use String::Approx qw/amatch/;
use Text::Sentence qw/split_sentences/;
sub new {
my $class = shift;
my $self = bless {
original => {},
comparison => {},
matches => [],
threshold => 80,
} => $class;
}
sub original {
my ($self, $text) = @_;
local $_ = $text;
$self->{original} = {
text=> $text,
sentences=> [split_sentences($text)],
};
return $self;
}
sub comparison {
my ($self, $text)=@_;
local $_ = $text;
$self->{comparison} = {
text=> $text,
sentences=> [split_sentences($text)],
};
return $self;
}
my %percentage = map { $_ => 1 } 0 .. 100; # wow. This is a cheap hack
sub threshold {
my $self = shift;
if (@_) {
my $num = shift;
unless (exists $percentage{$num}) {
require Carp;
Carp::croak("threshold must be an integer between 0 and 100, inclusive");
}
$self->{threshold} = 100 - $num;
}
$self->{threshold};
}
sub analyze {
my $self= shift;
my @sentences;
my $threshold= $self->threshold;
foreach my $sentence1 (@{$self->{original}{sentences}}) {
foreach my $sentence2 (@{$self->{comparison}{sentences}}) {
my ($hash1, $hash2) = _hash($sentence1, $sentence2);
if ($hash1 eq $hash2 || amatch($hash1, ["$threshold%"], $hash2)) {
push @sentences => [$sentence1 => $sentence2];
last;
}
}
}
$self->{matches}= \@sentences;
}
sub matches { shift->{matches} }
sub percent {
my $self= shift;
my $precision = shift || 0;
my $matches= @{$self->matches};
my $sentences= @{$self->{original}{sentences}};
sprintf "%.${precision}f" => ($matches/$sentences)*100;
}
# starts to break down if we have more than 26 different words
# use Unicode characters?
# stop words?
# memoize this
sub _hash {
my @string = map lc $_ =>@_;
s/[^[:alnum:][:space:]]//g foreach @string;
s/[[:space:]]+/ /g foreach @string;
my %words;
my $letter = 'a';
s/(\S+)/
unless (exists $words{$1}) {
$words{$1}=$letter;
$letter++;
}
$words{$1}/eg foreach @string;
s/ //g foreach @string;
return @string;
}
1;
####
C:\Users\x>cd desktop
C:\Users\x\Desktop>match.pl
syntax error at C:\Users\x\Desktop\match.pl line 21, near "print"
Execution of C:\Users\x\Desktop\match.pl aborted due to compilation errors.
C:\Users\x\Desktop>