#!/usr/bin/perl-l.-l../perllib
## PERL CODE TO DETECT PER CENT MATCHES BETWEEN TWO TEXTS(Original & matching): 
## Test files:
## Original text: Poet Blake had a milky white cat.  
## Matching text: Poet Blake had a white cat.
use warnings;
use strict;
## Input:
  my $original_text="Poet Blake had a milky white cat.";
  my $new="Poet Blake had a white cat.";    # Line 10 
## Line 11   ????????
### Code starts here:
my $text=Text::Plagiarized->new;    # Line 13 
   $text->original($original_text);
     foreach my $comparison (my @comparison_texts) {
     $text->comparison($comparison);
     $text->analyze;
     print $text->percent, $/; # percent of matching sentences
        if ($text->percent > my $some_threshold) { # Line 19  
    [my $sentence,my $possible_match]
    print Dumper($text->matches);
  }
}
exit;   # Line 24


##</code><code>##

package Text::Plagiarized;
$REVISION = '$Id: Plagiarized.pm,v 1.0 2003/07/13 19:15:57 ovid Exp $';
$VERSION  = '0.01';
use 5.006;
use strict;
use warnings;
use String::Approx qw/amatch/;
use Text::Sentence qw/split_sentences/;
sub new {
    my $class = shift;
    my $self = bless {
        original   => {},
        comparison => {},
        matches    => [],
        threshold  => 80,
    } => $class;
}
sub original {
    my ($self, $text) = @_;
    local $_ = $text;
    $self->{original} = {
        text=> $text,
        sentences=> [split_sentences($text)],
    };
    return $self;
}
sub comparison {
    my ($self, $text)=@_;
    local $_ = $text;
    $self->{comparison} = {
        text=> $text,
        sentences=> [split_sentences($text)],
    };
    return $self;
}
my %percentage = map { $_ => 1 } 0 .. 100; # wow.  This is a cheap hack
sub threshold {
    my $self = shift;
    if (@_) {
        my $num = shift;
        unless (exists $percentage{$num}) {
            require Carp;
            Carp::croak("threshold must be an integer between 0 and 100, inclusive");
        }
        $self->{threshold} = 100 - $num;
    }
    $self->{threshold};
}
sub analyze {
    my $self= shift;
    my @sentences;
    my $threshold= $self->threshold;
    foreach my $sentence1 (@{$self->{original}{sentences}}) {
        foreach my $sentence2 (@{$self->{comparison}{sentences}}) {
            my ($hash1, $hash2) = _hash($sentence1, $sentence2);
            if ($hash1 eq $hash2 || amatch($hash1, ["$threshold%"], $hash2)) {
                push @sentences => [$sentence1 => $sentence2];
                last;
            }
        }
    }
    $self->{matches}= \@sentences;
}
sub matches { shift->{matches} }
sub percent {
    my $self= shift;
    my $precision = shift || 0;
    my $matches= @{$self->matches};
    my $sentences= @{$self->{original}{sentences}};
    sprintf "%.${precision}f" => ($matches/$sentences)*100;
}
# starts to break down if we have more than 26 different words
# use Unicode characters?
# stop words?
# memoize this
sub _hash {
    my @string = map lc $_ =>@_;
    s/[^[:alnum:][:space:]]//g foreach @string;
    s/[[:space:]]+/ /g         foreach @string;
    my %words;
    my $letter = 'a';
    s/(\S+)/
        unless (exists $words{$1}) {
            $words{$1}=$letter;
            $letter++;
        }
        $words{$1}/eg foreach @string;
        s/ //g foreach @string;
    return @string;
}
1;

##</code><code>##

C:\Users\x>cd desktop
C:\Users\x\Desktop>match.pl
syntax error at C:\Users\x\Desktop\match.pl line 21, near "print"
Execution of C:\Users\x\Desktop\match.pl aborted due to compilation errors.

C:\Users\x\Desktop>