Corrupted storable string (binary v2.7) at ../../lib/Storable.pm (autosplit into ../../lib/auto/Storable/thaw.al) line 415, at /usr/share/perl5/IPC/Shareable.pm line 545
####
Munged shared memory segment (size exceeded?) at ./dup_finder2.pl line 45
Munged shared memory segment (size exceeded?) at ./dup_finder2.pl line 45
Object #3223600 should have been retrieved already at ../../lib/Storable.pm (autosplit into ../../lib/auto/Storable/thaw.al) line 415, at /usr/share/perl5/IPC/Shareable.pm line 545
Object #3223600 should have been retrieved already at ../../lib/Storable.pm (autosplit into ../../lib/auto/Storable/thaw.al) line 415, at /usr/share/perl5/IPC/Shareable.pm line 545
Corrupted storable string (binary v2.7) at ../../lib/Storable.pm (autosplit into ../../lib/auto/Storable/thaw.al) line 415, at /usr/share/perl5/IPC/Shareable.pm line 545
Corrupted storable string (binary v2.7) at ../../lib/Storable.pm (autosplit into ../../lib/auto/Storable/thaw.al) line 415, at /usr/share/perl5/IPC/Shareable.pm line 545
####
#!/usr/bin/perl
use warnings;
use strict;
use Text::LevenshteinXS qw(distance);
use Benchmark qw(timediff timestr);
use IPC::Shareable qw(:all);
use Parallel::ForkManager;
my $t0 = Benchmark->new();
my %contacts;
my $minimum_score = 80;
my $max_childs = 5;
my $glue = 1978;
my $db = tie %contacts, 'IPC::Shareable', $glue,
{
create => 1,
exclusive => 0,
destroy => 1
}
or die "cannot tie contacts\n";
# :WARNING:21-10-2010:arfreitas: child process must receive this
$SIG{INT} = sub { die "$$ dying\n" };
# :WARNING:21-10-2010:arfreitas: must get registries AFTER creating the shared memory tie
get_db_data();
my @control = keys(%contacts);
print 'Starting processing data with maximum of ', $max_childs, ' childs', "\n";
my $manager = Parallel::ForkManager->new($max_childs);
$manager->set_max_procs($max_childs);
my $max_tries = 3;
foreach my $id (@control) {
$manager->start() and next;
my $tries = 0;
if ( exists( $contacts{$id} ) ) {
while (1) {
if ( $db->shlock(LOCK_EX) ) {
print "child $$ got a lock\n";
# :BUG:21-10-2010:arfreitas: race condition when deleting keys of the tied hash
my $testing = delete( $contacts{$id} );
my %cache = %contacts;
unless ( $db->shunlock() ) {
print "child $$ could not release the lock\n";
}
else {
print "child $$ released the lock\n";
}
validate_contact( $id, $testing, \%cache );
last;
}
else {
print "child $$ can't lock\n";
sleep 1;
$tries++;
last if ( $tries >= $max_tries );
}
}
}
else {
print "child $$: somebody deleted key $id\n";
}
$manager->finish();
}
$manager->wait_all_children();
my $t1 = Benchmark->new();
my $td = timediff( $t1, $t0 );
print "\nThe code took: ", timestr($td), "\n";
sub validate_contact {
my $id = shift;
my $testing = shift;
my $cache_ref = shift;
my $file = 'tmp/processing-' . $id . '.log';
my $source_name = $testing;
my $counter = 0;
foreach my $contact ( keys( %{$cache_ref} ) ) {
my $dest_name = $cache_ref->{$contact};
my $dest_len = length($dest_name);
$dest_len = 1 unless ( $dest_len > 0 );
my $distance = distance( $source_name, $dest_name );
my $score = 100 - ( ( $distance * 100 ) / $dest_len );
#how much equal is the current row with the original one
if ( $score >= $minimum_score ) {
print "$id looks like $contact\n";
}
$counter++;
}
print "child $$ compared id $id with $counter contacts\n";
}
sub get_db_data {
my $file = 'contacts.txt';
open( my $in, '<:utf8', $file ) or die "cannot read $file: $!\n";
while (<$in>) {
chomp;
my @temp = split( /\|/, $_ );
my $id = splice( @temp, 1, 1 );
$contacts{$id} = $temp[0];
}
close($in);
print "OK\n";
return;
}