Hi rajaman,

Hello :) Unfortunately, life is getting shorter and I have learned to skip threads like this one whenever test data is omitted, simply for lack of time. Sorry. That said, the demonstration that follows is untested.

#!/usr/bin/perl

use strict;
use warnings;

use Data::Dumper qw(Dumper);
use re::engine::RE2;
use List::MoreUtils qw(uniq);
use Sort::Naturally qw(nsort);
use MCE;

# This program reads an abstract sentence file and produces
# output with the following format ...

if ($#ARGV != 1) {
    die "usage: $0 <inputfile> <outputfile>\n";
}

my $inputfile1 = $ARGV[0];
my $outputfile = $ARGV[1];

unless (-e $inputfile1) {
    die "Can't open $inputfile1: No such file or directory";
}

# Make gather routine for the manager process. It returns a
# closure block for preserving append-order as if processing
# serially.

my %hashunique;

sub make_gather {
    my ($order_id, %tmp) = (1);

    return sub {
        my ($chunk_id, $hashref) = @_;
        $tmp{$chunk_id} = $hashref;

        while (exists $tmp{$order_id}) {
            $hashref = delete $tmp{$order_id};

            for my $k (keys %{ $hashref }) {
                unless (exists $hashunique{$k}) {
                    $hashunique{$k} = $hashref->{$k};
                } else {
                    $hashunique{$k} = $hashunique{$k}.'|'.$hashref->{$k};
                }
            }

            $order_id++;
        }
    }
}

# The user function for MCE workers. Workers open a file handle to
# a scalar ref due to using MCE option use_slurpio => 1.

sub user_func {
    my ($mce, $slurp_ref, $chunk_id) = @_;
    my %localunique;

    open my $RF, '<', $slurp_ref or die "Can't open chunk: $!";

    # A shared-hash is not necessary. The gist of it all is batching
    # to a local hash. Otherwise, a shared-hash inside a loop involves
    # high IPC overhead.

    local $/ = '';   # blank line, paragraph break

    # in the event worker receives 2 or more records
    while (<$RF>) {
        my @one = split /\n/, $_;
        my ($indexofdashinarray) = grep { $one[$_] =~ /\-\-/ } 0..$#one;

        for my $i (1..$#one) {
            next if $one[$i] =~ /^\-\-$/;

            while ($one[$i] =~ m/(\b)D\*(.*?)\*(.*?)\*D(\b)/g) {
                unless (exists $localunique{"D$2"}) {
                    $localunique{"D$2"} = "$3";
                } else {
                    $localunique{"D$2"} = $localunique{"D$2"}.'|'."$3";
                }
            }
        }
    }

    close $RF;

    # Each worker must call gather one time when preserving order
    # is desired which is the case for this demonstration.

    MCE->gather($chunk_id, \%localunique);
}

# Am using the core MCE API. Workers read the input file directly and
# sequentially, one worker at a time.

my $mce = MCE->new(
    max_workers => 3,
    input_data  => $inputfile1,
    chunk_size  => 2 * 1024 * 1024,   # 2 MiB
    RS          => '',                # important, blank line, paragraph break
    gather      => make_gather(),
    user_func   => \&user_func,
    use_slurpio => 1
);

$mce->run();

# Results.

open my $WF, '>', $outputfile or die "Can't open $outputfile: $!";

foreach my $k (nsort keys %hashunique) {
    $hashunique{$k} = join '|', uniq(split /\|/, $hashunique{$k});
    print {$WF} "$k=>$hashunique{$k}\n";
}

close $WF;
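Because the test data was omitted, below is a small untested sketch that writes input the regex above would accept. The file name sample.txt, the record layout, and the keys and values are all made up for illustration; only the blank-line record separator, the skipped first line of each record, the "--" lines, and the D*key*value*D token shape come from the demonstration.

#!/usr/bin/perl
# Untested sketch: generate sample input for the demonstration above.
# Everything in this file is invented data matching the assumed format.

use strict;
use warnings;

open my $fh, '>', 'sample.txt' or die "Can't open sample.txt: $!";

# Records are separated by a blank line (paragraph mode). The first
# line of each record is skipped by the parser, as are lines
# consisting of '--' only.
print {$fh} <<'EOF';
record 1
a sentence with D*12*alpha*D and D*34*beta*D tokens
--
another sentence with D*12*gamma*D

record 2
more text with D*56*delta*D
EOF

close $fh;

# Running the demonstration (saved here under a hypothetical name)
# against this file,
#
#   perl mce_demo.pl sample.txt out.txt
#
# should produce in out.txt:
#
#   D12=>alpha|gamma
#   D34=>beta
#   D56=>delta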

Regards, Mario


