comment on

Dear monks,

I am working on a script for data migration purpose, it is about mapping accounts and others fields through 2 mapping tables then logging unmapped items for audit/tracing purposes. It takes ~1h30 for ~1 million rows. I am looking for some advice in order to optimize the script, quick wins or blatant code optimization advice, I am not looking for complete code rewriting (even if that would be very cool).

Thank you for any input and advice!

my $begin_time = time();

#Specify files to map
#my @files = ("dataset1.dat", "dataset2.dat");

foreach $files (@ARGV) {

    #Load file into array (start)
    my $begin_time = time();

    print "Processing file: ".$files."\n";

    open my $handle, '<', $files;
    chomp( my @lines = <$handle> );
    #Exclude
    @lines2 = grep( !/(Final|ConvP&L|ConvBsheet)/, @lines );
    @lines = ();
    @lines = @lines2;
    close $handle;

    #Load file into array (end)
    my $end_time = time();
    printf( "Load file into array: "."%.2f\n", $end_time - $begin_time
+ );

    #Mapping process (start)
    my $begin_time = time();

    # Mapping 1
    open(MAPPINGFILE1, "mapping3.txt") or die print "Error: not found 
+in specified path\n";

    # Mapping 2
    open(MAPPINGFILE2, "mapping4.txt") or die print "Error: not found 
+in specified path\n";

    #$count++ while <MAPPINGFILE1>;

    foreach $line (@lines) {

        #Mapping des charges directes
        seek MAPPINGFILE1, 0, 0;
        while (<MAPPINGFILE1>) {

            chomp();
            # Skip blank lines and comments
            next if /^(\s*(#.*)?)?$/;

            # Split source columns
            @source = split /\t/, $line;

            # Split mapping columns
            @mapping = split /\t/, $_;

            # Account is matching with source
            if ($line =~ m/$mapping[0]/) {

                #Account substitution
                if($mapping[2] eq "") {
                    $line =~ s/$mapping[0]/"Compte cible non défini !"
+/;
                } else {
                    $line =~ s/$mapping[0]/$mapping[2]/;
                }

                #Mapping = target Unit, Alloc_ + Unit source
                if($mapping[3] ne "Unit source") {

                    #Unit substitution
                    if($mapping[3] eq "") {
                        $line =~ s/$source[2]/"Unit cible non définie 
+!"\tALLOC_$source[2]/;
                    } else {
                        $line =~ s/$source[2]/$mapping[3]\tALLOC_$sour
+ce[2]/;
                    }
                    push @lines2, $line;
                    last;
                } else {

                    #Mapping : source Unit, Alloc_ + source Unit
                    $line =~ s/$source[2]/$source[2]\tALLOC_$source[2]
+/;
                    push @lines2, $line;
                    last;
                }
            }
            elsif (eof(MAPPINGFILE1)) {

                #Mapping des charges indirectes
                seek MAPPINGFILE2, 0, 0;
                while (<MAPPINGFILE2>) {

                    chomp();
                    # Skip blank lines and comments
                    next if /^(\s*(#.*)?)?$/;

                    # Split mapping columns (tab)
                    @mapping = split /\t/, $_;

                    # Mapping is matching
                    if ($line =~ m/$mapping[0]/) {

                        $line =~ s/$mapping[0]/$mapping[4]/;
                        @source = split /\t/, $line;
                        $line =~ s/$source[2]/$mapping[2]\tALLOC_$sour
+ce[2]/;
                        push @lines2, $line;
                        last;
                    } elsif (eof(MAPPINGFILE2)) {
                        push @rejects, "Lignes non mappées (Account): 
+"."\t".$line;
                    }
                }
            }
        }
    }
    close MAPPINGFILE1;
    close MAPPINGFILE2;

    #Suppress double quote
    for (@lines2) { s/"//g }
    for (@rejects) { s/"//g }

    #Replace empty by Missing
    for (@lines2) { s/\t(?=\t)/\t#MI/g }

    #Generic mapping
    for (@lines2) { s/A2022Local/ACTUAL;FY22;Working_Central;Input;Loc
+al_YTD/g }
    for (@lines2) { s/A2022AjConsoLocal/ACTUAL;FY22;Working_Central;Ad
+j_Conso;Local_YTD/g }
    for (@lines2) { s/A2022InEur/ACTUAL;FY22;Working_Central;Input;Eur
+_ACT2022_Rate_YTD/g }
    for (@lines2) { s/A2022AjConso/ACTUAL;FY22;Working_Central;Adj_Con
+so;Eur_ACT2022_Rate_YTD/g }
    for (@lines2) { s/A2022TxB22/ACTUAL;FY22;Working_Central;Input;Eur
+_BUD2022_Rate_YTD/g }
    for (@lines2) { s/A2022TxB22AjConso/ACTUAL;FY22;Working_Central;Ad
+j_Conso;Eur_BUD2022_Rate_YTD/g }

    for (@lines2) { s/A2021Local/ACTUAL;FY21;Working_Central;Input;Loc
+al_YTD/g }
    for (@lines2) { s/A2021AjConsoLocal/ACTUAL;FY21;Working_Central;Ad
+j_Conso;Local_YTD/g }
    for (@lines2) { s/A2021InEur/ACTUAL;FY21;Working_Central;Input;Eur
+_ACT2021_Rate_YTD/g }
    for (@lines2) { s/A2021AjConso/ACTUAL;FY21;Working_Central;Adj_Con
+so;Eur_ACT2021_Rate_YTD/g }
    for (@lines2) { s/A2021TxB22/ACTUAL;FY21;Working_Central;Input;Eur
+_BUD2021_Rate_YTD/g }
    for (@lines2) { s/A2021TxB22AjConso/ACTUAL;FY21;Working_Central;Ad
+j_Conso;Eur_BUD2021_Rate_YTD/g }

    for (@lines2) { s/A2020Local/ACTUAL;FY20;Working_Central;Input;Loc
+al_YTD/g }
    for (@lines2) { s/A2020AjConsoLocal/ACTUAL;FY20;Working_Central;Ad
+j_Conso;Local_YTD/g }
    for (@lines2) { s/A2020InEur/ACTUAL;FY20;Working_Central;Input;Eur
+_ACT2020_Rate_YTD/g }
    for (@lines2) { s/A2020AjConso/ACTUAL;FY20;Working_Central;Adj_Con
+so;Eur_ACT2020_Rate_YTD/g }
    for (@lines2) { s/A2020TxB22/ACTUAL;FY20;Working_Central;Input;Eur
+_BUD2020_Rate_YTD/g }
    for (@lines2) { s/A2020TxB22AjConso/ACTUAL;FY20;Working_Central;Ad
+j_Conso;Eur_BUD2020_Rate_YTD/g }

    for (@lines2) { s/B2022Local/BUDGET;FY22;Working_Central;Input;Loc
+al_YTD/g }
    for (@lines2) { s/B2022AjConsoLoc/BUDGET;FY22;Working_Central;Adj_
+Conso;Local_YTD/g }
    for (@lines2) { s/B2022AjTBLocal/BUDGET;FY22;Working_Central;Adj_C
+onso;Local_YTD/g }
    for (@lines2) { s/B2022InEur/BUDGET;FY22;Working_Central;Input;Eur
+_BUD2022_Rate_YTD/g }
    for (@lines2) { s/B2022AjConso/BUDGET;FY22;Working_Central;Adj_Con
+so;Eur_BUD2022_Rate_YTD/g }
    for (@lines2) { s/B2022AjTB/BUDGET;FY22;Working_Central;Adj_Conso;
+Eur_BUD2022_Rate_YTD/g }

    #Add header
    unshift @lines2, "Scenario;Year;Audit;Version;Vision;Entity;Unit;U
+nit_Alloc;Account;Jan;Feb;Mar;Apr;May;Jun;Jul;Aug;Sep;Oct;Nov;Dec";

    #Set common delimiter
    for (@lines2) { s/(\t|;)/\t/g }
    for (@rejects) { s/(\t|;)/\t/g }

    #Mapping process (end)
    my $end_time = time();
    printf( "Mapping process: "."%.2f\n", $end_time - $begin_time );

    #Output (start)
    my $begin_time = time();

    open my $handle2, ">", $files."_output.txt";
    print $handle2 join( "\n", @lines2 );
    close $handle2;

    open my $handle3, ">", $files."_output.err";
    print $handle3 join( "\n", @rejects );
    close $handle3;

    #Output (end)
    my $end_time = time();
    printf( "Output: "."%.2f\n", $end_time - $begin_time );
}
[download]

In reply to Optimization tips by sroux

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.