July 22, 2015. The example was updated to work with MCE in trunk.
My 1st and 2nd attempts got me warmed up and thought faster is possible. The following demo is my 3rd attempt and writes 1 million cells combined in less than 6 seconds from start to finish and 57 seconds for 10 million cells. Running serially takes 15 and 141 seconds for 1 and 10 million cells respectively. Processors have turbo boost for some time. Thus, serial code is likely to run at a faster GHz.
for ( 1 .. 111_111 ) { ... } # 3 * 3, 1 million for ( 1 .. 1_111_111 ) { ... } # 3 * 3, 10 million
Writing text data will slow this down a little due to obtaining the next unique id from the shared strTable object. The internal str_table is shared between worksheets in Excel::Writer::XLSX. Thus, synchronization is necessary as well.
Note: This requires MCE from trunk r957 or later which includes MCE::Shared as MCE 1.700 is not yet released. The logic consumes only the memory necessary. There is never duplicate data from running multiple workers.
#!/usr/bin/env perl use strict; use warnings; # --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- package StrTable; sub new { my ($class, $self) = ( shift, { table => {}, unique => 0 } ); bless $self, $class; } sub table { $_[0]->{table } } sub unique { $_[0]->{unique} } sub value { if (exists $_[0]->{table}->{ $_[1] }) { $_[0]->{table}->{ $_[1] }; } else { $_[0]->{table}->{ $_[1] } = $_[0]->{unique}++; } } # --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- package main; use Archive::Zip (); use File::Copy qw(move); use File::Find (); use File::Temp (); $File::Temp::KEEP_ALL = 1; use Excel::Writer::XLSX; use MCE::Signal qw($tmp_dir); use MCE::Loop 1.699; use MCE::Shared; my $nodeList = [ [ 'AMS' , 'a' ], [ 'APJ' , 'ap' ], [ 'EMEA', 'e' ] ]; my $strTable = mce_share( new StrTable ); my ($center, $format); { # Override _get_shared_string_index to synchronize str_table update +s no warnings 'redefine'; sub Excel::Writer::XLSX::Worksheet::_get_shared_string_index { my ($self, $str) = (shift, shift); if ( not exists ${ $self->{_str_cache} }->{$str} ) { ${ $self->{_str_cache} }->{$str} = $strTable->value($str); } else { ${ $self->{_str_cache} }->{$str}; } } } sub init_wb { my ($wn, $file) = (shift, shift); # Increment $wn by 1 since worksheet xml files begin at 1 $wn++; mkdir "$tmp_dir/$wn"; my $wb = Excel::Writer::XLSX->new($file || "$tmp_dir/$wn/tmp.xlsx") +; $wb->set_tempdir("$tmp_dir/$wn"); # Set workbook properties $wb->set_properties( title => 'Node List', author => 'L_WC demo', comments => 'Node List', ); # Define/add formats to the workbook $center = $wb->add_format(align => 'center'); $format = $wb->add_format(align => 'center', bg_color => 44); # Add worksheets, specify formats for columns/rows for (0 .. @{ $nodeList } - 1) { $wb->add_worksheet($nodeList->[$_][0]); $wb->sheets($_)->set_column(0, 4, 15, $center); } return $wb; } sub close_wb { my $wb = shift; MCE->sync(); # Wait for others to complete, important $wb->{_str_table } = $strTable->table(); # Replace str_table $wb->{_str_total } = 0+$strTable->unique(); # Update str_total $wb->{_str_unique} = 0+$strTable->unique(); # Update str_unique $wb->close(); # Close workbook } sub merge_wb_data { my $wb_file = shift; my ($zip, @pths, @xlsx_files) = (Archive::Zip->new()); local ($@, $!, $^E, $?); # Other files, e.g. table data likely need the same and not done # for this demonstration. Just worksheet files are merged. # I received help by reading _store_workbook inside # Excel::Writer::XLSX::Workbook.pm. # Find worksheet files 2,3,... for my $_num (1 .. @{ $nodeList }) { my $wanted = sub { push @pths, $1 if $File::Find::name =~ /(.*)\/sheet$_num\.xml +/; }; File::Find::find({ wanted => $wanted, untaint => 1, untaint_pattern => qr|^(.+)$ +| }, "$tmp_dir/$_num"); } # Move worksheet files 2,3,... to where worksheet 1 data resides for (0 .. @pths - 1) { unlink $pths[$_]."/../../../tmp.xlsx"; if ($_ > 0) { my $_num = $_ + 1; unlink $pths[0]."/sheet$_num.xml"; move $pths[$_]."/sheet$_num.xml", $pths[0]."/sheet$_num.xml"; } } # Re-zip xlsx files my $wanted = sub { push @xlsx_files, $File::Find::name if -f }; my $temp_dir = $pths[0]."/../../"; my $short_name; File::Find::find({ wanted => $wanted, untaint => 1, untaint_pattern => qr|^(.+)$| }, $temp_dir); for my $file_name (@xlsx_files) { $short_name = $file_name; $short_name =~ s{^\Q$temp_dir\E/?}{}; $zip->addFile($file_name, $short_name); } open my $fh, '>', $wb_file or die "Error opening xlsx file: $!\n"; binmode $fh; $zip->writeToFileHandle($fh); close $fh; } # --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- MCE::Loop::init( max_workers => scalar(@{ $nodeList }), chunk_size => 1, posix_exit => 1, use_threads => 0, ); mce_loop { my ($region, $sql) = ($_->[0], $_->[1]); my ($wb, $ws); # Acquire data from the DB. Each worker must obtain a handle. # The DB logic is similar to running serially. Just the where # clause is likely unique for each region. # Fill worksheet rows/cells if ($region eq 'AMS') { $wb = init_wb(0); $ws = $wb->sheets(0); $ws->write(0, 2, 'foo', $format); for ( 1 .. 111_111 ) { $ws->write(0 + $_, 0, 1000 + $_); $ws->write(1 + $_, 2, 2000 + $_); $ws->write(2 + $_, 4, 3000 + $_); } print "AMS ---- DONE.\n"; } elsif ($region eq 'APJ') { $wb = init_wb(1); $ws = $wb->sheets(1); $ws->write(0, 2, 'bar', $format); for ( 1 .. 111_111 ) { $ws->write(0 + $_, 0, 4000 + $_); $ws->write(1 + $_, 2, 5000 + $_); $ws->write(2 + $_, 4, 6000 + $_); } print "APJ ---- DONE.\n"; } elsif ($region eq 'EMEA') { $wb = init_wb(2); $ws = $wb->sheets(2); $ws->write(0, 2, 'baz', $format); for ( 1 .. 111_111 ) { $ws->write(0 + $_, 0, 7000 + $_); $ws->write(1 + $_, 2, 8000 + $_); $ws->write(2 + $_, 4, 9000 + $_); } print "EMEA ---- DONE.\n"; } close_wb($wb) if $wb; } $nodeList; # Shutdown MCE MCE::Loop::finish(); # Merge data into one workbook merge_wb_data('Node_List.xlsx'); print "Node List is Done.\n";
Kind regards, Mario
|
---|