kevsurf has asked for the wisdom of the Perl Monks concerning the following question:

Hello,

I need help with the code below. It all works and the data is streamed into the database, but it takes a long time. I'm dealing with 60,000+ records that are being pulled from a continuous growing text file (that stops after the hour of data is collected). I just want to see if anyone has suggestions on improving my code.

cdlPg.pm
package cdlPg; use lib '/export/home/motodev/bin/'; use DBI; #use DBD::mysql; use DBD::Pg; use Benchmark; use IO::File; use CDMAConfig; sub new { my $class = $_[0]; my $objref = { _mkt => $_[1], _srvr => $_[2], _db => $_[3], _st => $_[4], _et => $_[5], _ed => $_[6], _file => $_[7], _tbl => $_[8], _release=> $_[9], _fstopn => $_[10], }; bless $objref, $class; return $objref; }#end new sub print_me { my ($self) = @_; print "mkt: $self->{_mkt}\n"; print "srvr: $self->{_srvr}\n"; print "db: $self->{_db}\n"; print "st: $self->{_st}\n"; print "et: $self->{_et}\n"; print "ed: $self->{_ed}\n"; print "file: $self->{_file}\n"; print "table: $self->{_tbl}\n"; print "release: $self->{_release}\n"; print "dbh: $self->{_dbh}\n"; my $dbh = $self->{_dbh}; #my $sth = $self->{_sth}; %ENV = %CDMAConfig::MOTOENV; #open(FHD, ">$self->{_file}"); open(WRITEME, "| /export/home/motorola/local/postgresql7.2.3/bin/psql +-d $self->{_db} -U kevin -c 'COPY $self->{_tbl} FROM stdin' ") or die + "Couln't fork: $! \n"; #$dbh->trace(2); #$sql = "INSERT INTO $self->{_tbl} VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?, +?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,? +,?,?,?,?,?,?)"; $sql = "SELECT nextval('" . $self->{_tbl} . "_id_seq')"; $sth = $dbh->prepare($sql); $loop_time=new Benchmark; while( @kolines = readany( @{$self->{_fstopn}} ) ) { $number++ foreach my $theline (@kolines) { if($ver == 16) { @line = (split(/\|/, $theline))[0,4,5,6,9,10,11,14,18, +19,21,22,23,25,26,27,28,29,30,31,32,33,34,35,36,37,254,257,258,259,26 +4,265,266,271,272,273,299,300,301,306,307,308,313,314,315,397,398,399 +,400,403,404,406,407,408,409]; } else { @line = (split(/\|/, $theline))[0,4,5,6,9,10,11,14,18,19,2 +1,22,23,25,26,27,28,29,30,31,32,33,34,35,36,37,211,213,214,215,218,21 +9,220,223,224,225,243,244,245,248,249,250,253,254,255,317,318,319,320 +,322,323,325,326,327,328]; } if($line[0] eq "") { #print "8 is: $line[8]"; $line[0] = '0001-01-01'; } if($line[8] eq "") { #print "8 is: $line[8]"; $line[8] = '00:00:00'; } if($line[25] eq "") { #print "25 is: $line[25]"; $line[25] = '00:00:00'; } #print join(", ", @line); $sth->execute; @rv = $sth->fetchrow_array; print WRITEME join("\t", @line,$rv[0]) . "\n"; #$dbh->commit(); #($number % 5) == 0 ? $dbh->commit() : next; }#end foreach }#end while close(WRITEME); $end_time=new Benchmark; open(KOOUT, ">>/export/home/motodev/kobench.txt"); print KOOUT "$self->{_tbl} the code took: " . timestr(timediff($end_t +ime, $loop_time),"all") . "\n\n"; close(KOOUT); $sth->finish(); $dbh->disconnect(); } sub readany { my $cnt = 0; my @res; for my $fh ( @_ ) { next if eof($fh); my $line = <$fh>; $cnt++ if defined($line); push @res, $line; } return if ! $cnt; return @res; }#end readany sub getData { my ($self) = @_; my @files = @{$self->{_fstopn}}; foreach my $file (@files) { local *FILE; open(FILE, "<$file") or die "can't open file $!\n"; push(@filehandles, *FILE); } $self->{_fstopn} = \@filehandles; &print_me; } # Establish the connection and create the table for this data - return + database connection with prepared statement sub connectDB { my ($self) = @_; $dbh = DBI->connect("DBI:Pg:dbname=$self->{_db};host=se-srvr1;port +=5432", 'kevin', 'passwordhere', { RaiseError => 1, AutoCommit => 0 } +); $dbh->do("CREATE TABLE $self->{_tbl} ( date date DEFAULT '0001-01-01', cdl_seq_num int4 DEFAULT 0, call_ref_num int4 DEFAULT 0, cbsc int4 DEFAULT 0, cpp int4, mid char(10), esn int8 DEFAULT 0, dialed_digits char(32), access_time time DEFAULT '00:00:00', access_pn_offset int4, access_channel int4, access_bts int4 DEFAULT '0', access_sector int4, service_option int4, negotiated_so int4, last_mm_setup_event int4, cic_span int4, cic_slot int4, xcdr int4, init_rf_conn_bts int4, init_rf_conn_sector int4, init_rf_conn_mcc int4, init_rf_conn_element int4, init_rf_conn_channel int4, cfc int4, release_time time DEFAULT '00:00:00', last_maho_cand_count int4, last_maho_act1_bts int4, last_maho_act1_sector int4, last_maho_act1_str int2, last_maho_act2_bts int4, last_maho_act2_sector int4, last_maho_act2_str int2, last_maho_act3_bts int4, last_maho_act3_sector int4, last_maho_act3_str int2, last_maho_cand1_bts int4, last_maho_cand1_sector int4, last_maho_cand1_str int2, last_maho_cand2_bts int4, last_maho_cand2_sector int4, last_maho_cand2_str int2, last_maho_cand3_bts int4, last_maho_cand3_sector int4, last_maho_cand3_str int2, last_sho_bts int4, last_sho_sector int4, last_sho_mcc int4, last_sho_element int4, fwd_quality int4, last_fwd_incr int4, rvs_quality int4, last_rvs_incr int4, rvs_erase_count int4, rf_fade_count int4, id serial)") || print "Error is: $dbh->errstr\n"; $dbh->commit(); $self->{_dbh} = $dbh; return $self->{_dbh}; } 1;
Here's the code that calls the module: oocdlbPg.pl
use lib '/export/home/motodev/bin/'; use Getopt::Std; use Date::Manip; use CDMAConfig; use cdlPg; getopt('ZDHCO'); # Get command line parameters # -Z - which market to produce reports for - separate each market by a + comma - default is all # -D - Date # -H - Hour # -C - Crontab run - if set to 1, means that the script is being calle +d from the crontab # and certain variables should be adjusted accordingly. # -O - OMC to run # # -R - Force collection ??? - NOT CURRENLTY SUPPORTED # # Variables produced are $opt_T, $opt_Z, $opt_D, $opt_H and $opt_C if($opt_C == "1") { #Script being called from cron.....set date and hour to cu +rrent time. $hour = (localtime)[2]; $hour = sprintf ("%02d", $hour); #find previous hour $day = &UnixDate("today","%y%m%d"); $dispday = &UnixDate("today","%Y%m%d"); $date = $day . $hour; } else { $opt_H ? ( $hour = $opt_H ) : ( die "Must supply Hour\n" ); $opt_D ? ( $day = $opt_D ) : ( die "Must supply Date\n" ); $date = $day . $hour; $dispday = "20" . $day . $hour; } # Error checking for parameter variables $opt_Z ? ( $mkt = $opt_Z ) : ( die "Must supply Market\n" ); #$opt_O ? ( $omc = $opt_O ) : ( die "Must supply OMC\n" ); $rel = $CDMAConfig::MAR_CDL_FORMAT{$mkt}; $omcaddr = $CDMAConfig::OMC_ADDRESSES{$mkt}{$omc}; $begin = $hour . "0000"; $end = $hour . "5959"; $file = $CDMAConfig::REPORT_PATH . $mkt . "/pdfs/$dispday$hour.$mkt.$r +el.$omc.Pg.pdfs"; #$file = "/export/home/motorola/cdl_pdf/" . $mkt . "/$dispday$hour.$mk +t.r$rel.omc$omc.pdfs"; $table = "cdl_" . $dispday . $hour; #print "$mkt, $omcaddr, $mkt, $begin, $end, $day, $file, $table, $rel" +; %mktomcs = %{$CDMAConfig::OMC_ADDRESSES{$mkt}}; @mktkys = @{[ keys %mktomcs ]}; foreach $mktky (@mktkys) { my $file = "/export/home/motorola/cdl_pdf/" . $mkt . "/$dispday$ho +ur.$mkt.r$rel.omc$mktky.pdfs"; push @fstopn, $file; } #print join("\n", @fstopn); #@fstopn = ("/home/kevin/tmp/file1","/home/kevin/tmp/file2"); $kocdl = cdlPg->new($mkt, $omcaddr, $mkt, $begin, $end, $day, $file, $ +table, $rel, \@fstopn); $dbcon = $kocdl->connectDB; $kocdl->getData($dbcon); __END__
I know the code looks rough, but hopefully someone can help.

Any help is greatly appreciated!!!

Kevin

update (broquaint): added <readmore> tags and some formatting

Replies are listed 'Best First'.
•Re: Stream data into Postgresql
by merlyn (Sage) on Nov 08, 2002 at 05:14 UTC
    Postgresql is "transaction" oriented. If you don't use "BEGIN" and "COMMIT", then every single statement acts like you had wrapped it in those. And COMMITs are expensive.

    If you can, batch your insertions into a group of 10 or 100, and commit once for those. It'll go much, much faster.

    -- Randal L. Schwartz, Perl hacker
    Be sure to read my standard disclaimer if this is a reply.

Re: Stream data into Postgresql
by diotalevi (Canon) on Nov 08, 2002 at 13:18 UTC

    I already answered most of your questions in the chatterbox but I'm following up for completeness and the archives. Prior to going farther with this you ought to read both of these links COPY and Populating a database. All it comes down to is COPY is faster than INSERT, indexes slow down copy/insert/update and doing things in batches is better than doing them one at a time. In practical terms that means you're almost right with this approach, it's just your execution that needs help.

    First off you need to finish off your COPY command with print WRITEME "\\.\n". You aren't doing such a large load that it probably isn't going to matter if you break this into smaller chunks - 60K records is nothing (my experience anyway). If you were doing an actual large data load then you'd probably break every 10K or so, finish the current COPY, be sure the transaction was complete and start a new transaction/COPY. If you had indexes on the table (which you don't) then it might also be worth it to look at droping them while loading the data and re-instating them afterwards. But then, your instance doesn't merit going to such lengths.

    __SIG__ use B; printf "You are here %08x\n", unpack "L!", unpack "P4", pack "L!", B::svref_2object(sub{})->OUTSIDE;
Re: Stream data into Postgresql
by graff (Chancellor) on Nov 08, 2002 at 04:45 UTC
    Well, just a general comment: it's typical for DBI to operate very slowly on large numbers of sql transactions; any RDBMS worth having will provide its own compiled utility for loading data into a table from a file or other stream, and this utility will usually run many times faster than DBI, doing the same large task.

    So think about setting up a perl script to use an external, RDB-specific utility to load the actual data into the table. (Other stuff, like creating the table, or massaging the data as it goes in, should be quick enough in perl.)