http://qs1969.pair.com?node_id=11120825

Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:

Dear Monks, I am trying to populate a table with a rather large (~60GB) file. I would like to ask if there is a way to populate the table, even if, say 1% of the lines are erroneous, without the script dying.
My code so far is:
#!/usr/bin/perl use strict; use warnings; use DBI; my $numArgs = $#ARGV + 1; die "Usage: ./upload_script.pl <CSV_FILE>" if ($numArgs != 1); my $infile = $ARGV[0]; my $logfile = $infile.'.log'; my $driver = "Pg"; my $database = "testdb"; my $dsn = "DBI:$driver:dbname = $database;port = 123456"; my $userid = "xxx"; my $password = "yyy"; my $dbh = DBI->connect($dsn, $userid, $password, { RaiseError => 1 }) or die $DBI::errstr; open LOG, ">$logfile"; print LOG "Opened database successfully\n"; my $count=0; open (DB_IN, "<$infile" ) || die ("File not found: '$infile'\n"); readline DB_IN; while(<DB_IN>) { my $input_line=$_; chomp $input_line; my @array_of_input = split(/\t/, $input_line); $count++; my $pid_field = $array_of_input[0]; my $quantityid_field = $array_of_input[1]; my $labid_field = $array_of_input[2]; my $system_field = $array_of_input[3]; my $component_field = $array_of_input[4]; my $numericvalue_field = $array_of_input[6]; my $ref_lower_field = $array_of_input[7]; my $ref_upper_field = $array_of_input[8]; my $shown_value_field = $array_of_input[9]; my $date_field = $array_of_input[10]; my $time_field = $array_of_input[11]; my $database_field = $array_of_input[12]; my $componentdb_field = $array_of_input[13]; my $componenttranslationlookup_field = $array_of_input[14]; my $testtype_field = $array_of_input[15]; my $unitclean_field = $array_of_input[16]; my $shownclean_field = $array_of_input[17]; my $reflowerclean_field = $array_of_input[18]; my $refupperclean_field = $array_of_input[19]; my $intervaltype_field = $array_of_input[20]; my $flag_field = $array_of_input[21]; my $abo_field = $array_of_input[22]; my $rhesus_field = $array_of_input[23]; my $dob_field = $array_of_input[24]; my $sex_field = $array_of_input[25]; my $insert_stmt = " INSERT INTO biochemical VALUES (".$count.", '".$pid_field."', '".$quantityid_field."', '".$labid_field."', '".$system_field."', '".$component_field."', '".$numericvalue_field."', '".$ref_lower_field."', '".$ref_upper_field."', '".$shown_value_field."', '".$date_field."', '".$time_field."', '".$database_field."', '".$componentdb_field."', '".$componenttranslationlookup_fiel +d."', '".$testtype_field."', '".$unitclean_field."', '".$shownclean_field."', '".$reflowerclean_field."', '".$refupperclean_field."', '".$intervaltype_field."', '".$flag_field."', '".$abo_field."', '".$rhesus_field."', '".$dob_field."', '".$sex_field."')"; my $rv = $dbh->do($insert_stmt); if(!$rv) { print "Problem inserting: $input_line\n"; } } close DB_IN; close LOG;
This code does work, but it dies as soon as an erroneous line is found. Ideally, I would like this line to just be thrown into my LOG and then the script continues to execute. Is there a way? In that way, I could leave the script run unattended and then check the lines that were not able to be inserted.