comment on

hello, i'm dealing with a csv file that has some 'bad' lines. how do i handle multiple missing values? multiple commas. my code works for single missing values.

this multiple missing values happens rarely

. i'd like a solution that is still fast at parsing. i'm using Parse::CSV

  my $csv = Parse::CSV->new(
  file       => $xFile[$k],
  sep_char   => ',',
  names      => 1,
  empty_is_undef => 1,
  auto_diag => 1,
  binary    => 0,
  header =>'auto' );
[download]

11004516,0,0,9,9,3,12477,,,4,,0,,,3,38a947a1,b66b7850,6a14f9b9 11006995,1,,-1,,,,,,,,,,,,fbc55dae,9a89b36c,58e67aaf,f600ec0b,

the error is "Argument "" isn't numeric in numeric eq (==)"

i want so substitute '0' for blanks or missing values i've tried code with various comparisons like

if(length($str_check)==0){return ('0');}else{return($str_check);}
[download]

if($str_check = undef){do stuff};{return ('0');}else{return($str_check
+);}
[download]

any help appreciated.

thanks

Here is my full code . Yes, I admit I am new to serious Perl scripts and this code is not optimal.

use 5.12.0;
use warnings;
use strict;
use Carp;
use sigtrap 'handler' => \&myhand, 'INT';
use Cwd;
use Benchmark;
use File::Basename;
use Acme::Comment type => 'C++', own_line => 1;
use English '-no_match_vars';
#####################################################################
use Parse::CSV;
use Text::CSV_XS;
#####################################################################
system('clear');
my $dbg_1=0;
my $dbg_2=1;
my $start=time;
my $t0 = new Benchmark;
print "\n Current Date and Time -> " . localtime() . "\n";
my $Base='/Users/Documents/matlab/projects/kaggle/criteo';
my $s_DIR=$Base.'/input/tmp';
my $p_DIR=$Base.'/output/data/pass';
my $f_DIR=$Base.'/output/data/fail';
my @xFile = grep {-f $_}glob( "$s_DIR/x*");
#
if($dbg_1){
  foreach my $f (@xFile) {
    my $filesize = -s $f;
    printf "%-25s size is %15d \n", ($f, $filesize);
  };
};
#
#initialize vars
my $k=0;
my $noLines=9e3;
my $count=0;
my $result=0;
my $temp=0;
my $value=0;
my $name="";
my $n=@xFile;

for ($k = 0; $k <= $n; $k++){
  my $indexF=0;
  my $indexP=0;
  (my $suffix,my $path,$name)=fileparse($xFile[$k], "\.[^.]*" );
  print 'processing '.$name."\n";
  my $f_Pass=$p_DIR."/pass_table_".$name.'.txt';
  my $f_Fail=$f_DIR."/fail_table_".$name.'.txt';
  open(DATA,">".$f_Pass) || die "Can't open output file";
  open(DATA2,">".$f_Fail) || die "Can't open output file";
  if($dbg_1){
    print "xF=> ".$k."\n";
    print "xFile[xF]=> ".$xFile[$k]."\n";
    print "name=> ".$name." \n";
    print "path=> ".$path." \n";
    print "suffix=> ".$suffix." \n";
    print "f_Pass=> ".$f_Pass."\n";
    print "f_Fail=> ".$f_Fail."\n";
  };
  my $csv = Parse::CSV->new(
  file       => $xFile[$k],
  sep_char   => ',',
  names      => 1,
  empty_is_undef => 1,
  blank_is_undef => 1,
  auto_diag => 1,
  binary    => 1,
  header =>'auto',
  callbacks      => { after_parse => sub { $_ ||= 0 for @{$_[1] }
},}
    );

  my @hash = $csv->names; #returns hash
  my @vals = values @hash; #hash to array
#  for ($count = 0; $count <= $noLines; $count++) {
#    $value = $csv->fetch;
while ( $value = $csv->fetch ){
    if($value->{$vals[1]}==1){
      for $k (2 .. $#vals) {
        $temp=$value->{$vals[$k]};
        $result=check_blank($temp);
        process_table($k,$result);
      };
      printf DATA "\n";
      $indexP=$indexP+1;
    }else{
      for $k (2 .. $#vals) {
        $temp=$value->{$vals[$k]};
        $result=check_blank($temp);
        process_table2($k,$result);
      };
      printf DATA2 "\n";
      $indexF=$indexF+1;
    };
  };
  print " totalP $indexP totalF ".($indexF-0)." total ".($indexP+$inde
+xF)." \n";
  printf "%% totalP/(totalF+totalP)= %.2f %% \n",($indexP/($indexP+$in
+dexF)*100);
  close(DATA) || die "Couldn't close output file properly";
  close(DATA2) || die "Couldn't close output file properly";

};

######################## sub #########################################
+###
sub check_blank{
  my $str_check= $_[0];
  if((length($str_check)==0)) {
    return ('0');
  }else{
    return($str_check);
  };
exit 1;
};
sub process_table{
  my $kk= $_[0];
  my $result= $_[1];
  if($kk==1){
    #do something here
  }else{
    printf DATA $result." ";
  };
  return;
  exit 1;
};
sub process_table2{
  my $kk= $_[0];
  my $result= $_[1];
  if($kk==1){
    #do something here
  }else{
    printf DATA2 $result." ";
  };
  return;
  exit 1;
};
########################system #######################################
+#
my $t1 = new Benchmark;
my $td = timediff($t1, $t0);
print "Code took:",timestr($td),"\n";
printf "++Finished program in ->\t %5.2f seconds\n",time-$start;
print "\n";
######################################################################
+####
sub myhand {
  print "\n caught $SIG{INT}", @_;
  close(DATA) || die "Couldn't close output file properly";
  print "\nHey Stop that SIG hurts!";
  print "\nCleaning up now...";
  exit 1;
};
sub pad (){
  my ( $num, $len ) = @_;
  return '0' x ( $len - length $num ) . $num;
  exit 1;
};

######################################################################
+#####
exit 1;
[download]

In reply to csv parsing with multiple missing values/multiple commas by f77coder

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.