I think graff summarized your problem quite well. "Fixing" CSV with regular expressions is fragile. It can work in a very well-defined situation, but it is likely to break in any other.
What (usually) works better is to use a well-defined parser like Text::CSV_XS and to remember what a sane state is: either scan each line knowing how many fields should be in it, or dynamically remember what the first sane line told you. If a line gives you a parsing error, or the parsed line does not contain enough fields, back up and join the next line. You can take the aforementioned parser-xs.pl as a framework to start from.
In your case, that could end up as something like this (untested):
use strict;
use warnings;
use Text::CSV_XS;
# Re-emit a CSV file that contains unquoted embedded newlines or loose quotes
# as properly quoted CSV on STDOUT. Diagnostics go to STDERR.
#
# Usage: script [file]      (file defaults to "test.csv")
#
# Strategy:
#  1. Parse each record with a strict parser.
#  2. A record with fewer fields than the first good record is assumed to be
#     continued on the next line: its last field is joined (with "\n") to the
#     first field of the following record.
#  3. A record the strict parser rejects is retried with a lenient parser,
#     appending further physical lines while the lenient parser reports one
#     of the error codes listed in %err_eol.

# Strict parser: first attempt for every record, and the writer of the
# output. eol is set so that print () terminates every record.
my $csv = Text::CSV_XS->new ({
    binary         => 1,
    blank_is_undef => 1,
    eol            => $/,
});

# Lenient fallback parser: loose quotes allowed, no escape character.
# Only used on input that the strict parser rejected.
my $csa = Text::CSV_XS->new ({
    binary             => 1,
    allow_loose_quotes => 1,
    blank_is_undef     => 1,
    escape_char        => undef,
});

my $file = @ARGV ? shift : "test.csv";
open my $fh, "<", $file or die "$file: $!\n";

# Lenient-parser error codes that are handled by appending the next physical
# line to the input and parsing again (\r or \n inside a field).
my %err_eol = map { $_ => 1 } 2010, 2027, 2031, 2032;

print STDERR "Reading $file with Text::CSV_XS $Text::CSV_XS::VERSION\n";

my $nf = 0;     # Expected number of fields, set by the first parsed record
my @pv;         # Fields of a short record, waiting to be joined to the next

# The loop is labelled because REPARSE below is a bare block, and a bare
# block counts as a loop for next/last/redo: an unlabelled "next" inside it
# would merely leave that block instead of starting the next record.
ROW: while (1) {
    my $row = $csv->getline ($fh);
    if ($row) {
        if (@pv) {
            # Previous line ended with an embedded newline: glue its last
            # field to the first field of this record. Blank fields are undef
            # (blank_is_undef), so map them to "" to avoid warnings.
            my $head = shift @$row;
            $pv[-1] = join "\n", map { defined $_ ? $_ : "" } $pv[-1], $head;
            unshift @$row, @pv;
            @pv = ();
        }
        $nf ||= @$row;
        if (@$row < $nf) {
            # Too few fields: keep them and fetch the continuation
            @pv = @$row;
            redo ROW;
        }
    }
    else {      # Parsing failed
        # Could be end of file
        $csv->eof and last ROW;

        # Diagnose and show what was wrong
        my @diag = $csv->error_diag;
        print STDERR "$file line $./$diag[2] - $diag[0] - $diag[1]\n";

        my $ein = $csv->error_input;    # The line scanned so far
        defined $ein or $ein = "";

        # diag[2] is 1-based. Clamp the 0-based position into the input so
        # the 4-argument substr calls below can never run outside the string.
        my $ep = $diag[2] - 1;
        $ep < 0           and $ep = 0;
        $ep > length $ein and $ep = length $ein;

        my $err = $ein . " ";
        substr $err, $ep + 1, 0, "*";   # Bad character marked between **
        substr $err, $ep,     0, "*";

        # Show up to 5 characters of leading context. A negative offset
        # would make substr count from the END of the string instead.
        my $from = $ep > 5 ? $ep - 5 : 0;
        ($err = substr $err, $from, 12) =~ s/ +$//;
        print STDERR " |$err|\n";

        REPARSE: {      # Now retry with allowed options
            if ($csa->parse ($ein)) {
                print STDERR "Accepted in allow mode ...\n";
                $row = [ $csa->fields ];
            }
            else {      # Still fails
                my @rediag = $csa->error_diag;
                if (exists $err_eol{$rediag[0]}) {  # \r or \n inside field
                    my $more = <$fh>;
                    if (defined $more) {
                        print STDERR " Extending line with next chunk\n";
                        $ein .= $more;
                        redo REPARSE;
                    }
                    # Input exhausted: nothing left to append, so give up on
                    # this record instead of looping forever.
                }
                print STDERR " Also could not parse it in allow mode\n";
                print STDERR " $./$rediag[2] - $rediag[0] - $rediag[1]\n";
                print STDERR " Line skipped\n";
                next ROW;
            }
        }
    }

    # Data was fine, print data properly quoted
    $csv->print (*STDOUT, $row);
}

# A short record that never got its continuation would otherwise be lost.
if (@pv) {
    print STDERR " Last record has fewer than $nf fields, printed as is\n";
    $csv->print (*STDOUT, \@pv);
}

close $fh;
Enjoy, Have FUN! H.Merijn
|