# Convert files containing a known set of UTF-8 encoded characters to plain
# ASCII, writing one converted copy per input file into an output directory.
#
# Arguments (positional, from @ARGV):
#   0  glob pattern selecting the input files
#   1  existing output directory; each result is written there under the
#      input file's basename
#   2  optional debug flag; when true, every line containing non-ASCII bytes
#      is printed to STDOUT before and after translation
#
# Any non-ASCII byte that is not part of a known sequence is replaced in the
# output by "[decimal/0xHEX/octal]" and the line is reported on STDOUT.
use strict;
use warnings;
use utf8;
use File::Basename;

my ($pattern, $outdir, $debug) = @ARGV;

# Check for undef first so that -d is never applied to a missing argument.
die "No output directory given\n" unless defined $outdir && -d $outdir;

my @files = glob($pattern);

$outdir =~ tr{\\}{/};                       # backslash to forward slash
$outdir .= '/' unless $outdir =~ m{/\z};    # add final slash if missing

# Raw UTF-8 byte sequence (octal escapes) => ASCII replacement.
# No key is a prefix of another and every replacement is pure ASCII, so a
# single substitution pass gives the same result as one pass per entry.
my %ascii_for = (
    "\302\267"     => '.',           # U+00B7 middle dot, used as a decimal point
    "\342\200\230" => q{'},          # U+2018 left single curly quote
    "\342\200\231" => q{'},          # U+2019 right single curly quote
    "\342\200\223" => '-',           # U+2013 en dash
    "\303\257"     => 'i',           # U+00EF i with diaeresis
    "\302\243"     => 'GBP',         # U+00A3 pound sign
    "\342\200\246" => '...',         # U+2026 horizontal ellipsis
    "\302\256"     => '(a)',         # U+00AE registered sign
                                     # NOTE(review): historical output is "(a)";
                                     # "(R)" may have been intended - confirm
    "\303\250"     => 'e',           # U+00E8 e with grave
    "\303\251"     => 'e',           # U+00E9 e with acute
    "\342\211\244" => '<=',          # U+2264 less-than or equal to
    "\342\211\245" => '>=',          # U+2265 greater-than or equal to
    "\303\264"     => 'o',           # U+00F4 o with circumflex
    "\302\240"     => ' ',           # U+00A0 no-break space
    "\302\263"     => '^3',          # U+00B3 superscript three
    "\302\262"     => '^2',          # U+00B2 superscript two
    "\302\260"     => ' degrees',    # U+00B0 degree sign
    "\342\200\235" => '""',          # U+201D right double curly quote (escaped for csv)
    "\342\200\234" => '""',          # U+201C left double curly quote (escaped for csv)
    "\302\275"     => '1/2',         # U+00BD vulgar fraction one half
);

my $known_alternatives = join '|', map { quotemeta } sort keys %ascii_for;
my $known_re           = qr/($known_alternatives)/;

for my $file (@files) {
    # $outdir already ends with a slash, so no separator is added here.
    my $outfile = $outdir . basename($file);

    open(my $in, '<', $file) or die "Cannot open $file for read:$!\n";

    # Read raw bytes: the table above matches UTF-8 *byte* sequences, so the
    # input must not be decoded.  The price is that DOS line endings are not
    # translated on read and have to be normalised by hand below.
    binmode $in;

    open(my $out, '>', $outfile) or die "Cannot open $outfile for write:$!\n";

    while (my $line = <$in>) {
        $line =~ s/\x0D\x0A/\n/g;

        if ($line =~ /[^[:ascii:]]/) {
            print "Before: $line\n" if $debug;

            $line =~ s/$known_re/$ascii_for{$1}/g;

            # Anything still non-ASCII is unknown: make each such byte visible
            # as [decimal/0xHEX/octal] so it can be added to the table.
            if ($line =~ /[^[:ascii:]]/) {
                $line =~ s{([^[:ascii:]])}{
                    my $byte = ord $1;
                    sprintf '[%d/0x%X/%o]', $byte, $byte, $byte;
                }ge;
                print "Unhandled sequence: $line\n";
            }

            print "After: $line\n" if $debug;
        }

        print {$out} $line;
    }

    close $in;

    # Buffered write errors (e.g. disk full) only surface at close.
    close $out or die "Cannot close $outfile: $!\n";
}