in reply to substitute space to 0
A hex dump of the download of the example data from the original post seems to show that these are tab-separated CSV records in which you need to expand null fields to a single '0' character. If that's the case, the best advice is to save yourself a headache (or many headaches) and use the flexible and reliable Text::CSV module (which will automatically use its cool, fast big brother Text::CSV_XS if that module is available).
However, if a "pure" regex solution is needed, here's my approach. (A split/fixup/join approach might be easier to understand, but I'm too tired right now to take that on. :) Tested under Perl versions 5.8.9 and 5.10.1.4, but some alternate versions of the critical substitution regex need version 5.10+ for the \K operator.
# Test script for a single-regex substitution that replaces every null
# (empty) field of a tab-separated record with the single character '0'.
#
# Every test vector is a pair [ input, expected ]; a plain string in the
# vector list is a section heading that is emitted with note().  Each case
# is exercised both with and without a terminating newline.

use warnings;
use strict;

# use 5.010;  # needs \K regex extension for some regexes

use Test::More 'no_plan';
use Test::NoWarnings;

use Data::Dump qw(pp);

note "\n === all test cases are with and without terminating newlines ===\n\n";

my @tests = (
    'simple cases - no tab separator',
    [ qq{\n},  qq{\n}  ],
    [ qq{},    qq{}    ],
    [ qq{1\n}, qq{1\n} ],
    [ qq{1},   qq{1}   ],

    'simple cases - tab separator(s), no null fields',
    [ qq{1\t23\n},      qq{1\t23\n}      ],
    [ qq{1\t23},        qq{1\t23}        ],
    [ qq{1\t23\t456\n}, qq{1\t23\t456\n} ],
    [ qq{1\t23\t456},   qq{1\t23\t456}   ],

    'simple cases - only tab(s) (null fields)',
    [ qq{\t\n},     qq{0\t0\n}       ],
    [ qq{\t},       qq{0\t0}         ],
    [ qq{\t\t\n},   qq{0\t0\t0\n}    ],
    [ qq{\t\t},     qq{0\t0\t0}      ],
    [ qq{\t\t\t\n}, qq{0\t0\t0\t0\n} ],
    [ qq{\t\t\t},   qq{0\t0\t0\t0}   ],

    'tabs and digits intermixed (leading/trailing null fields)',
    [ qq{1\t\n},      qq{1\t0\n}       ],
    [ qq{1\t},        qq{1\t0}         ],
    [ qq{\t1\n},      qq{0\t1\n}       ],
    [ qq{\t1},        qq{0\t1}         ],
    [ qq{1\t\t\n},    qq{1\t0\t0\n}    ],
    [ qq{1\t\t},      qq{1\t0\t0}      ],
    [ qq{\t1\t\n},    qq{0\t1\t0\n}    ],
    [ qq{\t1\t},      qq{0\t1\t0}      ],
    [ qq{\t\t1\n},    qq{0\t0\t1\n}    ],
    [ qq{\t\t1},      qq{0\t0\t1}      ],
    [ qq{1\t\t\t\n},  qq{1\t0\t0\t0\n} ],
    [ qq{1\t\t\t},    qq{1\t0\t0\t0}   ],
    [ qq{\t1\t\t\n},  qq{0\t1\t0\t0\n} ],
    [ qq{\t1\t\t},    qq{0\t1\t0\t0}   ],
    [ qq{\t\t1\t\n},  qq{0\t0\t1\t0\n} ],
    [ qq{\t\t1\t},    qq{0\t0\t1\t0}   ],
    [ qq{\t\t\t1\n},  qq{0\t0\t0\t1\n} ],
    [ qq{\t\t\t1},    qq{0\t0\t0\t1}   ],

    'tabs and digits intermixed (no leading/trailing null fields)',
    [ qq{12\t34\n},       qq{12\t34\n}       ],
    [ qq{12\t34},         qq{12\t34}         ],
    [ qq{12\t\t34\n},     qq{12\t0\t34\n}    ],
    [ qq{12\t\t34},       qq{12\t0\t34}      ],
    [ qq{12\t\t\t34\n},   qq{12\t0\t0\t34\n} ],
    [ qq{12\t\t\t34},     qq{12\t0\t0\t34}   ],

    'test cases posted with pm 1220510 (with/without newlines)',
    [ qq{1\t2\t\t\t2\t\t5\t\t\t\t4\n},
      qq{1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4\n},
    ],
    [ qq{1\t2\t\t\t2\t\t5\t\t\t\t4},
      qq{1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4},
    ],
    [ qq{4\t4\t4\t\t\t4\t\t\t\t3\t\n},
      qq{4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0\n},
    ],
    [ qq{4\t4\t4\t\t\t4\t\t\t\t3\t},
      qq{4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0},
    ],
    [ qq{\t\t4\t4\t\t\t1\t\t\t\t\n},
      qq{0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0\n},
    ],
    [ qq{\t\t4\t4\t\t\t1\t\t\t\t},
      qq{0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0},
    ],
    [ qq{\t1\t5\t6\t\t4\t\t\t\t\t\n},
      qq{0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0\n},
    ],
    [ qq{\t1\t5\t6\t\t4\t\t\t\t\t},
      qq{0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0},
    ],
);

VECTOR:
for my $ar_vector (@tests) {
    # A non-reference entry is a section heading, not a test vector.
    if (not ref $ar_vector) {
        note $ar_vector;
        next VECTOR;
    }

    my ($string, $expected) = @$ar_vector;

    # The next two regexes work, but need perl version 5.10+ for \K:
    #   s{ (?: \A | \t) \K (?= \t) | (?<= \t) (?= (?: \t | \Z)) }{0}xmsg
    #   s{ (?! \A \Z) (?: \A | \t) \K (?= (?: \t | \Z)) }{0}xmsg
    #
    # The regex used below works with both pre-5.10 and 5.10+ perl versions.
    # It matches the zero-width position of a null field:
    #   (?! \A \Z)             - never touch an empty record ('' or a lone "\n");
    #   (?: \A | (?<= \t))     - a field starts at string start or after a tab;
    #   (?= (?: \t | \Z))      - and ends at once, at a tab or at the end of
    #                            the string (\Z also matches just before a
    #                            final newline, so a terminator is preserved).
    (my $got = $string) =~
        s{ (?! \A \Z) (?: \A | (?<= \t)) (?= (?: \t | \Z)) }{0}xmsg;

    # is() rather than ok(... eq ...) so that a failure reports both the
    # value obtained and the value expected.
    is $got, $expected,
        # pp($string, $got, $expected)  # for debug
        pp($string, $got);
}    # end for VECTOR

done_testing;

exit;
c:\@Work\Perl\monks\yueli711>perl normalize_tsv_1.pl # # === all test cases are with and without terminating newlines === # # simple cases - no tab separator ok 1 - ("\n", "\n") ok 2 - ("", "") ok 3 - ( # 1 # , # 1 # , # ) ok 4 - (1, 1) # simple cases - tab separator(s), no null fields ok 5 - ("1\t23\n", "1\t23\n") ok 6 - ("1\t23", "1\t23") ok 7 - ("1\t23\t456\n", "1\t23\t456\n") ok 8 - ("1\t23\t456", "1\t23\t456") # simple cases - only tab(s) (null fields) ok 9 - ("\t\n", "0\t0\n") ok 10 - ("\t", "0\t0") ok 11 - ("\t\t\n", "0\t0\t0\n") ok 12 - ("\t\t", "0\t0\t0") ok 13 - ("\t\t\t\n", "0\t0\t0\t0\n") ok 14 - ("\t\t\t", "0\t0\t0\t0") # tabs and digits intermixed (leading/trailing null fields) ok 15 - ("1\t\n", "1\t0\n") ok 16 - ("1\t", "1\t0") ok 17 - ("\t1\n", "0\t1\n") ok 18 - ("\t1", "0\t1") ok 19 - ("1\t\t\n", "1\t0\t0\n") ok 20 - ("1\t\t", "1\t0\t0") ok 21 - ("\t1\t\n", "0\t1\t0\n") ok 22 - ("\t1\t", "0\t1\t0") ok 23 - ("\t\t1\n", "0\t0\t1\n") ok 24 - ("\t\t1", "0\t0\t1") ok 25 - ("1\t\t\t\n", "1\t0\t0\t0\n") ok 26 - ("1\t\t\t", "1\t0\t0\t0") ok 27 - ("\t1\t\t\n", "0\t1\t0\t0\n") ok 28 - ("\t1\t\t", "0\t1\t0\t0") ok 29 - ("\t\t1\t\n", "0\t0\t1\t0\n") ok 30 - ("\t\t1\t", "0\t0\t1\t0") ok 31 - ("\t\t\t1\n", "0\t0\t0\t1\n") ok 32 - ("\t\t\t1", "0\t0\t0\t1") # tabs and digits intermixed (no leading/trailing null fields) ok 33 - ("12\t34\n", "12\t34\n") ok 34 - ("12\t34", "12\t34") ok 35 - ("12\t\t34\n", "12\t0\t34\n") ok 36 - ("12\t\t34", "12\t0\t34") ok 37 - ("12\t\t\t34\n", "12\t0\t0\t34\n") ok 38 - ("12\t\t\t34", "12\t0\t0\t34") # test cases posted with pm 1220510 (with/without newlines) ok 39 - ( # "1\t2\t\t\t2\t\t5\t\t\t\t4\n", # "1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4\n", # ) ok 40 - ( # "1\t2\t\t\t2\t\t5\t\t\t\t4", # "1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4", # ) ok 41 - ( # "4\t4\t4\t\t\t4\t\t\t\t3\t\n", # "4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0\n", # ) ok 42 - ( # "4\t4\t4\t\t\t4\t\t\t\t3\t", # "4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0", # ) ok 43 - ( # "\t\t4\t4\t\t\t1\t\t\t\t\n", # 
"0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0\n", # ) ok 44 - ("\t\t4\t4\t\t\t1\t\t\t\t", "0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0") ok 45 - ( # "\t1\t5\t6\t\t4\t\t\t\t\t\n", # "0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0\n", # ) ok 46 - ("\t1\t5\t6\t\t4\t\t\t\t\t", "0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0" +) 1..46 ok 47 - no warnings 1..47
Give a man a fish: <%-{-{-{-<
|
|---|