Re: substitute space to 0

A hex dump of the download of the OPed example data seems to show that these are tab-separated CSV records in which you need to expand null fields to a single '0' character. If that's the case, the best advice is to save yourself ~~a headache~~ many headaches and to use the flexible and reliable Text::CSV module (which will automatically install its cool, fast big brother Text::CSV_XS if it possibly can).

However, if a "pure" regex solution is needed, here's my approach. (A split/fixup/join approach might be easier to understand, but I'm too tired right now to take that on. :) Tested under Perl versions 5.8.9 and 5.10.1.4, but some alternate versions of the critical substitution regex need version 5.10+ for the \K operator.

use warnings;
use strict;

# use 5.010;  # needs \K regex extension for some regexes

# NOTE(review): 'no_plan' is combined with done_testing (end of file) and
# with the extra end-of-run test added by Test::NoWarnings.  The recorded
# run of this script prints two plan lines (1..46, then 1..47); confirm
# that your TAP consumer accepts that before changing the plan handling.
use Test::More 'no_plan';
use Test::NoWarnings;

use Data::Dump qw(pp);

# Purpose: exercise a single substitution that expands every null (empty)
# field of a tab-separated record to a single '0' character, leaving
# non-empty fields, an entirely empty record and an optional terminating
# newline untouched.

note "\n === all test cases are with and without terminating newlines "
   . "===\n\n";

# Test vectors.  A plain string is a section heading that is only echoed
# via note(); an array reference is a pair [ input, expected output ].
# Long pairs are written one per line so that no source line needs to be
# wrapped when the script is posted.
my @tests = (
  'simple cases - no tab separator',
  [ qq{\n},  qq{\n}  ], [ qq{},  qq{}  ],
  [ qq{1\n}, qq{1\n} ], [ qq{1}, qq{1} ],

  'simple cases - tab separator(s), no null fields',
  [ qq{1\t23\n},      qq{1\t23\n}      ],
  [ qq{1\t23},        qq{1\t23}        ],
  [ qq{1\t23\t456\n}, qq{1\t23\t456\n} ],
  [ qq{1\t23\t456},   qq{1\t23\t456}   ],

  'simple cases - only tab(s) (null fields)',
  [ qq{\t\n},     qq{0\t0\n}       ], [ qq{\t},     qq{0\t0}       ],
  [ qq{\t\t\n},   qq{0\t0\t0\n}    ], [ qq{\t\t},   qq{0\t0\t0}    ],
  [ qq{\t\t\t\n}, qq{0\t0\t0\t0\n} ], [ qq{\t\t\t}, qq{0\t0\t0\t0} ],

  'tabs and digits intermixed (leading/trailing null fields)',
  [ qq{1\t\n},   qq{1\t0\n}    ], [ qq{1\t},   qq{1\t0}    ],
  [ qq{\t1\n},   qq{0\t1\n}    ], [ qq{\t1},   qq{0\t1}    ],

  [ qq{1\t\t\n}, qq{1\t0\t0\n} ], [ qq{1\t\t}, qq{1\t0\t0} ],
  [ qq{\t1\t\n}, qq{0\t1\t0\n} ], [ qq{\t1\t}, qq{0\t1\t0} ],
  [ qq{\t\t1\n}, qq{0\t0\t1\n} ], [ qq{\t\t1}, qq{0\t0\t1} ],

  [ qq{1\t\t\t\n}, qq{1\t0\t0\t0\n} ],
  [ qq{1\t\t\t},   qq{1\t0\t0\t0}   ],
  [ qq{\t1\t\t\n}, qq{0\t1\t0\t0\n} ],
  [ qq{\t1\t\t},   qq{0\t1\t0\t0}   ],
  [ qq{\t\t1\t\n}, qq{0\t0\t1\t0\n} ],
  [ qq{\t\t1\t},   qq{0\t0\t1\t0}   ],
  [ qq{\t\t\t1\n}, qq{0\t0\t0\t1\n} ],
  [ qq{\t\t\t1},   qq{0\t0\t0\t1}   ],

  'tabs and digits intermixed (no leading/trailing null fields)',
  [ qq{12\t34\n},     qq{12\t34\n}       ],
  [ qq{12\t34},       qq{12\t34}         ],
  [ qq{12\t\t34\n},   qq{12\t0\t34\n}    ],
  [ qq{12\t\t34},     qq{12\t0\t34}      ],
  [ qq{12\t\t\t34\n}, qq{12\t0\t0\t34\n} ],
  [ qq{12\t\t\t34},   qq{12\t0\t0\t34}   ],

  'test cases posted with pm 1220510 (with/without newlines)',
  [ qq{1\t2\t\t\t2\t\t5\t\t\t\t4\n},
    qq{1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4\n},
    ],
  [ qq{1\t2\t\t\t2\t\t5\t\t\t\t4},
    qq{1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4},
    ],
  [ qq{4\t4\t4\t\t\t4\t\t\t\t3\t\n},
    qq{4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0\n},
    ],
  [ qq{4\t4\t4\t\t\t4\t\t\t\t3\t},
    qq{4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0},
    ],
  [ qq{\t\t4\t4\t\t\t1\t\t\t\t\n},
    qq{0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0\n},
    ],
  [ qq{\t\t4\t4\t\t\t1\t\t\t\t},
    qq{0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0},
    ],
  [ qq{\t1\t5\t6\t\t4\t\t\t\t\t\n},
    qq{0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0\n},
    ],
  [ qq{\t1\t5\t6\t\t4\t\t\t\t\t},
    qq{0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0},
    ],

  );

VECTOR:
for my $ar_vector (@tests) {

  # a non-reference element is a section heading, not a test vector
  if (not ref $ar_vector) {
    note $ar_vector;
    next VECTOR;
    }

  my ($string, $expected) = @$ar_vector;

  # Work on a copy so the input stays available for the test name.
  # How the active (pre-5.10 compatible) regex reads, piece by piece:
  #   (?! \A \Z)             never match in a record that is empty or
  #                          consists only of a newline
  #   (?: \A | (?<= \t))     the position is the start of the string or
  #                          directly follows a tab ...
  #   (?= (?: \t | \Z))      ... and is directly followed by a tab or by
  #                          the end of the string (\Z also matches just
  #                          before a terminating newline)
  # Such a zero-width position is a null field; '0' is inserted there.
  (my $got = $string) =~
  # the next two regexes work, but need perl version 5.10+ for \K
  # s{ (?: \A | \t) \K (?= \t) | (?<= \t) (?= (?: \t | \Z)) }  # works
  # s{ (?! \A \Z) (?: \A | \t) \K (?= (?: \t | \Z)) }  # works
  # the next regex works with both pre-5.10 and 5.10+ perl versions
    s{ (?! \A \Z) (?: \A | (?<= \t)) (?= (?: \t | \Z)) }  # works
     {0}xmsg;

  # is() rather than ok(... eq ...): identical output when the test
  # passes, but a failure reports the got/expected strings.
  is $got, $expected,
   # pp($string, $got, $expected)  # for debug
     pp($string, $got)
     ;

  }  # end for VECTOR

done_testing;

exit;
[download]

Output:

c:\@Work\Perl\monks\yueli711>perl normalize_tsv_1.pl
#
#  === all test cases are with and without terminating newlines ===
#
# simple cases - no tab separator
ok 1 - ("\n", "\n")
ok 2 - ("", "")
ok 3 - (
#   1
# ,
#   1
# ,
# )
ok 4 - (1, 1)
# simple cases - tab separator(s), no null fields
ok 5 - ("1\t23\n", "1\t23\n")
ok 6 - ("1\t23", "1\t23")
ok 7 - ("1\t23\t456\n", "1\t23\t456\n")
ok 8 - ("1\t23\t456", "1\t23\t456")
# simple cases - only tab(s) (null fields)
ok 9 - ("\t\n", "0\t0\n")
ok 10 - ("\t", "0\t0")
ok 11 - ("\t\t\n", "0\t0\t0\n")
ok 12 - ("\t\t", "0\t0\t0")
ok 13 - ("\t\t\t\n", "0\t0\t0\t0\n")
ok 14 - ("\t\t\t", "0\t0\t0\t0")
# tabs and digits intermixed (leading/trailing null fields)
ok 15 - ("1\t\n", "1\t0\n")
ok 16 - ("1\t", "1\t0")
ok 17 - ("\t1\n", "0\t1\n")
ok 18 - ("\t1", "0\t1")
ok 19 - ("1\t\t\n", "1\t0\t0\n")
ok 20 - ("1\t\t", "1\t0\t0")
ok 21 - ("\t1\t\n", "0\t1\t0\n")
ok 22 - ("\t1\t", "0\t1\t0")
ok 23 - ("\t\t1\n", "0\t0\t1\n")
ok 24 - ("\t\t1", "0\t0\t1")
ok 25 - ("1\t\t\t\n", "1\t0\t0\t0\n")
ok 26 - ("1\t\t\t", "1\t0\t0\t0")
ok 27 - ("\t1\t\t\n", "0\t1\t0\t0\n")
ok 28 - ("\t1\t\t", "0\t1\t0\t0")
ok 29 - ("\t\t1\t\n", "0\t0\t1\t0\n")
ok 30 - ("\t\t1\t", "0\t0\t1\t0")
ok 31 - ("\t\t\t1\n", "0\t0\t0\t1\n")
ok 32 - ("\t\t\t1", "0\t0\t0\t1")
# tabs and digits intermixed (no leading/trailing null fields)
ok 33 - ("12\t34\n", "12\t34\n")
ok 34 - ("12\t34", "12\t34")
ok 35 - ("12\t\t34\n", "12\t0\t34\n")
ok 36 - ("12\t\t34", "12\t0\t34")
ok 37 - ("12\t\t\t34\n", "12\t0\t0\t34\n")
ok 38 - ("12\t\t\t34", "12\t0\t0\t34")
# test cases posted with pm 1220510 (with/without newlines)
ok 39 - (
#   "1\t2\t\t\t2\t\t5\t\t\t\t4\n",
#   "1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4\n",
# )
ok 40 - (
#   "1\t2\t\t\t2\t\t5\t\t\t\t4",
#   "1\t2\t0\t0\t2\t0\t5\t0\t0\t0\t4",
# )
ok 41 - (
#   "4\t4\t4\t\t\t4\t\t\t\t3\t\n",
#   "4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0\n",
# )
ok 42 - (
#   "4\t4\t4\t\t\t4\t\t\t\t3\t",
#   "4\t4\t4\t0\t0\t4\t0\t0\t0\t3\t0",
# )
ok 43 - (
#   "\t\t4\t4\t\t\t1\t\t\t\t\n",
#   "0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0\n",
# )
ok 44 - ("\t\t4\t4\t\t\t1\t\t\t\t", "0\t0\t4\t4\t0\t0\t1\t0\t0\t0\t0")
ok 45 - (
#   "\t1\t5\t6\t\t4\t\t\t\t\t\n",
#   "0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0\n",
# )
ok 46 - ("\t1\t5\t6\t\t4\t\t\t\t\t", "0\t1\t5\t6\t0\t4\t0\t0\t0\t0\t0")
1..46
ok 47 - no warnings
1..47
[download]

Give a man a fish: <%-{-{-{-<

Comment on Re: substitute space to 0 Select or Download Code