Re: Removing extra spaces

I know, this question was asked more than 7 years ago, but I would
like to post a sub that I wrote that does exactly what you want:

sub CollapseWhitespace{@_ or return'';my$T=shift;defined$T
or return'';my$L=length($T);$L or return'';my$c;my$N=0;my$P
=0;my$U=1;for(my$i=0;$i<$L;$i++){$c=vec($T,$i,8);if($c<33){
$U=0;if($N++==1){vec($T,$P++,8)=32;}}else{$N=0;$U or vec($T
,$P,8)=$c;$P++;}}return$U?$T:substr($T,0,$P);}
[download]

^^ This looks a bit obfuscated, so here is a nicer expanded version:


##############################################################
#
# This function removes single instances of whitespace and
# converts multiple adjacent whitespace characters to a single
# space. In this function, "whitespace" is defined as a character
# whose ASCII value is less than 33. (This includes many special
# characters such as new line characters, nul, bel, etc.)
#
#   Usage: STRING = CollapseWhitespace(STRING)
#
#   Example:
#     CollapseWhitespace("\n\t abc 123   xxx\n") --> " abc123 xxx"
#
sub CollapseWhitespace
{
  @_ or return '';
  my $T = shift;
  defined $T or return '';
  my $L = length($T);
  $L or return ''; 

  my $c;
  my $N = 0;    # consecutive whitespace counter
  my $P = 0;    # target pointer to overwrite original str $T
  my $U = 1;    # string length will be left unchanged
  for (my $i = 0; $i < $L; $i++)
  {
    $c = vec($T, $i, 8);
    if ($c < 33)
    {
      $U = 0;
      if ($N++ == 1) { vec($T, $P++, 8) = 32; }
    }
    else
    {
      $N = 0;
      $U or vec($T, $P, 8) = $c;
      $P++;
    }
  }
  return $U ? $T : substr($T, 0, $P);
}
[download]

Comment on Re: Removing extra spaces Select or Download Code

Replies are listed 'Best First'.
Re^2: Removing extra spaces by AnomalousMonk (Archbishop) on Aug 25, 2019 at 09:32 UTC
A more concise alternative is: c:\@Work\Perl\monks>perl -wMstrict -le "use warnings; use strict; ;; use Test::More 'no_plan'; use Test::NoWarnings; ;; use Data::Dump qw(pp); ;; note qq{perl version: $]}; ;; my @TESTS = ( [ undef , qq{} ], [ qq{} , qq{} ], [ qq{ } , qq{} ], [ qq{\n} , qq{} ], [ qq{\n\t} , qq{ } ], [ qq{\n\t\x00} , qq{ } ], [ qq{\n\t \x00} , qq{ } ], [ qq{\n\t abc 123 xxx\n} , qq{ abc123 xxx} ], [ qq{\nabc 123\a\b\fxxx\n\t }, qq{abc123 xxx } ], [ qq{abc 123\n\r xxx} , qq{abc123 xxx} ], ); ;; note 'special case'; is CollapseWhitespace(), '', 'no arguments'; ;; note 'general cases'; VECTOR: for my $ar_vector (@TESTS) { if (not ref $ar_vector) { note $ar_vector; next VECTOR; } ;; my ($str, $expected) = @$ar_vector; ;; is CollapseWhitespace($str), $expected, pp($str) . ' -> ' . pp($expected) ; } ;; done_testing; ;; exit; ;; sub CollapseWhitespace { my $s = shift; return '' unless defined $s; $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsge; return $s; } " # perl version: 5.008009 # special case ok 1 - no arguments # general cases ok 2 - undef -> "" ok 3 - "" -> "" ok 4 - " " -> "" ok 5 - "\n" -> "" ok 6 - "\n\t" -> " " ok 7 - "\n\t\0" -> " " ok 8 - "\n\t \0" -> " " ok 9 - "\n\t abc 123 xxx\n" -> " abc123 xxx" ok 10 - "\nabc 123\a\b\fxxx\n\t " -> "abc123 xxx " ok 11 - "abc 123\n\r xxx" -> "abc123 xxx" 1..11 ok 12 - no warnings 1..12 [download] If you have Perl version 5.14+, a slightly conciserer variation is: `sub CollapseWhitespace { my $s = shift; return defined $s ? $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsger : '' ; }` [download] See the `s///` `/r` modifier in perlop. I leave it to you to Benchmark whether the `s///e` version is actually faster than the `for`-loop version. Give a man a fish: `<%-{-{-{-<`	[reply] [d/l] [select]

Replies are listed 'Best First'.

Re^2: Removing extra spaces
by AnomalousMonk (Archbishop) on Aug 25, 2019 at 09:32 UTC

A more concise alternative is:

c:\@Work\Perl\monks>perl -wMstrict -le
"use warnings;
 use strict;
 ;;
 use Test::More 'no_plan';
 use Test::NoWarnings;
 ;;
 use Data::Dump qw(pp);
 ;;
 note qq{perl version: $]};
 ;;
 my @TESTS = (
   [ undef                      , qq{}            ],
   [ qq{}                       , qq{}            ],
   [ qq{ }                      , qq{}            ],
   [ qq{\n}                     , qq{}            ],
   [ qq{\n\t}                   , qq{ }           ],
   [ qq{\n\t\x00}               , qq{ }           ],
   [ qq{\n\t \x00}              , qq{ }           ],
   [ qq{\n\t abc 123   xxx\n}   , qq{ abc123 xxx} ],
   [ qq{\nabc 123\a\b\fxxx\n\t }, qq{abc123 xxx } ],
   [ qq{abc 123\n\r xxx}        , qq{abc123 xxx}  ],
   );
 ;;
 note 'special case';
 is CollapseWhitespace(), '', 'no arguments';
 ;;
 note 'general cases';
 VECTOR:
 for my $ar_vector (@TESTS) {
   if (not ref $ar_vector) {
     note $ar_vector;
     next VECTOR;
     }
   ;;
   my ($str, $expected) = @$ar_vector;
   ;;
   is CollapseWhitespace($str), $expected,
      pp($str) . ' -> ' . pp($expected)
      ;
   }
 ;;
 done_testing;
 ;;
 exit;
 ;;
 sub CollapseWhitespace {
   my $s = shift;
   return '' unless defined $s;
   $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsge;
   return $s;
   }
"
# perl version: 5.008009
# special case
ok 1 - no arguments
# general cases
ok 2 - undef -> ""
ok 3 - "" -> ""
ok 4 - " " -> ""
ok 5 - "\n" -> ""
ok 6 - "\n\t" -> " "
ok 7 - "\n\t\0" -> " "
ok 8 - "\n\t \0" -> " "
ok 9 - "\n\t abc 123   xxx\n" -> " abc123 xxx"
ok 10 - "\nabc 123\a\b\fxxx\n\t " -> "abc123 xxx "
ok 11 - "abc 123\n\r xxx" -> "abc123 xxx"
1..11
ok 12 - no warnings
1..12
[download]

sub CollapseWhitespace {
  my $s = shift;
  return defined $s
    ? $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsger
    : ''
    ;
  }
[download]

s///

/r

perlop

Benchmark

s///e

for

Give a man a fish: <%-{-{-{-<

[reply]
[d/l]
[select]