in reply to Removing extra spaces

I know, this question was asked more than 7 years ago, but I would
like to post a sub that I wrote that does exactly what you want:

sub CollapseWhitespace{@_ or return'';my$T=shift;defined$T or return'';my$L=length($T);$L or return'';my$c;my$N=0;my$P =0;my$U=1;for(my$i=0;$i<$L;$i++){$c=vec($T,$i,8);if($c<33){ $U=0;if($N++==1){vec($T,$P++,8)=32;}}else{$N=0;$U or vec($T ,$P,8)=$c;$P++;}}return$U?$T:substr($T,0,$P);}

^^ This looks a bit obfuscated, so here is a nicer expanded version:

############################################################## # # This function removes single instances of whitespace and # converts multiple adjacent whitespace characters to a single # space. In this function, "whitespace" is defined as a character # whose ASCII value is less than 33. (This includes many special # characters such as new line characters, nul, bel, etc.) # # Usage: STRING = CollapseWhitespace(STRING) # # Example: # CollapseWhitespace("\n\t abc 123 xxx\n") --> " abc123 xxx" # sub CollapseWhitespace { @_ or return ''; my $T = shift; defined $T or return ''; my $L = length($T); $L or return ''; my $c; my $N = 0; # consecutive whitespace counter my $P = 0; # target pointer to overwrite original str $T my $U = 1; # string length will be left unchanged for (my $i = 0; $i < $L; $i++) { $c = vec($T, $i, 8); if ($c < 33) { $U = 0; if ($N++ == 1) { vec($T, $P++, 8) = 32; } } else { $N = 0; $U or vec($T, $P, 8) = $c; $P++; } } return $U ? $T : substr($T, 0, $P); }

Replies are listed 'Best First'.
Re^2: Removing extra spaces
by AnomalousMonk (Archbishop) on Aug 25, 2019 at 09:32 UTC

    A more concise alternative is:

    c:\@Work\Perl\monks>perl -wMstrict -le "use warnings; use strict; ;; use Test::More 'no_plan'; use Test::NoWarnings; ;; use Data::Dump qw(pp); ;; note qq{perl version: $]}; ;; my @TESTS = ( [ undef , qq{} ], [ qq{} , qq{} ], [ qq{ } , qq{} ], [ qq{\n} , qq{} ], [ qq{\n\t} , qq{ } ], [ qq{\n\t\x00} , qq{ } ], [ qq{\n\t \x00} , qq{ } ], [ qq{\n\t abc 123 xxx\n} , qq{ abc123 xxx} ], [ qq{\nabc 123\a\b\fxxx\n\t }, qq{abc123 xxx } ], [ qq{abc 123\n\r xxx} , qq{abc123 xxx} ], ); ;; note 'special case'; is CollapseWhitespace(), '', 'no arguments'; ;; note 'general cases'; VECTOR: for my $ar_vector (@TESTS) { if (not ref $ar_vector) { note $ar_vector; next VECTOR; } ;; my ($str, $expected) = @$ar_vector; ;; is CollapseWhitespace($str), $expected, pp($str) . ' -> ' . pp($expected) ; } ;; done_testing; ;; exit; ;; sub CollapseWhitespace { my $s = shift; return '' unless defined $s; $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsge; return $s; } " # perl version: 5.008009 # special case ok 1 - no arguments # general cases ok 2 - undef -> "" ok 3 - "" -> "" ok 4 - " " -> "" ok 5 - "\n" -> "" ok 6 - "\n\t" -> " " ok 7 - "\n\t\0" -> " " ok 8 - "\n\t \0" -> " " ok 9 - "\n\t abc 123 xxx\n" -> " abc123 xxx" ok 10 - "\nabc 123\a\b\fxxx\n\t " -> "abc123 xxx " ok 11 - "abc 123\n\r xxx" -> "abc123 xxx" 1..11 ok 12 - no warnings 1..12
    If you have Perl version 5.14+, a slightly conciserer variation is:
    sub CollapseWhitespace { my $s = shift; return defined $s ? $s =~ s{ [\x00-\x20]+ }{ $+[0] - $-[0] == 1 ? '' : ' ' }xmsger : '' ; }
    See the  s///  /r modifier in perlop. I leave it to you to Benchmark whether the  s///e version is actually faster than the for-loop version.


    Give a man a fish:  <%-{-{-{-<