#! perl -slw use strict; my @lines = ; my $mask = chr(32) x length $lines[ 0 ]; $mask |= $_ for @lines; my $templ = ''; $templ .= 'a' . length( $1 ) . 'x' . length( $2 ) . ' ' while $mask =~ m[([^ ]+)( +|$)]g; print $templ; ## Count the false fields and determin the left/right alignments: my( @left, @right, @blanks, $nFields ); for ( @lines ) { my @fields = unpack $templ, $_; $nFields = $#fields; for my $field ( 0 .. $#fields ) { $right[ $field ] = 1 if substr( $fields[ $field ], 0, 1 ) eq ' '; $left[ $field ] = 1 if substr( $fields[ $field ], -1 ) eq ' '; $blanks[ $field ]++ unless $fields[ $field ] =~ m[\S]; } } my $reField = qr[a\d+x\d+\s]; ## Simplifies template adjustment regex. ## adjusted template fields backwards ## to ensure that we don't screw up the indexing. for my $field ( reverse 0 .. $nFields ) { ## Skip unless a false field next unless defined $blanks[ $field ] and $blanks[ $field ] > ( @lines / 2 ); my $keep; ## Number of template fields to keep ## If the preceding field is left aligned ## and the following is not right aligned if( $field and $left[ $field -1 ] and $field < $nFields and not $right[ $field + 1 ] ) { warn "Amalgamating field $field with the prevous field\n"; $keep = $field - 1; } ## If the preceding field is right aligned ## and the following is not left aligned elsif( $field < $nFields and $right[ $field + 1 ] and $field and not $left[ $field - 1 ] ) { warn "Amalgamating field $field with the next field\n"; $keep = $field; } ## If preceding is left and following is right aligned ## bellyache and do nothing. else { warn "Field $field is probably a false field, but it is not possible\n" . "to determine which adjacent field to amalgamate it with?"; next; ## Update. } ## Amalgamate the appropriate template fields $templ =~ s[ ( ${reField}{$keep} ) a (\d+) x (\d+) \s a (\d+) x (\d+) \s ][ $1 . 'a' . ($2 + $3 + $4) . 'x' . $5 . ' ' ]xe or warn 'No match'; ## Belache if the regex fails. } print $templ; { local $\; ## Split the records and output delimited by '|' print join '|', unpack $templ, $_ for @lines; } __DATA__ The First One Here Is Longer. Collie SN 262287630 77312 93871 MVP abc de gh A Second (PART) here First In 20 MT 169287655 506666 61066 RTD abc fgh 3rd Person "Something" X&Y No SH 564287705 45423 52443 RTE abc gh The Fourth Person 20 MLP 4000 360505504 3530 72201 VRE abc gh The Fifth Name OR Something Twin 200 SH 469505179 3530 72201 VRE abc fgh The Sixth Person OR Item MLP 260505174 3,530 72,201 VRE abc fgh 70 The Seventh Record MLP 764205122 3530 72201 VRE abc gh The Eighth Person MLP MLP 160545154 3530 7220 VRE abc gh