#!/usr/bin/perl
use 5.018;
use strict;
use warnings;

#1149716

=pod

I would like to extract a piece of data from one field that has multiple
fields in it. The original field is a long description that usually
contains a #F123456, #123456, #123-F123456, #123-123456, or #12AB-123456
in it. This data floats around from left to right and there should be
whitespace before the #. Also, the end of the data is either whitespace,
or the end of the field.

=cut

# Sample description fields: most carry a "#..." part number somewhere in
# the text, two deliberately do not.
my @data = (
    "TRAY HINGED PLSTC 20 CAV #F32473",
    "BOX HSC,35-3/4X17-1/4 X 50-1/2 SIMULATOR TALL BOX",
    "PAD, FOAM, 24 X 24 X 1/4 #16193 112 SHEETS PER ROLL, ORDER IN FULL ROLLS",
    "PKG LIST,ASST ARM,RAD,300 #F37784",
    "PAD, TOP CAP RE17-30048 #F30121 CORRUGATED ASSEMBLY, 22-7/8 X 21-1/8 X 4-3/4",
    "foo bar #379460 best F11",
    "F1234 SIMULATION",
);

# A field matches when it contains a space, a '#', optional capital letters
# and at least one digit. This is the pattern explained by
# YAPE::Regex::Explain below, minus the capture group that was never used.
my $tagged_field_re = qr/^.* #[A-Z]*\d+.*$/m;

for my $data (@data) {
    # say "\t|$data|\n\n";

    # Strip every newline (this also covers what chomp did) into a copy, so
    # the elements of @data are not modified through the loop alias.
    my $field = $data =~ s/\n//gr;

    if ( $field =~ $tagged_field_re ) {
        say "\n\$data matches regex\n";
        $field =~ s/ +/ /g;    # clean up excess spaces
        say "$field \n";
    }
    else {
        say "\n\t The data, $field, does NOT MATCH\n";
    }
}

# Everything after this marker is commentary and sample output, not code.
__END__

####

C:perl -MYAPE::Regex::Explain -e " print YAPE::Regex::Explain->new(qr/(^.* #[A-Z]*\d+.*$)/)->explain();"

The regular expression:

(?-imsx:(^.* #[A-Z]*\d+.*$))

matches as follows:

NODE                     EXPLANATION
----------------------------------------------------------------------
(?-imsx:                 group, but do not capture (case-sensitive)
                         (with ^ and $ matching normally) (with . not
                         matching \n) (matching whitespace and #
                         normally):
----------------------------------------------------------------------
  (                        group and capture to \1:
                           # NB: I did NOT need the parens as there's no use of the capture
                           #     My bad, but harmless except for shoving bits & bytes around
                           #     when they didn't need to be disturbed.
----------------------------------------------------------------------
    ^                        the beginning of the string
----------------------------------------------------------------------
    .*                       any character except \n (0 or more times
                             (matching the most amount possible))
----------------------------------------------------------------------
     #                       ' #'
----------------------------------------------------------------------
    [A-Z]*                   any character of: 'A' to 'Z' (0 or more
                             times (matching the most amount
                             possible))
----------------------------------------------------------------------
    \d+                      digits (0-9) (1 or more times (matching
                             the most amount possible))
----------------------------------------------------------------------
    .*                       any character except \n (0 or more times
                             (matching the most amount possible))
----------------------------------------------------------------------
    $                        before an optional \n, and the end of
                             the string
----------------------------------------------------------------------
  )                        end of \1
----------------------------------------------------------------------
)                        end of grouping
----------------------------------------------------------------------

And the output is thus:

C:1149716.pl

$data matches regex

TRAY HINGED PLSTC 20 CAV #F32473


	 The data, BOX HSC,35-3/4X17-1/4 X 50-1/2 SIMULATOR TALL BOX, does NOT MATCH


$data matches regex

PAD, FOAM, 24 X 24 X 1/4 #16193 112 SHEETS PER ROLL, ORDER IN FULL ROLLS


$data matches regex

PKG LIST,ASST ARM,RAD,300 #F37784


$data matches regex

PAD, TOP CAP RE17-30048 #F30121 CORRUGATED ASSEMBLY, 22-7/8 X 21-1/8 X 4-3/4


$data matches regex

foo bar #379460 best F11


	 The data, F1234 SIMULATION, does NOT MATCH

##
##

$data matches regex

TRAY HINGED PLSTC 20 CAV #F32473


	 The data, BOX HSC,35-3/4X17-1/4 X 50-1/2 SIMULATOR TALL BOX, does NOT MATCH


$data matches regex

PAD, FOAM, 24 X 24 X 1/4 #16193 112 SHEETS PER ROLL, ORDER IN FULL ROLLS


$data matches regex

PKG LIST,ASST ARM,RAD,300 #F37784


$data matches regex

PAD, TOP CAP RE17-30048 #F30121 CORRUGATED ASSEMBLY, 22-7/8 X 21-1/8 X 4-3/4


$data matches regex

foo bar #379460 best F11


	 The data, F1234 SIMULATION, does NOT MATCH