# Small snippet to explore dealing with complex data structures,
# duplicates/uniques and consolidation.
#
# Each input record has the form
#     First,Last,<pet types>,<pet colors>
# where <pet types> and <pet colors> are either a single value or a
# parenthesised, comma-separated group such as (cat,dog).  The parentheses
# only signify that several values are present in that field.
#
# Records for the same person are merged, duplicate types/colors are dropped,
# and one line per person is printed in the form
#     "First":"Last":"type1","type2":"color1","color2"

use strict;
use warnings;

my %pets;
my @info;

$info[0] = "Mary,Owens,cat,white";
$info[1] = "Bill,Thompson,(cat,dog),(white,black)";
$info[2] = "Bill,Thompson,(hamster,cat),(black,brown)";
$info[3] = "Bill,Smith,(goldfish,dog,turtle),(yellow,spotted,green)";

# Sub-hash keys that keep pet types and pet colors apart under each person.
# They must be two distinct strings, otherwise types and colors end up in the
# same sub-hash.
my $petTypesKey  = 'PET_TYPES';
my $petColorsKey = 'PET_COLORS';

# Loop through each record, split out the data elements, and store them in
# the hash of hashes: $pets{first}{last}{category}{value} = 1.
foreach my $info_line (@info) {
    next if !defined $info_line || $info_line eq '';

    # Peel off first and last name.
    # WARNING: this assumes the names are never wrapped in parentheses.
    my ($firstName, $lastName, $afterName) = split /,/, $info_line, 3;
    $lastName = '' if !defined $lastName;

    my ($petTypes,  $afterPets)   = parseNextElement($afterName);
    my ($petColors, $afterColors) = parseNextElement($afterPets);

    # Using hash keys quietly removes duplicate entries.
    # WARNING: this is case-sensitive.  Force the keys to one case to make it
    # case-insensitive (and store the first mixed-case spelling as the value
    # instead of 1 if the original spelling must be kept).
    foreach my $petType (_splitGroup($petTypes)) {
        $pets{$firstName}{$lastName}{$petTypesKey}{$petType} = 1;
    }
    foreach my $petColor (_splitGroup($petColors)) {
        $pets{$firstName}{$lastName}{$petColorsKey}{$petColor} = 1;
    }
}

# The %pets hash is loaded; report on it.
# Keys are sorted at every level because hash order is unspecified and would
# otherwise make the output differ from run to run.
foreach my $firstName (sort keys %pets) {
    my $encapsulatedFirstName = "\"$firstName\"";

    # The last name is a sub-hash key (Hash of Hashes).
    foreach my $lastName (sort keys %{ $pets{$firstName} }) {
        my $encapsulatedLastName = "\"$lastName\"";
        my $person               = $pets{$firstName}{$lastName};

        # A person may have no entry for a category (e.g. a record with
        # names only), hence the empty-hash fallback.
        my @petTypes =
            map { "\"$_\"" } sort keys %{ $person->{$petTypesKey} || {} };
        my @petColors =
            map { "\"$_\"" } sort keys %{ $person->{$petColorsKey} || {} };

        my $finalPetTypes  = join ',', @petTypes;
        my $finalPetColors = join ',', @petColors;

        my $output_line = join ":",
            ($encapsulatedFirstName, $encapsulatedLastName,
             $finalPetTypes, $finalPetColors);
        print "$output_line\n";
    }
}

####

# _splitGroup($element)
#
# Turn one field, either "value" or "(v1,v2,...)", into the list of its
# values.  If the field contains a parenthesised group, only the text inside
# the first pair of parentheses is used.  An undefined or empty field, or an
# empty group "()", yields an empty list.  Intended to be called in list
# context.
sub _splitGroup {
    my ($element) = @_;
    return () if !defined $element;

    if ($element =~ /\(([^)]*)\)/) {
        $element = $1;
    }
    return split /,/, $element;
}

# parseNextElement($input_line)
#
# Peel the first field off a comma-separated line.  The field is either a
# solitary value or a parenthesised group, which may itself contain commas.
#
# Returns the two-element list ($next_element, $remaining_line):
#   $next_element   - the first field; a group keeps its parentheses
#   $remaining_line - everything after the field and its separating comma
# Both values are always defined; an undefined or exhausted input gives ''.
# Arguments after the first are accepted and ignored.
#
# WARNING: assumes well-formed input (balanced, non-nested parentheses).
sub parseNextElement {
    my ($input_line) = @_;
    $input_line = '' if !defined $input_line;

    my ($next_element, $remaining_line);

    if ($input_line =~ /^\s*\(/) {
        # Parenthesised group: cut at the first closing parenthesis.
        ($next_element, $remaining_line) = split /\)/, $input_line, 2;

        # The split consumed the closing parenthesis, so put it back.
        $next_element .= ')';

        # The split leaves the separating comma on the remainder; drop it.
        $remaining_line = '' if !defined $remaining_line;
        $remaining_line =~ s/^\s*,//;
    }
    else {
        # Solitary value: cut at the first comma.
        ($next_element, $remaining_line) = split /,/, $input_line, 2;
    }

    # split returns nothing for an empty string and a single field when no
    # separator is present, so normalise both results to defined strings.
    $next_element   = '' if !defined $next_element;
    $remaining_line = '' if !defined $remaining_line;

    return ($next_element, $remaining_line);
}