use strict;
use warnings;

# Tally, per speaker, how many spoken words are "goodwords" (backchannels and
# short acknowledgements such as "mhm" or "right") versus ordinary words.
#
# Input is read from the DATA section at the end of this file, one record
# per line:
#   * header line: "S1: ...; Gender: Male; ..." declares a speaker
#   * turn line:   "S1: text [S2: mhm ] more text" carries speech; a
#                  bracketed "[Sx: ...]" span is an interjection by Sx
# Any <...> markup left in a line is stripped before it is interpreted.
#
# Output is one summary line per declared speaker, sorted by speaker id.

# Lower-case lookup table of goodwords. A few entries are multi-word
# phrases ("i see", "i think so"); count_words() matches those as phrases.
my %goodwords = map { $_ => 1 } (
    "mhm", "right", "well", "yeah", "sure", "good", "ah", "okay", "yep",
    "hm", "definitely", "alright", "'m'm", "oh", "my", "god", "wow", "uhuh",
    "exactly", "yup", "mkay", "i see", "ooh", "cool", "uh", "fine", "true",
    "hm'm", "hmm", "yes", "absolutely", "great", "um", "so", "mm", "weird",
    "ye-", "i mean", "i know", "i think so", "huh", "yay", "maybe", "eh",
    "obviously", "correct", "awesome", "really", "interesting",
);

# Longest goodword phrase, in words; bounds the look-ahead in count_words().
my $max_phrase_len = 1;
for my $phrase (keys %goodwords) {
    my @parts = split ' ', $phrase;
    $max_phrase_len = @parts if @parts > $max_phrase_len;
}

# speaker id => { gender => 'Male'|'Female', matched => N, unmatched => N }
my %speaker_record;

while (my $line = <DATA>) {
    chomp $line;
    $line =~ s/<[^>]*>//g;    # drop markup before looking for speaker labels
    next if $line !~ /\S/;    # skip blank lines

    # Header line: registers the speaker and (re)starts the counters.
    if ($line =~ /\b(S[\w\-]+):.*\bGender:\s+(Male|Female)\b/i) {
        $speaker_record{$1} = { gender => $2, matched => 0, unmatched => 0 };
        next;
    }

    # Turn line: credit each piece of text to the speaker who said it.
    for my $chunk (split_by_speaker($line)) {
        my ($speaker, $text) = @$chunk;

        # Only speakers declared by a header line are counted.
        next unless exists $speaker_record{$speaker};

        my ($matched, $unmatched) = count_words($text);
        $speaker_record{$speaker}{matched}   += $matched;
        $speaker_record{$speaker}{unmatched} += $unmatched;
    }
}

# Sorted so that repeated runs produce identical output.
for my $speaker (sort keys %speaker_record) {
    my $record = $speaker_record{$speaker};
    print "Speaker: $speaker, Gender: $record->{gender}, ";
    print "Matched words: $record->{matched}, ";
    print "Unmatched words: $record->{unmatched}\n";
}

# split_by_speaker($line)
#   Break one markup-free transcript line into per-speaker pieces.
#   Returns a list of [ $speaker_id, $text ] pairs: the turn owner's text
#   first (with interjections removed), then each "[Sx: ...]" interjection
#   in order of appearance. A line that does not start with a speaker
#   label contributes no turn-owner pair.
sub split_by_speaker {
    my ($line) = @_;
    my @chunks;

    # Lift out bracketed interjections, leaving a space so that the words
    # on either side of the bracket do not run together.
    $line =~ s{\[\s*(S[\w\-]+):\s*([^\]]*)\]}{ push @chunks, [ $1, $2 ]; ' ' }ge;

    # Any brackets still present are not speaker interjections; drop the
    # bracket characters but keep the enclosed text.
    $line =~ tr/[]//d;

    if ($line =~ /\A\s*(S[\w\-]+):\s*(.*)\z/s) {
        unshift @chunks, [ $1, $2 ];
    }

    return @chunks;
}

# count_words($text)
#   Returns ($matched, $unmatched): how many words of $text belong to a
#   goodword (or goodword phrase) and how many do not. Words are compared
#   in lower case with surrounding punctuation removed; apostrophes and
#   hyphens are kept because goodwords such as "hm'm" and "ye-" use them.
sub count_words {
    my ($text) = @_;

    my @words;
    for my $token (split ' ', lc $text) {
        $token =~ s/\A[^\w'\-]+//;
        $token =~ s/[^\w'\-]+\z//;
        push @words, $token if $token =~ /\w/;    # ignore pure punctuation
    }

    my ($matched, $unmatched) = (0, 0);
    my $pos = 0;
    while ($pos < @words) {
        # Prefer the longest goodword phrase starting at this position.
        my $span = 0;
        for my $len (reverse 1 .. $max_phrase_len) {
            next if $pos + $len > @words;
            my $candidate = join ' ', @words[ $pos .. $pos + $len - 1 ];
            if (exists $goodwords{$candidate}) {
                $span = $len;
                last;
            }
        }

        if ($span) {
            $matched += $span;    # every word of the phrase counts
            $pos     += $span;
        }
        else {
            $unmatched++;
            $pos++;
        }
    }

    return ($matched, $unmatched);
}

__DATA__
S1: Native-Speaker Status: Native speaker, American English; Academic Role: Senior Undergraduate; Gender: Male; Age: 17-23; Restriction: None
S2: Native-Speaker Status: Native speaker, American English; Academic Role: Researcher; Gender: Male; Age: 31-50; Restriction: Cite
S3: Native-Speaker Status: Native speaker, American English; Academic Role: Junior Undergraduate; Gender: Female; Age: 17-23; Restriction: None
S4: Native-Speaker Status: Native speaker, American English; Academic Role: Senior Undergraduate; Gender: Female; Age: 17-23; Restriction: None
S5: Native-Speaker Status: Native speaker, American English; Academic Role: Junior Undergraduate; Gender: Female; Age: 17-23; Restriction: None
SS: Native-Speaker Status: Native speaker, American English; Academic Role: Unknown; Gender: Male; Age: Unknown; Restriction: None

S1: it was presented to them by Chuck D and Public Enemy. [S2: mhm ] and the rest of th- Public Enemy and you know and and Chuck D's f- publicly gets up and says you know they were with us from the beginning and, [S2: mhm ] all that now wheth- whether or not you know that he was reading a TelePrompTer, [S2: mhm ] or or not i i think is uh

S2: or if he was trying to make nice because of the fact that Public Enemy hasn't sold records lately, [S1: right ] and he doesn't wanna look like some kinda old sourpuss