use strict;
use warnings;
use HTML::TreeBuilder;

# Lower-case words and short phrases that become the keys of %goodwords.
# NOTE(review): these look like backchannel/filler responses; how they are
# scored is not visible in this part of the file -- confirm with the analysis.
my @goodWordsList = (
    "mhm", "right", "well", "yeah", "sure", "good", "ah", "okay", "yep",
    "hm", "definitely", "alright", "'m'm", "oh", "my", "god", "wow", "uhuh",
    "exactly", "yup", "mkay", "i see", "ooh", "cool", "uh", "fine", "true",
    "hm'm", "hmm", "yes", "absolutely", "great", "um", "so", "mm", "weird",
    "ye-", "i mean", "i know", "i think so", "huh", "yay", "maybe", "eh",
    "obviously", "correct", "awesome", "really", "interesting",
);

# Lookup set: $goodwords{$word} is 1 for every entry of @goodWordsList.
my %goodwords = map { $_ => 1 } @goodWordsList;

# Build the element tree from the HTML read through the DATA filehandle.
my $root = HTML::TreeBuilder->new();
$root->parse_file(*DATA);

# Per-speaker attributes, keyed by the text of each <strong> element:
#   $speakers{$name}{info} - the node that follows the <strong> element
#   $speakers{$name}{$key} - one entry per "key: value" pair found in it
# NOTE(review): an attribute literally named "info" would overwrite the raw
# info entry -- confirm the input never contains one.
my %speakers;

# Parse out speaker attributes.
for my $strong ($root->look_down('_tag', 'strong')) {
    my $info = $strong->right();
    my $name = $strong->as_text();
    $speakers{$name}{info} = $info;

    # The right-hand sibling can be missing or be another element rather
    # than plain text; there is nothing to split in either case.
    next if !defined $info || ref $info;

    # Attributes are separated by ';'; whitespace around the separator and
    # at the end of the text is consumed by the split pattern.
    for my $param (split /\s*(?:;\s*|$)/, $info) {
        # An optional leading ':' is tolerated before "key: value".
        my ($key, $value) = $param =~ /^:?\s*([^:]*):\s*(.*)/;

        # Fragments without a ':' do not match; skip them instead of
        # storing a value under an undefined key.
        next if !defined $key;

        $speakers{$name}{$key} = $value;
    }
}

# Holder for the analysis results; nothing in this section fills it yet.
my %stats;

# Do the analysis.
for my $paragraph ($root->look_down('_tag', 'p')) {
    my $line = $paragraph->as_text();

    # Speaker name: the first run of word characters immediately followed
    # by ':' anywhere in the paragraph text; undefined when there is none.
    my ($name) = $line =~ /(\w+):/;

    # Perform analysis on paragraph here
}