#!/usr/bin/perl -w
#
# Create clusters of meeting data in a semi-unsupervised fashion.
#
# Usage: <script> INDIR OUTFILE
#
# parameters:
#   1) input directory  - every plain file in it is read as a data file
#   2) output data      - file that receives the extracted spurts
#
# For each input file the first line is skipped (it is a header, not
# data).  Every following line is split on whitespace; the fields used
# are (0-based) 2 = word, 3 = position of the word in its spurt,
# 4 = spurt length, 9 = primary-speaker flag, 11 = number of
# interrupting speakers.
# NOTE(review): the field meanings above are taken from the variable
# names only -- confirm against the data format description.
####################################################################
use strict;
use warnings;

my $indir   = shift @ARGV or die "No indir supplied";
my $outfile = shift @ARGV or die "No outfile supplied";

die "$indir not a directory!" unless -d $indir;

# The output file is opened before the chdir below, so a relative
# OUTFILE is resolved against the directory the script was started in.
open(my $output, '>', $outfile) or die "Could not open $outfile: $!";

# chdir into the input directory so the bare names returned by readdir
# can be opened directly, without prefixing them with the directory.
chdir($indir) or die "Could not chdir $indir: $!";
opendir(my $dir, '.') or die "directory open of $indir failed: $!";

# Drop exactly "." and "..".  readdir is not guaranteed to return them
# first, so filter by name.  The dots are escaped: an unescaped /^..?\z/
# would also discard every one- or two-character file name.
my @files = grep { !/^\.\.?\z/ } readdir($dir);
closedir($dir);

foreach my $infile (@files) {
    next if -d $infile;    # only plain files are processed

    open(my $input, '<', $infile)
        or die "error on file opening $infile: $!";

    my $within_spurt       = 0;    # true while collecting an interrupting spurt
    my $previous_interrupt = 0;    # interrupting-speaker count after the previous line
    my @current_spurt;             # output fragments collected for this file

    my $header_line = <$input>;    # skip first line which isn't data

    # Inner loop to create the initial clustering.
    while (my $line = <$input>) {
        my (undef, undef, $current_word, $word_in_spurt, $spurt_length,
            undef, undef, undef, undef, $primary_speaker,
            undef, $interrupting_speakers) = split(' ', $line);

        # Blank or short lines do not carry all twelve fields; skip them
        # rather than comparing undefined values numerically.
        next unless defined $interrupting_speakers;

        # If the number of speakers increases while the primary-speaker
        # flag is 0, the spurt that starts here is an interruption, not
        # the primary speaker.
        if (   $primary_speaker == 0
            && $interrupting_speakers > $previous_interrupt
            && $word_in_spurt == 1)
        {
            # Start marker.  NOTE(review): "<s>" is restored here to pair
            # with the "</s>" end marker; the source had lost it -- confirm.
            push @current_spurt, "<s> $current_word ";
            $within_spurt = 1;

            if ($word_in_spurt == $spurt_length) {    # only word in spurt
                push @current_spurt, "</s>\n";
                $within_spurt          = 0;
                $interrupting_speakers = 0;    # the interrupt is over
            }
        }
        # We are in a spurt and this is not its last word.
        elsif ($within_spurt && $word_in_spurt != $spurt_length) {
            push @current_spurt, "$current_word ";    # add current word
        }
        # This is the last word of a spurt.
        elsif ($word_in_spurt == $spurt_length && $within_spurt == 1) {
            push @current_spurt, "$current_word </s>\n";    # end marker
            $within_spurt          = 0;
            $interrupting_speakers = 0;    # the interrupt is over
        }

        # At the end of any spurt reduce the interrupt count, but never
        # let it drop below 0.
        if ($word_in_spurt == $spurt_length) {
            $interrupting_speakers--;
            $interrupting_speakers = 0 if $interrupting_speakers < 0;
        }

        $previous_interrupt = $interrupting_speakers;
    }
    close($input);

    # Emit everything collected for this file if any collected fragment
    # contains the word "so".
    # NOTE(review): this prints ALL spurts of the file, not only those
    # containing "so" -- confirm that is the intended selection.
    if (grep { /\bso\b/ } @current_spurt) {
        print STDOUT @current_spurt;
        print {$output} @current_spurt;
    }
}

# Buffered write errors only surface at close, so check it.
close($output) or die "Could not close $outfile: $!";