#!/usr/bin/perl -w
#
# Create clusters of meeting data in a semi-unsupervised fashion
#
# parameters:
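# 1) input directory (every non-directory entry in it is processed)
# 2) output file (opened for writing, so existing contents are replaced)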
####################################################################
use strict;
# ------------------------------------------------------------------
# Main script body.
#
# Usage: script <input directory> <output file>
#
# For every non-directory entry in the input directory, the first line
# is skipped and every following line is split on whitespace.  Only
# these fields (1-based) are used:
#    3  current word
#    4  position of the word within its spurt
#    5  length of the spurt
#   10  primary-speaker flag
#   12  number of interrupting speakers
# Words that belong to an interrupting spurt are collected, and if any
# collected element contains the word "so", everything collected for
# that file is printed to STDOUT and to the output file.
# ------------------------------------------------------------------
my $indir = shift @ARGV;
die "No indir supplied" unless defined $indir && length $indir;
my $outfile = shift @ARGV;
die "No outfile supplied" unless defined $outfile && length $outfile;
die "$indir not a directory!" unless -d $indir;

# The output file is opened BEFORE the chdir below so that a relative
# path is resolved against the directory the script was started from.
open(my $output, '>', $outfile) or die "Could not open $outfile: $!";

# chdir into the input directory so the bare names returned by readdir
# can be opened directly, without prepending the directory.
chdir($indir) or die "Could not chdir $indir: $!";
opendir(my $dir_handle, '.') or die "directory open of $indir failed: $!";

# Drop only the "." and ".." entries.  The dots must be escaped: an
# unescaped /^..?\z/ would also discard every one- or two-character
# filename.  The entries are filtered by name rather than by position
# because readdir's ordering is not relied upon.
my @files = grep { !/\A\.\.?\z/ } readdir($dir_handle);
closedir($dir_handle);

foreach my $infile (@files) {
    next if -d $infile;    # only plain entries are processed

    # Three-argument open: the name comes straight from the directory
    # listing, so it must never be interpreted as a mode or a pipe.
    open(my $input, '<', $infile)
        or die "error on file opening $infile: $!";

    my $within_spurt       = 0;    # true while collecting an interrupting spurt
    my $previous_interrupt = 0;    # interrupt count carried over from the last line
    my @current_spurt;             # everything collected for this file

    scalar(<$input>);              # skip first line which isn't data

    while (my $line = <$input>) {  # inner loop to create initial clustering
        my (undef, undef,
            $current_word,
            $word_in_spurt,
            $spurt_length,
            undef, undef, undef, undef,
            $primary_speaker,
            undef,
            $interrupting_speakers) = split(' ', $line);

        # Blank or truncated lines do not carry all twelve fields; skip
        # them instead of comparing undefined values.
        next unless defined $interrupting_speakers;

        # The number of speakers increased, the primary-speaker flag is 0
        # and this is the first word of a spurt: the spurt is an
        # interruption, not speech by the primary speaker.
        if (   $primary_speaker == 0
            && $interrupting_speakers > $previous_interrupt
            && $word_in_spurt == 1)
        {
            # NOTE(review): the "<s>" start marker is restored here to
            # pair with the "</s>" end marker; the original line carried
            # only a "start marker" comment and a leading blank -- confirm
            # against the expected output format.
            push @current_spurt, "<s> $current_word ";
            $within_spurt = 1;
            if ($word_in_spurt == $spurt_length) {    # only word in spurt
                push @current_spurt, "</s>\n";
                $within_spurt          = 0;
                $interrupting_speakers = 0;           # the interrupt is over
            }
        }
        # Inside a spurt, and not yet at its last word.
        elsif ($within_spurt && $word_in_spurt != $spurt_length) {
            push @current_spurt, "$current_word ";
        }
        # Last word of the spurt being collected.
        elsif ($word_in_spurt == $spurt_length && $within_spurt == 1) {
            push @current_spurt, "$current_word </s>\n";    # end marker
            $within_spurt          = 0;
            $interrupting_speakers = 0;                     # the interrupt is over
        }

        # At the end of any spurt one interrupter is finished; the count
        # must never drop below zero.
        if ($word_in_spurt == $spurt_length) {
            $interrupting_speakers--;
            $interrupting_speakers = 0 if $interrupting_speakers < 0;
        }
        $previous_interrupt = $interrupting_speakers;
    }
    close($input);

    # Emit the file's collected spurts only when the word "so" occurs in
    # at least one collected element.  Note that a match prints ALL
    # spurts gathered from this file, not just the matching one.
    if (grep { /\bso\b/ } @current_spurt) {
        print STDOUT @current_spurt;
        print {$output} @current_spurt;
    }
}

# Buffered write errors only surface when the handle is closed.
close($output) or die "Could not close $outfile: $!";