#!/g/rcs/sw/bin/perl -w
#
# Create clusters of meeting data in a semi-unsupervised fashion
#
# parameters:
# 1) input directory holding the data files to read
# 2) output file (overwritten) that receives the extracted spurts
####################################################################
use strict;
# Main script.
#
# Reads every plain file in the input directory, skips each file's header
# line, and walks the remaining whitespace-separated records one word at a
# time.  Words belonging to an interrupting spurt are written to the output
# file (and echoed to STDOUT) wrapped as "<s> word word ... </s>\n".
#
# Columns used (0-based, after splitting on whitespace):
#    2  the word itself
#    3  position of the word within its spurt (compared against 1)
#    4  length of the spurt in words
#    9  primary-speaker flag (0 is taken to mean "not the primary speaker")
#   11  number of interrupting speakers
# NOTE(review): column meanings are taken from the variable names and the
# original comments; confirm against the data format description.
my $indir   = $ARGV[0];
my $outfile = $ARGV[1];
die "usage: $0 <input directory> <output file>\n"
    unless defined $indir && defined $outfile;

opendir(my $dir_handle, $indir) or die "directory open failed: $indir: $!";
# readdir() returns entries in no particular order, so '.' and '..' cannot be
# removed by position.  Keep plain files only and sort them so that the
# output is reproducible from run to run.
my @files = sort grep { -f "$indir/$_" } readdir($dir_handle);
closedir($dir_handle);

open(my $output, '>', $outfile)
    or die "cannot open output file $outfile: $!";

foreach my $file (@files) {    # process files until none are left
    my $infile = "$indir/$file";
    open(my $input, '<', $infile) or die "error on file open: $infile: $!";

    # Spurt state is tracked per file.
    my $within_spurt       = 0;
    my $previous_interrupt = 0;

    # The first line of each file is not data; read and discard it.
    my $header = <$input>;

    LINE: while (my $record = <$input>) {    # build the initial clustering
        my @line = split(' ', $record);

        # Blank or truncated records lack the columns read below.
        next LINE if @line < 12;

        my $current_word          = $line[2];
        my $word_in_spurt         = $line[3];
        my $spurt_length          = $line[4];
        my $primary_speaker       = $line[9];
        my $interrupting_speakers = $line[11];

        my $is_last_word = ($word_in_spurt == $spurt_length);
        my $spurt_text   = '';

        # The speaker count went up while the primary-speaker flag is 0 and
        # this is the first word of a spurt: an interrupting spurt begins.
        if (   $primary_speaker == 0
            && $interrupting_speakers > $previous_interrupt
            && $word_in_spurt == 1)
        {
            $spurt_text   = "<s> $current_word ";    # start marker
            $within_spurt = 1;
            if ($is_last_word) {                     # only word in spurt
                $spurt_text .= "</s>\n";
                $within_spurt          = 0;
                $interrupting_speakers = 0;          # the interrupt is over
            }
        }
        # Inside a spurt, but not yet at its last word.
        elsif ($within_spurt && !$is_last_word) {
            $spurt_text = "$current_word ";
        }
        # Last word of the spurt being collected.
        elsif ($within_spurt && $is_last_word) {
            $spurt_text            = "$current_word </s>\n";    # end marker
            $within_spurt          = 0;
            $interrupting_speakers = 0;              # the interrupt is over
        }

        # At the end of any spurt one interrupter drops out; never go
        # below zero.
        if ($is_last_word) {
            $interrupting_speakers--;
            $interrupting_speakers = 0 if $interrupting_speakers < 0;
        }
        $previous_interrupt = $interrupting_speakers;

        # Nothing was collected for this word.
        next LINE if $spurt_text eq '';

        # NOTE(review): fragments containing the word "so" are echoed to
        # STDOUT twice (this extra print plus the unconditional one below).
        # This looks like leftover debugging but is kept so console output
        # is unchanged; confirm whether it can be dropped.
        print STDOUT $spurt_text if $spurt_text =~ /\bso\b/;
        print STDOUT $spurt_text;
        print {$output} $spurt_text
            or die "write to $outfile failed: $!";
    }
    close($input);
}

# Buffered write errors only surface when the handle is closed.
close($output) or die "close of $outfile failed: $!";