Abanye abahlanu balimale kanzima .
Abanye abanazo izinkomo noma izimbuzi .
Abaziphathi ngendlela ekhombisa ukufuna ukudlala .
Abazithamundayo bathi uzimisele ngokuliphika icala .
Abazi ukuthi bakhuluma ngani .
abazonikwa amalungelo okudoba ezinga elincane .
Ahluleke ngisho ukukuphulula umhlane njengasemihleni .
Ahluleke ukuzibamba, azidedele inkaba .
Ahluleke ukuzibamba uMaMthembu .
####
icala
ukhalo
inkaba
inkaba
isisu
isisu
isibeletho
umhlane
iqolo
izinqe
umdidi
umphambili
amasende
inkomo
ubhontshisi
ingalo
ukuthi bakhuluma
####
Abanye abahlanu balimale kanzima .
Abanye abanazo izinkomo noma izimbuzi .
Abaziphathi ngendlela ekhombisa ukufuna ukudlala .
Abazithamundayo bathi uzimisele ngokuliphika icala .
Abazi ukuthi bakhuluma ngani .
abazonikwa amalungelo isisu \t ezinga elincane .
Ahluleke ngisho ukukuphulula umhlane njengasemihleni .
Ahluleke ukuzibamba, azidedele inkaba \t .
Ahluleke ukuzibamba uMaMthembu .
####
#!/usr/bin/env perl
use 5.016;
use warnings;
use autodie;
# Base name shared by every input and output file this script touches.
my $corpusname = 'GFSEBcorpus.zul_selected-sentences_original';

# Lookup table: case-folded tag text => annotation token.
my %words2ids;
{
    # Tagset file: one "text<TAB>token" entry per line.
    open my $fh, '<', "$corpusname.example.tagset.txt";
    while (my $line = <$fh>) {
        # Strip LF or CRLF so a stray "\r" never ends up inside a key or token.
        $line =~ s/\r?\n\z//;

        # Only the first two tab-separated fields are used.
        my ($text, $token) = split /\t/, $line;

        # A blank line (or one that starts with a tab) has no text to look
        # for; storing it would create an empty-string key that matches at
        # every position of the corpus.
        next unless defined $text && length $text;

        # A line without a tab has no token; store '' rather than undef so
        # later string interpolation does not emit "uninitialized" warnings.
        # When the same text occurs twice, the later line wins.
        $words2ids{fc $text} = $token // '';
    }
    close $fh;
}
# Build a single alternation of all tag texts.
#  - Empty keys are dropped: an empty alternative matches at every position.
#  - Longest texts come first so that a longer tag wins over a shorter tag
#    that is its prefix; ties are ordered alphabetically for determinism.
#  - Each text is passed through quotemeta so that characters such as
#    '.', '(', '|' or '+' in the tagset are matched literally instead of
#    being interpreted as regex syntax.
# NOTE(review): the pattern is unanchored, so a tag also matches inside a
# longer word (e.g. "inkomo" inside "izinkomo") - confirm whether
# whole-word matching is wanted before adding boundaries.
my $alt = join '|',
    map  { quotemeta }
    sort { length($b) <=> length($a) || $a cmp $b }
    map  { fc $_ }
    grep { length }
    keys %words2ids;

# Case-insensitive matcher; capture group 1 holds the matched tag text.
# With no usable tag texts, fall back to a pattern that can never match
# instead of one that matches the empty string everywhere.
my $re = length $alt ? qr{(?i:($alt))} : qr{(?!)};
# Case-folded tag text => number of times it was matched in the corpus.
my %found;
{
    # Replacement callback: count the hit and return the matched text
    # (original casing preserved) followed by a space and its token.
    my $annotate = sub {
        my ($word) = @_;
        my $key = fc $word;
        ++$found{$key};

        # A missing token is rendered as '' instead of triggering an
        # "uninitialized value" warning for every hit.
        return "$word " . ($words2ids{$key} // '');
    };

    open my $in_fh,  '<', "$corpusname.txt";
    open my $out_fh, '>', "$corpusname.possible-annotation_example.txt";

    # Copy the corpus line by line, annotating every tag occurrence.
    while (my $line = <$in_fh>) {
        $line =~ s/$re/$annotate->($1)/eg;
        print {$out_fh} $line;
    }

    close $in_fh;

    # Close explicitly: buffered write errors only surface at close time,
    # and autodie can only raise them for an explicit close call.
    close $out_fh;
}
# Remove every tag that was matched at least once; whatever is left in
# %words2ids never occurred in the corpus.
delete @words2ids{keys %found};
{
    # Report the unmatched tags, sorted by text, as "text<TAB>token" lines.
    open my $fh, '>', "$corpusname.tags-not-found_example.txt";
    for my $text (sort keys %words2ids) {
        # An absent token is written as an empty field rather than
        # interpolating undef.
        say {$fh} join "\t", $text, $words2ids{$text} // '';
    }

    # Close explicitly so that autodie reports a failed flush/close instead
    # of the error being lost when the handle goes out of scope.
    close $fh;
}