#!/usr/bin/env perl
#
# Annotate a corpus with tags taken from a tab-separated word list.
#
# Reads   "$corpusname.example.tagset.txt"   ("word<TAB>tag" per line),
# reads   "$corpusname.txt"                  (the corpus),
# writes  "$corpusname.possible-annotation_example.txt"
#             the corpus with every case-insensitive occurrence of a listed
#             word replaced by "word tag",
# writes  "$corpusname.tags-not-found_example.txt"
#             the "word<TAB>tag" pairs that never matched, sorted by word
#             (words are written in their case-folded form).
#
# Any failing open/close aborts the script via autodie.
use 5.016;
use warnings;
use autodie;

my $corpusname = 'GFSEBcorpus.zul_selected-sentences_original';

# Case-folded word => tag.
my %words2ids;
{
    open my $fh, '<', "$corpusname.example.tagset.txt";
    while (my $line = <$fh>) {
        $line =~ s/\R\z//;    # strip the line ending, LF or CRLF alike
        my ($text, $token) = split /\t/, $line;

        # A blank line must not become an empty key: an empty alternative
        # in the pattern matches at every position of every corpus line.
        next if !defined $text || $text eq '';

        # A word without a tag column is kept, with an empty tag, so it is
        # still reported/annotated without "uninitialized" warnings.
        $words2ids{fc $text} = $token // '';
    }
    close $fh;
}

# Longest alternatives first so that a longer word wins over any shorter
# word it starts with; ties are broken alphabetically to keep the pattern
# independent of hash ordering. Words are quotemeta'd so that they match
# literally even if they contain regex metacharacters.
my $alt = join '|',
    map  { quotemeta }
    sort { length($b) <=> length($a) || $a cmp $b }
    keys %words2ids;

# With no usable tagset entries, use a pattern that can never match instead
# of an empty alternation (which would match the empty string everywhere).
#
# NOTE(review): the pattern is not anchored to word boundaries, so a listed
# word also matches inside a longer corpus word - confirm this is intended.
my $re = %words2ids ? qr{(?i:($alt))} : qr{(?!)};

# Case-folded word => number of times it was matched in the corpus.
my %found;
{
    open my $in_fh,  '<', "$corpusname.txt";
    open my $out_fh, '>', "$corpusname.possible-annotation_example.txt";
    while (my $line = <$in_fh>) {
        $line =~ s{$re}{
            my $key = fc $1;
            $found{$key}++;
            "$1 $words2ids{$key}";    # keep the corpus spelling, append the tag
        }eg;
        print {$out_fh} $line;
    }
    close $in_fh;
    close $out_fh;    # explicit close so autodie reports buffered write errors
}

# Whatever is left in the table was never seen in the corpus.
delete @words2ids{ keys %found };
{
    open my $fh, '>', "$corpusname.tags-not-found_example.txt";
    for my $word (sort keys %words2ids) {
        say {$fh} "$word\t$words2ids{$word}";
    }
    close $fh;
}

__END__

=head1 SAMPLE DATA

The three text sections below preceded the script in the original file,
separated by C<####> markers. They are reproduced verbatim. Their roles are
not stated in the file; presumably they are, in order, a corpus excerpt, a
tagset excerpt and a desired-output excerpt - to be confirmed. Original
line breaks and tab characters were not preserved in the source.

=head2 Section 1

 Abanye abahlanu balimale kanzima . Abanye abanazo izinkomo noma izimbuzi . Abaziphathi ngendlela ekhombisa ukufuna ukudlala . Abazithamundayo bathi uzimisele ngokuliphika icala . Abazi ukuthi bakhuluma ngani . abazonikwa amalungelo okudoba ezinga elincane . Ahluleke ngisho ukukuphulula umhlane njengasemihleni . Ahluleke ukuzibamba, azidedele inkaba . Ahluleke ukuzibamba uMaMthembu .

=head2 Section 2

 icala ukhalo inkaba inkaba isisu isisu isibeletho umhlane iqolo izinqe umdidi umphambili amasende inkomo ubhontshisi ingalo ukuthi bakhuluma

=head2 Section 3

 Abanye abahlanu balimale kanzima . Abanye abanazo izinkomo noma izimbuzi . Abaziphathi ngendlela ekhombisa ukufuna ukudlala . Abazithamundayo bathi uzimisele ngokuliphika icala . Abazi ukuthi bakhuluma ngani . abazonikwa amalungelo isisu \t ezinga elincane . Ahluleke ngisho ukukuphulula umhlane njengasemihleni . Ahluleke ukuzibamba, azidedele inkaba \t . Ahluleke ukuzibamba uMaMthembu .

=cut