#!/usr/bin/env perl
#
# Annotate a corpus with tokens from a tagset.
#
# Reads "<corpus>.tagset.txt" (one tag per line: text, a tab, then a token),
# then copies "<corpus>.txt" to "<corpus>.possible-annotation.txt", appending
# " <token>" after every case-insensitive occurrence of a tag text.  Tags that
# never occur in the corpus are listed in "<corpus>.tags-not-found.txt".
# Sample input and output are reproduced in the POD after __END__.

use 5.016;
use warnings;
use autodie;

my $corpusname = 'pm_11148202';

# Case-folded tag text => token (second tab-separated column of the tagset).
my %words2ids;
{
    # Decode as UTF-8 so that fc() and (?i) fold characters, not raw bytes.
    open my $fh, '<:encoding(UTF-8)', "$corpusname.tagset.txt";
    while (my $line = <$fh>) {
        chomp $line;
        my ($text, $token) = split /\t/, $line;

        # A blank line or an empty first column would add an empty
        # alternative to the pattern, which matches at every position.
        next unless defined $text && length $text;

        # A line without a tab has no token; use '' rather than undef so the
        # later interpolations do not warn.
        $words2ids{fc $text} = $token // '';
    }
    close $fh;
}

# Longest alternatives first, so a multi-word tag wins over a shorter tag
# that is its prefix.  quotemeta keeps tag texts literal inside the pattern.
# NOTE(review): matching is substring-based (no word boundaries), as in the
# original; confirm that this is intended for the corpus.
my $alt = join '|',
    map  { quotemeta }
    sort { length($b) <=> length($a) or $a cmp $b }
    keys %words2ids;

# With no usable tags, fall back to a pattern that can never match instead of
# an empty group that would match everywhere.
my $re = length $alt ? qr{(?i:($alt))} : qr{(?!)};

# Case-folded tag text => number of occurrences seen in the corpus.
my %found;
{
    open my $in_fh,  '<:encoding(UTF-8)', "$corpusname.txt";
    open my $out_fh, '>:encoding(UTF-8)', "$corpusname.possible-annotation.txt";
    while (my $line = <$in_fh>) {
        $line =~ s{$re}{
            my $key = fc $1;
            ++$found{$key};
            my $token = $words2ids{$key} // '';
            "$1 $token";
        }eg;
        print {$out_fh} $line;
    }
    close $in_fh;

    # Explicit close so that autodie reports buffered write errors.
    close $out_fh;
}

# Whatever is left in %words2ids was never matched in the corpus.
delete @words2ids{ keys %found };
{
    open my $fh, '>:encoding(UTF-8)', "$corpusname.tags-not-found.txt";
    for my $text (sort keys %words2ids) {
        say {$fh} "$text\t$words2ids{$text}";
    }
    close $fh;
}

__END__

=head1 NAME

pm_11148202 - annotate a corpus with tokens from a tagset

=head1 DESCRIPTION

Reads F<pm_11148202.tagset.txt> and F<pm_11148202.txt>, and writes
F<pm_11148202.possible-annotation.txt> and F<pm_11148202.tags-not-found.txt>.

=head1 EXAMPLE

The transcripts below are reproduced exactly as they appeared alongside the
script.  Line breaks and the tab-separated token column appear to have been
collapsed in that copy, so the layout of the real files may differ.

=head2 Input

    $ cat pm_11148202.tagset.txt
    udebe ulimi izinyo izinyo lomhlathi ingemuva lomqala umphimbo

    $ cat pm_11148202.txt
    Lokho udebe kukwenze isilomo. Ukuzihlola izinyo kungahlenga izinyo lomhlathi yakho. Amakhala agxiza amafinyila. Ulimi amafutha ulimi wonke ULIMI amabheringi. Sebenzisa amafutha ulimi. Zama ukugwema ukudla okuncinca udebe.

=head2 Output

    $ cat pm_11148202.possible-annotation.txt
    Lokho udebe kukwenze isilomo. Ukuzihlola izinyo kungahlenga izinyo lomhlathi yakho. Amakhala agxiza amafinyila. Ulimi amafutha ulimi wonke ULIMI amabheringi. Sebenzisa amafutha ulimi . Zama ukugwema ukudla okuncinca udebe .

    $ cat pm_11148202.tags-not-found.txt
    ingemuva lomqala umphimbo

=cut