# Annotate a whitespace-tokenised corpus with ids taken from a tagset file.
#
# Inputs  (all derived from $corpusname):
#   <corpus>.tagset.txt   one "word<TAB>id" pair per line
#   <corpus>.txt          the corpus, one line of space-separated tokens per line
# Outputs:
#   <corpus>.possible-annotation.txt   the corpus with " id" appended to every
#                                      token whose lower-cased form is in the tagset
#   <corpus>.tags-not-found.txt        "word<TAB>id" for every tagset word that
#                                      never matched a corpus token (sorted by word)
#
# NOTE(review): files are read as raw bytes, so lc() only folds ASCII letters.
# If the corpus is UTF-8 with non-ASCII text, I/O layers such as
# '<:encoding(UTF-8)' are probably wanted -- confirm the file encoding first.
use strict;
use warnings;
use feature 'say';    # `say` is only a builtin when this feature is enabled

my $corpusname = "GoldStandardCorpus.Original.MG.2022-11-10";

my $tagset_path     = $corpusname . ".tagset.txt";
my $corpus_path     = $corpusname . ".txt";
my $annotated_path  = $corpusname . ".possible-annotation.txt";
my $unmatched_path  = $corpusname . ".tags-not-found.txt";

# --- Load the tagset: lower-cased word => id --------------------------------
my %words2ids;
open my $lemmas, "<", $tagset_path
    or die "Cannot open '$tagset_path' for reading: $!";
while (my $line = <$lemmas>) {
    chomp($line);
    next if $line eq '';    # ignore blank lines instead of storing an undef key

    my ($word, $id) = split /\t/, $line;
    if (!defined $id) {
        # No tab (or nothing after it): there is no id to annotate with.
        warn "$tagset_path line $.: no tab-separated id, entry skipped\n";
        next;
    }
    $words2ids{ lc($word) } = $id;
}
close $lemmas;

# --- Annotate the corpus ----------------------------------------------------
my %freq;    # lower-cased tagset word => number of corpus tokens it matched

open my $output, ">", $annotated_path
    or die "Cannot open '$annotated_path' for writing: $!";
open my $corpus, "<", $corpus_path
    or die "Cannot open '$corpus_path' for reading: $!";
while (my $line = <$corpus>) {
    chomp($line);

    # split ' ' drops leading whitespace and collapses runs of whitespace;
    # the tokens are re-joined with single spaces when printed below.
    my @tokens = split ' ', $line;

    # $token aliases the element of @tokens, so appending modifies the array.
    foreach my $token (@tokens) {
        my $lct = lc $token;
        my $id  = $words2ids{$lct};
        next unless defined $id;    # defined, not truthy: an id of "0" is valid
        $freq{$lct}++;
        $token .= " $id";
    }
    say { $output } "@tokens";
}
close $corpus;
# Buffered write errors (e.g. disk full) only surface at close time.
close $output
    or die "Error writing '$annotated_path': $!";

# --- Report tagset words that never occurred in the corpus ------------------
open my $notfound, ">", $unmatched_path
    or die "Cannot open '$unmatched_path' for writing: $!";
foreach my $word (sort keys(%words2ids)) {
    next if exists $freq{$word};
    say { $notfound } "$word\t$words2ids{$word}";
}
close $notfound
    or die "Error writing '$unmatched_path': $!";