TIMTOWTDI
#!/usr/bin/perl
# https://perlmonks.org/?node_id=11148202
#
# Annotate a corpus with the ids of dictionary entries (single words or
# multiword units) that occur in it, then list the dictionary entries that
# never occurred.
#
# Word file format: one entry per line, "<word or phrase>\t<id>".  The same
# entry may appear on several lines; all of its ids are emitted.
# Output: every corpus line with " <id>..." inserted after each match,
# followed by a "Not Found" section of unmatched (lower-cased) entries.
use strict;
use warnings;

my $corpusfile = '/tmp/d.11148202.corpus';    # FIXME filename
my $wordfile   = '/tmp/d.11148202.words';     # FIXME filename

# Run only when executed as a script, so the subs below can be loaded
# and exercised on their own.
main() unless caller;

# Entry point: read the dictionary, annotate the corpus on STDOUT, then
# print the entries that were never seen.
sub main {
    my %words2ids = load_words($wordfile);
    my $pat       = build_pattern( keys %words2ids );
    my %found;

    open my $corpus, '<', $corpusfile
        or die "Cannot open corpus file '$corpusfile': $!\n";
    while ( my $line = <$corpus> ) {
        print annotate_line( $line, $pat, \%words2ids, \%found );
    }
    close $corpus;

    my @not_found = sort grep { !exists $found{$_} } keys %words2ids;

    # The empty strings give a blank line before the header and a final
    # newline after the last entry.
    local $, = "\n";
    print '', "---------------- Not Found:", @not_found, '';
    return;
}

# load_words($path)
# Returns a hash mapping each lower-cased entry to the string that is
# inserted after a match: its ids, each preceded by a single space.
# Dies if the file cannot be opened.
sub load_words {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "Cannot open word file '$path': $!\n";

    my %ids_for;
    while ( my $line = <$fh> ) {
        $line =~ s/\R\z//;
        my ( $word, $id ) = split /\t/, $line;

        # A blank line would otherwise become an empty alternative in the
        # pattern, which matches at every word boundary.
        next unless defined $word && length $word;

        $id = '' unless defined $id;
        $ids_for{ lc $word } .= " $id";
    }
    close $fh;

    return %ids_for;
}

# build_pattern(@entries)
# Returns a case-insensitive regex that captures one entry in $1.  Longer
# entries come first so a multiword unit wins over its own first word.
# With no usable entries the returned regex never matches.
sub build_pattern {
    my @entries = grep { defined && length } @_;
    return qr/(?!)/ unless @entries;

    # Core sort replaces List::AllUtils::rev_nsort_by; the string
    # tie-break only makes the generated pattern deterministic.
    my $alternation = join '|',
        map  { quotemeta }
        sort { length($b) <=> length($a) or $a cmp $b } @entries;

    return qr/($alternation)/i;
}

# annotate_line($line, $pat, \%ids_for, \%found)
# Returns a copy of $line with the id string inserted after every
# dictionary match, and counts each matched entry in %found.
sub annotate_line {
    my ( $line, $pat, $ids_for, $found ) = @_;

    # (?!\w) stops an entry from matching as the prefix of a longer word;
    # \K keeps the matched text and inserts the ids right after it.
    return $line =~ s/\b$pat(?!\w)\K/_record_match( lc $1, $ids_for, $found )/ger;
}

# Count one occurrence of $entry and return its id string.
sub _record_match {
    my ( $entry, $ids_for, $found ) = @_;
    $found->{$entry}++;
    return $ids_for->{$entry};
}
Outputs:
Lokho udebe <ZUL-SIL-0016-n> kukwenze isilomo. Ukuzihlola izinyo <ZUL-SIL-0018-n> <ZUL-SIL-0018-n-other> kungahlenga izinyo lomhlathi <ZUL-SIL-0019-n> yakho. Amakhala agxiza amafinyila. Ulimi <ZUL-SIL-0017-n> amafutha ulimi <ZUL-SIL-0017-n> wonke ULIMI <ZUL-SIL-0017-n> amabheringi. Sebenzisa amafutha ulimi <ZUL-SIL-0017-n>. Zama ukugwema ukudla okuncinca udebe <ZUL-SIL-0016-n>. ---------------- Not Found: ingemuva lomqala umphimbo
In reply to Re^3: Finding multiword units in a corpus
by tybalt89
in thread Finding multiword units in a corpus
by veg_running
| For: | Use: |
| & | &amp;amp; |
| < | &amp;lt; |
| > | &amp;gt; |
| [ | &amp;#91; |
| ] | &amp;#93; |