####
1 大 a
2 墙 n
3 内外 f
5 -- w
7 北京市 ns
10 监狱 n
12 纪实 n
13 ( w
14 三 m
15 ) w
17 田 nr
18 珍颖 nr
####
#!/usr/bin/perl -w
# Extract (offset, token, POS tag) triples from a POS-tagged XML file.
#
# Usage: <this script> source_file(XML)
#
# Every element of the form <w ... POS="tag">token</w> or
# <c ... POS="tag">token</c> found in the input yields one output line
#
#     offset<TAB>token<TAB>tag
#
# where offset is the sum of the character lengths of all tokens printed
# before it (the first token gets offset 0).  The result is written to
# "<source_file>_tok_cat.lst", encoded as UTF-8.
#
# NOTE(review): the original loop header and its enclosing blocks were
# garbled (only a regex fragment and four unmatched closing braces were
# left).  The iteration below is reconstructed from the two surviving
# substitutions, which extracted the token and the tag from text shaped
# like `POS="tag">token</c>` / `</w>`.  Confirm against a real input file
# that one match per <c>/<w> element is the intended granularity.
use strict;
use warnings;

die "Usage : $0 source_file(XML)\n" unless (@ARGV >= 1);

my $source_path = $ARGV[0];
my $target_path = "${source_path}_tok_cat.lst";

# Decode the input explicitly so that length() below counts characters,
# not bytes.  This replaces the deprecated `use encoding` pragma.
# NOTE(review): assumes the source file is UTF-8 -- confirm.
open(my $in, '<:encoding(UTF-8)', $source_path)
    or die "Unable to open source file $source_path : $!\n";
open(my $out, '>:encoding(UTF-8)', $target_path)
    or die "Unable to open destination file $target_path : $!";

# Matches one tagged element.  As in the original substitutions, the tag
# is lowercase ASCII letters and POS is the last attribute before `>`.
my $element_re = qr{
    < [cw] \b [^>]*?            # opening <c ...> or <w ...>
    POS="(?<cat>[a-z]+)" >      # part-of-speech tag
    (?<item>[^<]+)              # token text (no nested markup)
    </ [cw] >                   # closing tag
}x;

my $pos = 0;    # running character offset of the next token
while (my $line = <$in>) {
    while ($line =~ /$element_re/g) {
        my ($item, $cat) = ($+{item}, $+{cat});
        print {$out} "$pos\t$item\t$cat\n";
        $pos += length($item);
    }
}

close($in);
# Buffered write errors only surface at close time, so check it.
close($out)
    or die "Unable to close destination file $target_path : $!\n";