#!/usr/bin/perl
#
# Extract (offset, token, category) records from a word-tagged XML file.
#
# Usage: <script> source_file(XML)
#
# Reads the file named by the first command-line argument and writes
# "<source_file>_tok_cat.lst" next to it.  Each output line has three
# tab-separated fields:
#
#   1. running offset: the summed length of all previously emitted tokens
#   2. the token text (content of a <c ...> or <w ...> element)
#   3. the value of the element's POS="..." attribute (lowercase letters)
#
# NOTE(review): the original script was collapsed onto the shebang line and
# had lost its "<...>" fragments (the line-read expression and the opening-tag
# half of the token pattern).  The matching loop below is reconstructed from
# what survived: the "</[cw]>" closing tag, the POS="([a-z]+)" capture, the
# print format and the offset bookkeeping.  Confirm against a sample input.

use strict;
use warnings;

die "Usage : $0 source_file(XML)\n" unless @ARGV >= 1;

my $source_path = $ARGV[0];
my $dest_path   = "${source_path}_tok_cat.lst";

# Three-argument open with lexical handles: the file name can no longer
# smuggle in an open mode.  The explicit decoding layer replaces the
# unsupported "use encoding 'utf-8'" pragma, so length() below counts
# characters and the output is not double-encoded.
# Assumes the input is UTF-8, as the original pragma suggested -- TODO confirm.
open(my $in, '<:encoding(UTF-8)', $source_path)
    or die "Unable to open source file $source_path : $!\n";
open(my $out, '>:encoding(UTF-8)', $dest_path)
    or die "Unable to open destination file $dest_path : $!";

# One <c ...>token</c> or <w ...>token</w> element.
#   $1 = raw attribute text of the opening tag
#   $2 = non-empty token text up to the nearest closing </c> or </w>
my $token_re = qr{ < [cw] \b ([^>]*) > (.+?) </ [cw] > }x;

my $pos = 0;
while (my $line = <$in>) {
    # A line may carry zero, one or several tagged tokens.
    while ($line =~ /$token_re/g) {
        my ($attrs, $item) = ($1, $2);

        # Elements without a POS attribute still advance the offset; they
        # are written with an empty category field.
        # NOTE(review): the original behaviour for this case is unknown.
        my ($cat) = $attrs =~ /\bPOS="([a-z]+)"/;
        $cat = '' unless defined $cat;

        print {$out} "$pos\t$item\t$cat\n";
        $pos += length $item;
    }
}

close $in;
# Buffered write errors only surface when the handle is closed.
close $out
    or die "Unable to close destination file $dest_path : $!";