in reply to unglue words joined together by juncture rules
This needs adapting to handle the multi-character morphemes in your Sanskrit example, and it needs a post-processing stage to convert morphed words back to their lexicon spellings:
#! perl -slw
use strict;
use Data::Dump qw[ pp ];    # pp is not called below; kept for ad-hoc debugging

# deGlue BLOCK @lexicon, %morphs, LIST
#
# Splits each string in LIST into every possible sequence of lexicon words
# and calls BLOCK once per complete segmentation, passing the words in the
# order they occur in the string. While BLOCK runs, $_ is the string being
# segmented. Returns nothing.
#
# The prototype lets the caller pass a bare block, a named array and a named
# hash; the array and hash arrive here as references.
#
# %morphs maps the final character of a lexicon word to a two-element array
# [ replacement, following ]: such a word may also appear with its final
# character replaced by "replacement", but only when what comes next in the
# string matches "following". Words are reported as they are spelled in the
# input, i.e. in their morphed form.
#
# Lexicon words and morph entries are interpolated into the pattern as they
# are, so any regex metacharacters in them are live.
sub deGlue (&\@\%@) {
    # Permit code blocks in a pattern that also interpolates variables.
    use re 'eval';

    my $codeRef  = shift;
    my $lexRef   = shift;
    my $morphRef = shift;

    # Work on a copy so the caller's lexicon is not rewritten. An empty
    # entry would match without consuming any input, so drop those.
    my @patterns = grep { defined $_ and length $_ } @$lexRef;
    return unless @patterns;

    for my $word ( @patterns ) {
        my( $stem, $final ) = $word =~ m[(.*)(.)] or next;
        my $morph = $morphRef->{ $final } or next;

        # Accept the plain spelling first, then the morphed ending guarded
        # by a lookahead for the context that triggers it.
        $word = "$stem(?:$final|$morph->[0](?=$morph->[1]))";
    }
    my $alternatives = join '|', @patterns;

    # Words of the segmentation currently being tried. This is a package
    # variable because every addition is made with local(), which the regex
    # engine undoes when it backtracks past that word. Reading @- and @+ of
    # one capture group per lexicon entry cannot work here: inside a
    # repeated group they only remember the last match of each entry, which
    # loses both the input order and any repeated word.
    our @segment;
    local @segment = ();

    # The trailing (?!) always fails, forcing the engine to backtrack
    # through every way of covering the whole string; the callback fires
    # each time the end of the string is reached. (?{ }) is used rather
    # than (??{ }) so the callback's return value is not compiled as a
    # pattern.
    my $re = qr[
        ^
        (?:
            ( $alternatives )
            (?{ local @segment = ( @segment, $^N ) })
        )+
        $
        (?{ $codeRef->( @segment ) })
        (?!)
    ]x;

    m[$re] for @_;
    return;
}

my %morphs = (
    t => [ 'd', 'd' ],    # a final "t" may surface as "d" before a "d"
);
my @lex   = qw[ cowboy cow boy cat do dog ];
my $input = 'cowboycaddog';

deGlue { print join '-', @_ } @lex, %morphs, $input;
Produces:
c:\test>675520 cowboy-cad-dog cow-boy-cad-dog
A longer sample input, with the related morphemes and lexicon clearly identified (I don't know Sanskrit :), would allow better testing.
|
|---|