Re: Challenge: Predictive Texting

A quick (in the sense of not much time spent on it) first cut version before I head off on holiday. :)

use strict;
use warnings;

use constant LEARN => 1; # Set true to learn by doing

open WORDS, '<', '2of12.txt';
my @words = <WORDS>;
close WORDS;

chomp @words;

@words = map {[$_, asSMS ($_)]} @words;

my $lookup = bless {lookup => {}, nextWordNIndex => 0};

$lookup->addWord ($_) for @words;
$lookup->training ();
$lookup->{scan} = $lookup->{lookup};

my %dict = map {($_->[0] => $_->[1])} @words;
my $strokes = 0;
my $characters = 0;
my $txtWords = 0;

while (<DATA>) {
    chomp;
    
    my @dataWords = split;
    
    for my $word (@dataWords) {
        $word =~ tr/a-z//cd;
        next if ! exists $dict{$word};

        ++$txtWords;
        my @digits = @{asSMS ($word)};
        my @pressed;
        
        $characters += @digits;

        while (1) {
            my $digit = (shift @digits) || '-';
            my $result = $lookup->addDigit ($digit);

            ++$strokes;
            push @pressed, $digit;
            
            if ($result eq $word) {
                $lookup->addDigit ('+');

                ++$characters;
                ++$strokes;
                push @pressed, '+';
                last;
            }
        }
        
        print "@pressed ($word)\n";
    }
}

print "$strokes strokes to send a $characters character message contai
+ning $txtWords words.\n";


sub addWord {
    my ($self, $word) = @_;
    my @digits = @{$word->[1]};
    my $text = $word->[0];
    my $scan = $self->{lookup};

    for my $digit (@digits) {
        if (defined $scan->{$digit}) {
            push @{$scan->{$digit}{words}}, [$text, 0];
        } else {
            $scan->{$digit} = {words => [[$text, 0]]};
        }
        
        $scan = $scan->{$digit};
    }
}


sub addDigit {
    my ($self, $digit) = @_;

    if ($digit eq '+') {
        $self->train ($self->{lastGuess}[0], 1) if LEARN;
        $self->doneWord ();
        return $self->{lastGuess}[0];
        
    } elsif ($digit eq '-') {
        if (! exists $self->{scan}{wordsN}) {
            @{$self->{scan}{wordsN}} =
                grep {$self->{wordlen} == length $_->[0]} @{$self->{sc
+an}{words}};
        }
        
        die "Unknown word @{$self->{digits}}"
            if $self->{nextWordNIndex} == @{$self->{scan}{wordsN}};
        
        # Skip words that have been tried already
        my $wordsN = $self->{scan}{wordsN};
        
        ++$self->{nextWordNIndex}
            while exists $self->{tries}{$wordsN->[$self->{nextWordNInd
+ex}][0]};
        $self->{lastGuess} = $wordsN->[$self->{nextWordNIndex}]

    } else {
        die "Unknown word @{$self->{digits}} $digit"
            if ! exists $self->{scan}{$digit};
            
        push @{$self->{digits}}, $digit;
        $self->{scan} = $self->{scan}{$digit};
        ++$self->{wordlen};

        my $index = 0;
        my $words = $self->{scan}{words};

        # Skip words that have been tried already
        ++$index while exists $self->{tries}{$words->[$index][0]};
        $self->{lastGuess} = $words->[$index]
    }
    
    $self->{tries}{$self->{lastGuess}[0]} = 1;
    return $self->{lastGuess}[0];
}


sub train {
    my ($self, $word, $weight) = @_;
    my @digits = @{asSMS ($word)};
    my $wordlen = length $word;

    $weight ||= 1;
    $weight +=  $wordlen / 10.0;
    
    # Sort the word lists by frequency
    my $scan = $self->{lookup};
    for my $digit (@digits) {
        $scan = $scan->{$digit};
        my @match = grep {$_->[0] eq $word} @{$scan->{words}};

        last if ! @match;

        $_->[1] += $weight for @match;
        @{$scan->{words}} = sort {$b->[1] <=> $a->[1]} @{$scan->{words
+}};
    }
    
    # Sort the wordN lists by frequency
    $scan = $self->{lookup};
    $scan = $scan->{$_} for @digits;

    if (! exists $scan->{wordsN}) {
        @{$scan->{wordsN}} =
            grep {$wordlen == length $_->[0]} @{$scan->{words}};
    }
    
    my @match = grep {$_->[0] eq $word} @{$scan->{wordsN}};

    $_->[1] += $weight for @match;
    @{$scan->{wordsN}} = sort {$b->[1] <=> $a->[1]} @{$scan->{wordsN}}
+;
}


sub doneWord {
    my $self = shift;
    
    $self->{scan} = $self->{lookup};
    $self->{digits} = ();
    $self->{wordlen} = 0;
    $self->{nextWordNIndex} = 0;
    $self->{tries} = ();
}


sub asSMS {
    my $word = shift;
    my @SMSWord;
    
    $word =~ tr /a-z//dc;
    
    for (split //, $word) {
        if (/[abc]/) {
            push @SMSWord,'2';
        } elsif (/[def]/) {
            push @SMSWord,'3';
        } elsif (/[ghi]/) {
            push @SMSWord,'4';
        } elsif (/[jkl]/) {
            push @SMSWord,'5';
        } elsif (/[mno]/) {
            push @SMSWord,'6';
        } elsif (/[pqrs]/) {
            push @SMSWord,'7';
        } elsif (/[tuv]/) {
            push @SMSWord,'8';
        } elsif (/[wxyz]/) {
            push @SMSWord,'9';
        }
    }

    return \@SMSWord;
}


sub training {
    my $self = shift;
    my %freqs = (
        i => 1,
        an => 1,
        no => 1,
        so => 1,
        by => 1,
        at => 1,
        has => 1,
        far => 1,
        see => 1,
        key => 1,
        who => 1,
        but => 1,
        are => 1,
        each => 1,
        used => 1,
        with => 1,
        here => 1,
        only => 1,
        same => 1,
        been => 1,
        pure => 1,
        below => 1,
        might => 1,
        could => 1,
        value => 1,
        enter => 1,
        solve => 1,
        rules => 1,
        minus => 1,
        space => 1,
        pearl => 1,
        number => 1,
        winner => 1,
        fewest => 1,
        reduce => 1,
        secret => 1,
        amount => 1,
        chosen => 1,
        except => 1,
        resume => 1,
        return => 1,
        texting => 1,
        presses => 1,
        thought => 1,
        correct => 1,
        contain => 1,
        support => 1,
        fastest => 1,
        problem => 1,
        between => 1,
        present => 1,
        category => 1,
        thousand => 1,
        returned => 1,
        provided => 1,
        expected => 1,
        suggested => 1,
        incorrect => 1,
        lowercase => 1,
        technique => 1,
        challenge => 1,
        interface => 1,
        following => 1,
        algorithm => 1,
        submitted => 1,
        solutions => 1,
        implement => 1,
        suggestion => 1,
        dictionary => 1,
        authorized => 1,
        completion => 1,
        considered => 1,
        keystrokes => 1,
        predictive => 1,
        programming => 1,
        interesting => 1,
        punctuation => 1,
        information => 1,
        application => 1,
        if => 2,
        one => 2,
        and => 2,
        for => 2,
        that => 2,
        time => 2,
        text => 2,
        plus => 2,
        best => 2,
        digit => 2,
        being => 2,
        means => 2,
        digits => 2,
        entries => 2,
        instead => 2,
        spelled => 2,
        finished => 2,
        must => 3,
        words => 3,
        passed => 3,
        of => 4,
        it => 4,
        in => 4,
        word => 4,
        to => 5,
        all => 5,
        is => 7,
        will => 7,
        a => 8,
        be => 9,
        the => 12,
    );
    
    if (! LEARN) {
        # Cheat by knowing the word frequencey up front
        $self->train ($_, $freqs{$_}) for keys %freqs;
    }
}

__DATA__
all
predictive texting is a technique to reduce the number of key presses 
+used to
enter text with i thought it might be an interesting challenge to see 
+who
could implement the best algorithm here are the rules

    the problem text to solve is secret
    it will contain between and thousand words
    all words will be present in only authorized dictionary
    all words will be lowercase
    no punctuation except a space
    solutions must be pure pearl
    all entries must be submitted by 

one winner for each category below will be chosen

    fewest keystrokes
    fastest completion time

all entries must support the same application programming interface to
+ be
considered digits will be passed in one at a time the expected return 
+value will
be the best suggestion for the amount of information provided so far i
+f instead
of a digit a plus is passed in it means that the word is finished bein
+g spelled
and the correct word has been returned if instead of a digit a minus i
+s passed
in it means that the word is finished being spelled but the suggested 
+word is
incorrect digits will resume following a plus
[download]

Using an early version of the OP's text as the message text I get the following result:

844 strokes to send a 901 character message containing 170 words.
[download]

Note that the 2of12.txt file is expected to reside in the current directory.

Updated to count '+' as strokes and part of message text - they effectively turn into spaces

Updated to take advantage of "knowing" the word frequency in the test text up front. Strokes reduce to 540! This version prints the strokes used for each word.

Set LEARN true to have the code learn the word frequency as the text is supplied. In this case the stroke count increases to 810 (there's a slight algorithm improvement on the first version too;) ).

DWIM is Perl's answer to Gödel

Comment on Re: Challenge: Predictive Texting Select or Download Code