args are --input-corpus 2.short.shelley.txt --ngram-length 2 --output-state 2.short.state { "counts" => { "achieve|treading" => 1, "acquired|new" => 1, ...big snip of similar... "words|of" => 1, "works|in" => 1, }, "cum-twisted-dist" => { achieve => ["treading", 1], acquired => ["new", 1], already => ["marked", 1], and => [ "the", 0.142857142857143, "performed", 0.285714285714286, "that", 0.428571428571429, "even", 0.571428571428571, "show", 0.714285714285714, "their", 0.857142857142857, "almost", 1, ], As => ["he", 1], as => ["if", 1], ascend => ["into", 1], be => ["transmuted", 1], blood => ["circulates", 1], ...big snip... with => ["a", 0.5, "its", 1], words => ["of", 1], works => ["in", 1], }, "dist" => { "achieve|treading" => 0.0087719298245614, "acquired|new" => 0.0087719298245614, ...big snip... "words|of" => 0.0087719298245614, "works|in" => 0.0087719298245614, }, "N" => 2, } ./2.analyse_text.pl : done. #### args are --input-corpus 2.short.shelley.txt --ngram-length 8 --output-state 2.short.state { "counts" => { "already|marked|I|will|pioneer|a|new|way" => 1, "crucible|have|indeed|performed|miracles|They|penetrate|into" => 1, "Frankenstein|more|far|more|will|I|achieve|treading" => 1, "heavens|they|have|discovered|how|the|blood|circulates" => 1, "mock|the|invisible|world|with|its|own|shadows" => 1, "of|nature|and|show|how|she|works|in" => 1, "one|purpose|So|much|has|been|done|exclaimed" => 1, "only|made|to|dabble|in|dirt|and|their" => 1, "promised|impossibilities|and|performed|nothing|The|modern|masters" => 1, "promise|very|little|they|know|that|metals|cannot" => 1, "sounded|and|soon|my|mind|was|filled|with" => 1, "Such|were|the|professor|s|words|rather|let" => 1, "they|can|command|the|thunders|of|heaven|mimic" => 1, "the|air|we|breathe|They|have|acquired|new" => 1, "The|ancient|teachers|of|this|science|said|he" => 1, "the|elixir|of|life|is|a|chimera|but" => 1, "the|fate|enounced|to|destroy|me|As|he" => 1, "touched|which|formed|the|mechanism|of|my|being" => 
1, "unfold|to|the|world|the|deepest|mysteries|of" => 1, "went|on|I|felt|as|if|my|soul" => 1, "were|grappling|with|a|palpable|enemy|one|by" => 1, }, "cum-twisted-dist" => { "already|marked|I|will|pioneer|a|new" => ["way", 1], "crucible|have|indeed|performed|miracles|They|penetrate" => ["into", 1], "Frankenstein|more|far|more|will|I|achieve" => ["treading", 1], "heavens|they|have|discovered|how|the|blood" => ["circulates", 1], "mock|the|invisible|world|with|its|own" => ["shadows", 1], "of|nature|and|show|how|she|works" => ["in", 1], "one|purpose|So|much|has|been|done" => ["exclaimed", 1], "only|made|to|dabble|in|dirt|and" => ["their", 1], "promised|impossibilities|and|performed|nothing|The|modern" => ["masters", 1], "promise|very|little|they|know|that|metals" => ["cannot", 1], "sounded|and|soon|my|mind|was|filled" => ["with", 1], "Such|were|the|professor|s|words|rather" => ["let", 1], "they|can|command|the|thunders|of|heaven" => ["mimic", 1], "the|air|we|breathe|They|have|acquired" => ["new", 1], "The|ancient|teachers|of|this|science|said" => ["he", 1], "the|elixir|of|life|is|a|chimera" => ["but", 1], "the|fate|enounced|to|destroy|me|As" => ["he", 1], "touched|which|formed|the|mechanism|of|my" => ["being", 1], "unfold|to|the|world|the|deepest|mysteries" => ["of", 1], "went|on|I|felt|as|if|my" => ["soul", 1], "were|grappling|with|a|palpable|enemy|one" => ["by", 1], }, "dist" => { "already|marked|I|will|pioneer|a|new|way" => 0.0476190476190476, "crucible|have|indeed|performed|miracles|They|penetrate|into" => 0.0476190476190476, "Frankenstein|more|far|more|will|I|achieve|treading" => 0.0476190476190476, "heavens|they|have|discovered|how|the|blood|circulates" => 0.0476190476190476, "mock|the|invisible|world|with|its|own|shadows" => 0.0476190476190476, "of|nature|and|show|how|she|works|in" => 0.0476190476190476, "one|purpose|So|much|has|been|done|exclaimed" => 0.0476190476190476, "only|made|to|dabble|in|dirt|and|their" => 0.0476190476190476, 
=pod

"promised|impossibilities|and|performed|nothing|The|modern|masters" => 0.0476190476190476,
"promise|very|little|they|know|that|metals|cannot" => 0.0476190476190476,
"sounded|and|soon|my|mind|was|filled|with" => 0.0476190476190476,
"Such|were|the|professor|s|words|rather|let" => 0.0476190476190476,
"they|can|command|the|thunders|of|heaven|mimic" => 0.0476190476190476,
"the|air|we|breathe|They|have|acquired|new" => 0.0476190476190476,
"The|ancient|teachers|of|this|science|said|he" => 0.0476190476190476,
"the|elixir|of|life|is|a|chimera|but" => 0.0476190476190476,
"the|fate|enounced|to|destroy|me|As|he" => 0.0476190476190476,
"touched|which|formed|the|mechanism|of|my|being" => 0.0476190476190476,
"unfold|to|the|world|the|deepest|mysteries|of" => 0.0476190476190476,
"went|on|I|felt|as|if|my|soul" => 0.0476190476190476,
"were|grappling|with|a|palpable|enemy|one|by" => 0.0476190476190476,
}, "N" => 8, }
./2.analyse_text.pl : done.

####

=cut

#!/usr/bin/env perl

# FILE: analyse_text.pl
# by bliako
#
# Learns an n-gram state from a text corpus (and/or reloads a previously
# saved state), optionally saves the state, and dumps it with Data::Dump.
# learn(), load_state() and save_state() are not defined in this script;
# presumably they are exported by Markov::Ndimensional - confirm there.

use 5.011;
use warnings;

use Getopt::Long;
use Data::Dump qw/dump/;

use lib '.';
use Markov::Ndimensional;

# Echo the raw command line before Getopt::Long consumes @ARGV.
my @args = @ARGV;
say "args are @args";

my $input_corpus_filename = undef;
my $input_state_filename  = undef;
my $output_state_filename = undef;
my $output_stats_filename = undef;
my $separator             = '\s';
my $internal_separator    = '|';
my $ngram_length          = -1;    # sentinel: must be overridden on the command line

if (
    !Getopt::Long::GetOptions(
        'input-corpus=s' => \$input_corpus_filename,
        'input-state=s'  => \$input_state_filename,
        'output-state=s' => \$output_state_filename,
        'output-stats=s' => \$output_stats_filename,
        'ngram-length=i' => \$ngram_length,
        'separator=s'    => \$separator,
        'help|h'         => sub { print STDERR usage($0); exit(0) }
    )
  )
{
    print STDERR usage($0)
      . "\n\nSomething wrong with command-line parameters...\n";
    exit(1);
}

if ( $ngram_length <= 0 ) {
    print STDERR "$0 : ngram-length must be a positive integer.\n";
    exit(1);
}

# Extra parameters handed to learn(). When the state is going to be saved
# everything is requested; otherwise 'counts' is flagged under 'avoid'.
# NOTE(review): the exact meaning of 'need'/'avoid' lives in
# Markov::Ndimensional - confirm there.
my %params = ();
if ( defined($output_state_filename) ) {
    $params{'need'} = { 'all' => 1 };
}
else {
    $params{'avoid'} = { 'counts' => 1 };
}

# Optionally start from a previously saved state; its counts are passed on
# to learn() through %params.
my $state = undef;
if ( defined($input_state_filename) ) {
    $state = load_state($input_state_filename);
    if ( !defined($state) ) {
        print STDERR "$0 : call to " . 'load_state()' . " has failed.\n";
        exit(1);
    }
    $params{'counts'} = $state->{'counts'};
}

if ( defined($input_corpus_filename) ) {
    $state = learn(
        {
            %params,
            'ngram-length'            => $ngram_length,
            'separator'               => $separator,
            'internal-separator'      => $internal_separator,
            'remove-these-characters' => '[^a-zA-Z]',
            'input-filename'          => $input_corpus_filename,
        }
    );
    if ( !defined($state) ) {
        print STDERR "$0 : call to " . 'learn()' . " has failed.\n";
        exit(1);
    }
}

# Neither a saved state nor a corpus was supplied. The message previously
# named a non-existent --input-fasta option.
if ( !defined($state) ) {
    print STDERR
      "$0 : --input-state and/or --input-corpus must be specified.\n";
    exit(1);
}

if ( defined($output_state_filename) ) {
    if ( !save_state( $state, $output_state_filename ) ) {
        print STDERR "$0 : call to " . 'save_state()' . " has failed.\n";
        exit(1);
    }
}

# The dump always goes to STDOUT, exactly as before. --output-stats used to
# be accepted and then ignored; it now additionally receives the same dump.
my $dumped_state = Data::Dump::dump($state);
if ( defined($output_stats_filename) ) {
    my $stats_fh;
    if ( !open( $stats_fh, '>', $output_stats_filename ) ) {
        print STDERR
          "$0 : failed to open '$output_stats_filename' for writing, $!\n";
        exit(1);
    }
    print {$stats_fh} $dumped_state, "\n";
    if ( !close($stats_fh) ) {
        print STDERR
          "$0 : failed to close '$output_stats_filename', $!\n";
        exit(1);
    }
}
print $dumped_state;

print "\n$0 : done.\n";
exit(0);

# Returns the usage text.
# Parameter: the program name to show (optional, defaults to $0).
sub usage {
    my ($program) = @_;
    $program //= $0;
    return "Usage : $program --ngram-length N"
      . " [--input-corpus FILE] [--input-state FILE]"
      . " [--output-state FILE] [--output-stats FILE]"
      . " [--separator SEP] [--help]\n";
}
1;

=pod

####

“The ancient teachers of this science,” said he, “promised impossibilities and performed nothing. The modern masters promise very little; they know that metals cannot be transmuted and that the elixir of life is a chimera but these philosophers, whose hands seem only made to dabble in dirt, and their eyes to pore over the microscope or crucible, have indeed performed miracles. They penetrate into the recesses of nature and show how she works in her hiding-places. They ascend into the heavens; they have discovered how the blood circulates, and the nature of the air we breathe.

=cut
They have acquired new and almost unlimited powers; they can command the thunders of heaven, mimic the earthquake, and even mock the invisible world with its own shadows.” Such were the professor’s words—rather let me say such the words of the fate—enounced to destroy me. As he went on I felt as if my soul were grappling with a palpable enemy; one by one the various keys were touched which formed the mechanism of my being; chord after chord was sounded, and soon my mind was filled with one thought, one conception, one purpose. So much has been done, exclaimed the soul of Frankenstein—more, far more, will I achieve; treading in the steps already marked, I will pioneer a new way, explore unknown powers, and unfold to the world the deepest mysteries of creation. #### 8-word ngrams of '“The ancient teachers of this science,” said he, “promised impossibilities and performed nothing. The modern masters promise very little; they know that metals cannot be transmuted and that the elixir of life is a chimera but these philosophers, whose hands seem only made to dabble in dirt, and their eyes to pore over the microscope or crucible, have indeed performed miracles. They penetrate into the recesses of nature and show how she works in her hiding-places. They ascend into the heavens; they have discovered how the blood circulates, and the nature of the air we breathe. 
=pod

They have acquired new and almost unlimited powers; they can command the thunders of heaven, mimic the earthquake, and even mock the invisible world with its own shadows.”'
START INDEX: 0 : “The ancient teachers of this science,” said he,
START INDEX: 1 : ancient teachers of this science,” said he, “promised
START INDEX: 2 : teachers of this science,” said he, “promised impossibilities
START INDEX: 3 : of this science,” said he, “promised impossibilities and
START INDEX: 4 : this science,” said he, “promised impossibilities and performed
START INDEX: 5 : science,” said he, “promised impossibilities and performed nothing.
START INDEX: 6 : said he, “promised impossibilities and performed nothing. The
START INDEX: 7 : he, “promised impossibilities and performed nothing. The modern
...
START INDEX: 112 : the earthquake, and even mock the invisible world
START INDEX: 113 : earthquake, and even mock the invisible world with
START INDEX: 114 : and even mock the invisible world with its
START INDEX: 115 : even mock the invisible world with its own
START INDEX: 116 : mock the invisible world with its own shadows.”
--------------------

####

=cut

#!/usr/bin/env perl

# Prints every 1- to 8-word n-gram of a fixed text excerpt, each labelled
# with the index of its first word.

use 5.026;
use warnings;

my $text = q{“The ancient teachers of this science,” said he, “promised impossibilities and performed nothing. The modern masters promise very little; they know that metals cannot be transmuted and that the elixir of life is a chimera but these philosophers, whose hands seem only made to dabble in dirt, and their eyes to pore over the microscope or crucible, have indeed performed miracles. They penetrate into the recesses of nature and show how she works in her hiding-places. They ascend into the heavens; they have discovered how the blood circulates, and the nature of the air we breathe. They have acquired new and almost unlimited powers; they can command the thunders of heaven, mimic the earthquake, and even mock the invisible world with its own shadows.”};

for my $n ( 1 .. 8 ) {
    say qq{$n-word ngrams of '$text'};
    say for nGramWords( $n, $text );
    say q{-} x 20;
}

# nGramWords( $nWords, $string )
#
# Splits $string on whitespace and returns one string per run of $nWords
# consecutive words, in order, each prefixed with "START INDEX: <i> : "
# where <i> is the zero-based index of the run's first word.
#
# Returns an empty list when $string is undefined, when $nWords is
# undefined or less than 1, or when $string has fewer than $nWords words.
sub nGramWords {
    my ( $nWords, $string ) = @_;

    return if !defined $string;

    # A window of zero (or fewer) words can never be exhausted; the earlier
    # shift-based loop never terminated for such a request.
    return if !defined $nWords || $nWords < 1;
    $nWords = int $nWords;

    # split with a single-space string skips leading whitespace, so text
    # that starts with blanks does not yield an empty first word.
    my @words = split q{ }, $string;

    # Negative when there are too few words, which makes the range empty.
    my $last_start = @words - $nWords;

    my @nGrams;
    for my $start ( 0 .. $last_start ) {
        push @nGrams,
          join q{ },
          qq{START INDEX: $start : },
          @words[ $start .. $start + $nWords - 1 ];
    }
    return @nGrams;
}
1;