args are --input-corpus 2.short.shelley.txt --ngram-length 2 --output-state 2.short.state
{
"counts" => {
"achieve|treading" => 1,
"acquired|new" => 1,
...big snip of similar...
"words|of" => 1,
"works|in" => 1,
},
"cum-twisted-dist" => {
achieve => ["treading", 1],
acquired => ["new", 1],
already => ["marked", 1],
and => [
"the",
0.142857142857143,
"performed",
0.285714285714286,
"that",
0.428571428571429,
"even",
0.571428571428571,
"show",
0.714285714285714,
"their",
0.857142857142857,
"almost",
1,
],
As => ["he", 1],
as => ["if", 1],
ascend => ["into", 1],
be => ["transmuted", 1],
blood => ["circulates", 1],
...big snip...
with => ["a", 0.5, "its", 1],
words => ["of", 1],
works => ["in", 1],
},
"dist" => {
"achieve|treading" => 0.0087719298245614,
"acquired|new" => 0.0087719298245614,
...big snip...
"words|of" => 0.0087719298245614,
"works|in" => 0.0087719298245614,
},
"N" => 2,
}
./2.analyse_text.pl : done.
####
args are --input-corpus 2.short.shelley.txt --ngram-length 8 --output-state 2.short.state
{
"counts" => {
"already|marked|I|will|pioneer|a|new|way" => 1,
"crucible|have|indeed|performed|miracles|They|penetrate|into" => 1,
"Frankenstein|more|far|more|will|I|achieve|treading" => 1,
"heavens|they|have|discovered|how|the|blood|circulates" => 1,
"mock|the|invisible|world|with|its|own|shadows" => 1,
"of|nature|and|show|how|she|works|in" => 1,
"one|purpose|So|much|has|been|done|exclaimed" => 1,
"only|made|to|dabble|in|dirt|and|their" => 1,
"promised|impossibilities|and|performed|nothing|The|modern|masters" => 1,
"promise|very|little|they|know|that|metals|cannot" => 1,
"sounded|and|soon|my|mind|was|filled|with" => 1,
"Such|were|the|professor|s|words|rather|let" => 1,
"they|can|command|the|thunders|of|heaven|mimic" => 1,
"the|air|we|breathe|They|have|acquired|new" => 1,
"The|ancient|teachers|of|this|science|said|he" => 1,
"the|elixir|of|life|is|a|chimera|but" => 1,
"the|fate|enounced|to|destroy|me|As|he" => 1,
"touched|which|formed|the|mechanism|of|my|being" => 1,
"unfold|to|the|world|the|deepest|mysteries|of" => 1,
"went|on|I|felt|as|if|my|soul" => 1,
"were|grappling|with|a|palpable|enemy|one|by" => 1,
},
"cum-twisted-dist" => {
"already|marked|I|will|pioneer|a|new" => ["way", 1],
"crucible|have|indeed|performed|miracles|They|penetrate" => ["into", 1],
"Frankenstein|more|far|more|will|I|achieve" => ["treading", 1],
"heavens|they|have|discovered|how|the|blood" => ["circulates", 1],
"mock|the|invisible|world|with|its|own" => ["shadows", 1],
"of|nature|and|show|how|she|works" => ["in", 1],
"one|purpose|So|much|has|been|done" => ["exclaimed", 1],
"only|made|to|dabble|in|dirt|and" => ["their", 1],
"promised|impossibilities|and|performed|nothing|The|modern" => ["masters", 1],
"promise|very|little|they|know|that|metals" => ["cannot", 1],
"sounded|and|soon|my|mind|was|filled" => ["with", 1],
"Such|were|the|professor|s|words|rather" => ["let", 1],
"they|can|command|the|thunders|of|heaven" => ["mimic", 1],
"the|air|we|breathe|They|have|acquired" => ["new", 1],
"The|ancient|teachers|of|this|science|said" => ["he", 1],
"the|elixir|of|life|is|a|chimera" => ["but", 1],
"the|fate|enounced|to|destroy|me|As" => ["he", 1],
"touched|which|formed|the|mechanism|of|my" => ["being", 1],
"unfold|to|the|world|the|deepest|mysteries" => ["of", 1],
"went|on|I|felt|as|if|my" => ["soul", 1],
"were|grappling|with|a|palpable|enemy|one" => ["by", 1],
},
"dist" => {
"already|marked|I|will|pioneer|a|new|way" => 0.0476190476190476,
"crucible|have|indeed|performed|miracles|They|penetrate|into" => 0.0476190476190476,
"Frankenstein|more|far|more|will|I|achieve|treading" => 0.0476190476190476,
"heavens|they|have|discovered|how|the|blood|circulates" => 0.0476190476190476,
"mock|the|invisible|world|with|its|own|shadows" => 0.0476190476190476,
"of|nature|and|show|how|she|works|in" => 0.0476190476190476,
"one|purpose|So|much|has|been|done|exclaimed" => 0.0476190476190476,
"only|made|to|dabble|in|dirt|and|their" => 0.0476190476190476,
"promised|impossibilities|and|performed|nothing|The|modern|masters" => 0.0476190476190476,
"promise|very|little|they|know|that|metals|cannot" => 0.0476190476190476,
"sounded|and|soon|my|mind|was|filled|with" => 0.0476190476190476,
"Such|were|the|professor|s|words|rather|let" => 0.0476190476190476,
"they|can|command|the|thunders|of|heaven|mimic" => 0.0476190476190476,
"the|air|we|breathe|They|have|acquired|new" => 0.0476190476190476,
"The|ancient|teachers|of|this|science|said|he" => 0.0476190476190476,
"the|elixir|of|life|is|a|chimera|but" => 0.0476190476190476,
"the|fate|enounced|to|destroy|me|As|he" => 0.0476190476190476,
"touched|which|formed|the|mechanism|of|my|being" => 0.0476190476190476,
"unfold|to|the|world|the|deepest|mysteries|of" => 0.0476190476190476,
"went|on|I|felt|as|if|my|soul" => 0.0476190476190476,
"were|grappling|with|a|palpable|enemy|one|by" => 0.0476190476190476,
},
"N" => 8,
}
./2.analyse_text.pl : done.
####
#!/usr/bin/env perl
# FILE: analyse_text.pl
# by bliako
use 5.011;
use warnings;
use Getopt::Long;
use Data::Dump qw/dump/;
use lib '.';
use Markov::Ndimensional;
# Command-line driver: optionally loads a previously saved state, optionally
# learns n-gram statistics from a text corpus, optionally saves the resulting
# state, and finally dumps the state to STDOUT.
#
# load_state(), learn() and save_state() are not defined in this file; they
# are presumably exported by Markov::Ndimensional -- confirm against that module.

# Keep a copy of the raw arguments: GetOptions() removes parsed options
# from @ARGV.
my @args = @ARGV;
say "args are @args";

my $input_corpus_filename = undef;
my $input_state_filename  = undef;
my $output_state_filename = undef;
my $output_stats_filename = undef;
my $separator             = '\s';    # handed to learn() as 'separator'
my $internal_separator    = '|';     # handed to learn() as 'internal-separator'
my $ngram_length          = -1;      # sentinel: must be overridden on the command line

if (
    !Getopt::Long::GetOptions(
        'input-corpus=s' => \$input_corpus_filename,
        'input-state=s'  => \$input_state_filename,
        'output-state=s' => \$output_state_filename,
        'output-stats=s' => \$output_stats_filename,
        'ngram-length=i' => \$ngram_length,
        'separator=s'    => \$separator,
        'help|h'         => sub { print STDERR usage($0); exit(0) }
    )
  )
{
    print STDERR usage($0)
      . "\n\nSomething wrong with command-line parameters...\n";
    exit(1);
}

if ( $ngram_length <= 0 ) {
    print STDERR "$0 : ngram-length must be a positive integer.\n";
    exit(1);
}

# Extra parameters for learn(). NOTE(review): 'need'/'avoid' presumably select
# which tables learn() builds -- confirm against Markov::Ndimensional.
my %params = ();
if ( defined($output_state_filename) ) {
    $params{'need'} = { 'all' => 1 };
}
else {
    $params{'avoid'} = { 'counts' => 1 };
}

my $state = undef;

# A previously saved state contributes its counts to the learn() parameters.
if ( defined($input_state_filename) ) {
    $state = load_state($input_state_filename);
    if ( !defined($state) ) {
        print STDERR "$0 : call to load_state() has failed.\n";
        exit(1);
    }
    $params{'counts'} = $state->{'counts'};
}

if ( defined($input_corpus_filename) ) {
    $state = learn(
        {
            %params,
            'ngram-length'            => $ngram_length,
            'separator'               => $separator,
            'internal-separator'      => $internal_separator,
            'remove-these-characters' => '[^a-zA-Z]',
            'input-filename'          => $input_corpus_filename,
        }
    );
    if ( !defined($state) ) {
        print STDERR "$0 : call to learn() has failed.\n";
        exit(1);
    }
}

# Neither input produced a state: nothing to save or print.
if ( !defined($state) ) {
    # The option is --input-corpus (see GetOptions above), not --input-fasta.
    print STDERR "$0 : --input-state and/or --input-corpus must be specified.\n";
    exit(1);
}

if ( defined($output_state_filename) ) {
    if ( !save_state( $state, $output_state_filename ) ) {
        print STDERR "$0 : call to save_state() has failed.\n";
        exit(1);
    }
}

# NOTE(review): --output-stats is accepted but nothing is ever written to that
# file; the state is dumped to STDOUT whether or not the option is given.
# TODO: decide what "stats" should contain and write them to the file.
print Data::Dump::dump($state);

print "\n$0 : done.\n";
exit(0);
# Build the usage/help text for this script.
#
# Parameters:
#   $program - name to show in the usage line; defaults to $0 when omitted.
# Returns:
#   a multi-line string listing the command-line options the script accepts.
sub usage {
    my ($program) = @_;
    $program //= $0;

    return join q{},
      "Usage : $program [options]\n",
      "  --input-corpus FILE   text corpus to learn from\n",
      "  --input-state FILE    previously saved state to load\n",
      "  --output-state FILE   file to save the resulting state to\n",
      "  --output-stats FILE   file for statistics output\n",
      "  --ngram-length N      n-gram length, a positive integer (required)\n",
      "  --separator REGEX     word separator (default: '\\s')\n",
      "  --help, -h            print this message and exit\n",
      "At least one of --input-corpus and --input-state must be specified.\n";
}
1;
####
“The ancient teachers of this science,” said he,
“promised impossibilities and performed nothing. The modern masters
promise very little; they know that metals cannot be transmuted and that
the elixir of life is a chimera but these philosophers, whose hands seem
only made to dabble in dirt, and their eyes to pore over the microscope or
crucible, have indeed performed miracles. They penetrate into the recesses
of nature and show how she works in her hiding-places. They ascend into the
heavens; they have discovered how the blood circulates, and the nature of
the air we breathe. They have acquired new and almost unlimited powers;
they can command the thunders of heaven, mimic the earthquake, and even
mock the invisible world with its own shadows.”
Such were the professor’s words—rather let me say such the words of
the fate—enounced to destroy me. As he went on I felt as if my soul
were grappling with a palpable enemy; one by one the various keys were
touched which formed the mechanism of my being; chord after chord was
sounded, and soon my mind was filled with one thought, one conception,
one purpose. So much has been done, exclaimed the soul of
Frankenstein—more, far more, will I achieve; treading in the steps
already marked, I will pioneer a new way, explore unknown powers, and
unfold to the world the deepest mysteries of creation.
####
8-word ngrams of '“The ancient teachers of this science,” said he,
“promised impossibilities and performed nothing. The modern masters
promise very little; they know that metals cannot be transmuted and that
the elixir of life is a chimera but these philosophers, whose hands seem
only made to dabble in dirt, and their eyes to pore over the microscope or
crucible, have indeed performed miracles. They penetrate into the recesses
of nature and show how she works in her hiding-places. They ascend into the
heavens; they have discovered how the blood circulates, and the nature of
the air we breathe. They have acquired new and almost unlimited powers;
they can command the thunders of heaven, mimic the earthquake, and even
mock the invisible world with its own shadows.”'
START INDEX: 0 : “The ancient teachers of this science,” said he,
START INDEX: 1 : ancient teachers of this science,” said he, “promised
START INDEX: 2 : teachers of this science,” said he, “promised impossibilities
START INDEX: 3 : of this science,” said he, “promised impossibilities and
START INDEX: 4 : this science,” said he, “promised impossibilities and performed
START INDEX: 5 : science,” said he, “promised impossibilities and performed nothing.
START INDEX: 6 : said he, “promised impossibilities and performed nothing. The
START INDEX: 7 : he, “promised impossibilities and performed nothing. The modern
...
START INDEX: 112 : the earthquake, and even mock the invisible world
START INDEX: 113 : earthquake, and even mock the invisible world with
START INDEX: 114 : and even mock the invisible world with its
START INDEX: 115 : even mock the invisible world with its own
START INDEX: 116 : mock the invisible world with its own shadows.”
--------------------
####
#!/usr/bin/env perl
use 5.026;
use warnings;
# Sample passage used to demonstrate word n-gram extraction.
my $text = q{“The ancient teachers of this science,” said he,
“promised impossibilities and performed nothing. The modern masters
promise very little; they know that metals cannot be transmuted and that
the elixir of life is a chimera but these philosophers, whose hands seem
only made to dabble in dirt, and their eyes to pore over the microscope or
crucible, have indeed performed miracles. They penetrate into the recesses
of nature and show how she works in her hiding-places. They ascend into the
heavens; they have discovered how the blood circulates, and the nature of
the air we breathe. They have acquired new and almost unlimited powers;
they can command the thunders of heaven, mimic the earthquake, and even
mock the invisible world with its own shadows.”};

# For every n-gram length from 1 to 8: print a heading, then one n-gram per
# line, then a 20-dash separator line.
foreach my $length ( 1 .. 8 ) {
    say qq{${length}-word ngrams of '$text'};
    foreach my $ngram ( nGramWords( $length, $text ) ) {
        say $ngram;
    }
    say q{-} x 20;
}
# Build all overlapping word n-grams of a string.
#
# Parameters:
#   $nWords - number of words per n-gram.
#   $string - text to split into whitespace-separated words.
# Returns:
#   a list with one string per n-gram, in order of starting position; each
#   string is "START INDEX: <i> : " followed by a space and the n-gram's
#   words joined by single spaces. The list is empty when $nWords is
#   undefined or less than 1, when $string is undefined, or when the string
#   has fewer than $nWords words.
sub nGramWords {
    my ( $nWords, $string ) = @_;
    my @nGrams;

    # A non-positive length can never be satisfied; bail out instead of
    # looping (the shift-until-too-short approach never terminates for
    # $nWords <= 0 because an empty word list still has ">= 0" words).
    return @nGrams if !defined $nWords || $nWords < 1 || !defined $string;

    # split on a single-space string uses awk semantics: runs of whitespace
    # separate words and leading whitespace does not produce an empty word.
    my @words = split q{ }, $string;

    # Last index at which a full n-gram still fits; negative (empty range)
    # when there are too few words.
    my $lastStart = @words - $nWords;

    for my $start ( 0 .. $lastStart ) {
        push @nGrams, join q{ },
          qq{START INDEX: $start : },
          @words[ $start .. $start + $nWords - 1 ];
    }

    return @nGrams;
}
1;