Re^3: problem count the number of words (updated)

Here's an example of counting defined sets of "words" (which can be tricky to define) based on the technique described in the Building Regex Alternations Dynamically article by haukex. If you can figure out how to get the contents of your positive and negative word data files into the corresponding arrays (and if my notion of what you want is anywhere near what you actually want), you may be on your way.

Note that the code is set up for case-insensitive matching and counting: the negative word "fourscore" matches "FoUrScOrE" in the example sentence, and so on. Note, again, that the concept of a "word" can be slippery, so the use of the \b boundary assertion, among other details, may not be appropriate.

c:\@Work\Perl\monks>perl -wMstrict -MData::Dump -le
"my @positive = qw(nation conceived liberty created equal foo);
 my @negative = qw(fourscore SEVEN fOrTh fathers continent bar);
 # The two word lists to count.  Mixed case in the lists is deliberate:
 # keys are lowercased below and the regexes match case-insensitively.
 # foo and bar never occur in the sentence, so their counts stay at 0.
 ;;
 my $sentence = 'FoUrScOrE and seven years ago '
              . 'our fathers brought forth, on this continent, '
              . 'a new nation, conceived in liberty, and dedicated '
              . 'to the proposition that all men are created equal. '
              . 'Repeat seven nation fathers nation.'
              ;
 ;;
 # Seed every listed word (lowercased) with a count of 0 so that words
 # which never occur still appear in the final dump.
 my %pos = map { lc($_) => 0 } @positive;
 # One alternation regex built from the keys of the hash.
 my $rx_pos = make_regex(\%pos);
 print 'for debug: positive rx: ', $rx_pos;
 ;;
 # Same preparation for the negative word list.
 my %neg = map { lc($_) => 0 } @negative;
 my $rx_neg = make_regex(\%neg);
 print 'for debug: negative rx: ', $rx_neg;
 ;;
 # Tally of every alphabetic word that is in neither list.
 my %other;
 # Zero-width negative lookahead: succeeds only at positions where
 # neither a positive nor a negative word starts.
 my $rx_undefined = qr{ (?! $rx_pos | $rx_neg) }xms;
 # A run of alphabetic characters with a word boundary on each side.
 my $rx_word      = qr{ \b [[:alpha:]]+ \b }xms;
 ;;
 # A global match in list context returns every match (the patterns have
 # no capture groups); each match is lowercased and used as the hash key.
 ++$pos  { lc $_ } for $sentence =~ m{ $rx_pos }xmsg;
 ++$neg  { lc $_ } for $sentence =~ m{ $rx_neg }xmsg;
 ++$other{ lc $_ } for $sentence =~ m{ $rx_undefined $rx_word }xmsg;
 ;;
 # Dump the three tallies with dd from Data::Dump.
 dd \%pos;
 dd \%neg;
 dd \%other;
 ;;
 ;;
 sub make_regex {
   # Build a single case-insensitive regex that matches any key of the
   # hash referenced by $hr_wordlist as a whole word (\b on both sides).
   # Parameter: $hr_wordlist - reference to a hash whose keys are the words.
   # Returns:   a compiled qr// object.
   my ($hr_wordlist) = @_;
   ;;
   # An empty word list would yield an empty alternation, which matches
   # the empty string at every word boundary.  Return a pattern that can
   # never match instead.
   return qr{ (?!) }xms if not %$hr_wordlist;
   ;;
   # The list pipeline reads bottom-up: take the keys, sort them on the
   # raw strings, escape regex metacharacters, join them into one
   # alternation, and wrap that in the word-boundary pattern.
   my ($rx) =
     map  qr{ (?i) \b (?: $_) \b }xms,
     join '|',
     map  quotemeta,
     reverse sort
     keys %$hr_wordlist
     ;
   ;;
   return $rx;
   }
"
for debug: positive rx: (?msx-i: (?i) \b (?: nation|liberty|foo|equal|
+created|conceived) \b )
for debug: negative rx: (?msx-i: (?i) \b (?: seven|fourscore|forth|fat
+hers|continent|bar) \b )
{ conceived => 1, created => 1, equal => 1, foo => 0, liberty => 1, na
+tion => 3 }

{ bar => 0, continent => 1, fathers => 2, forth => 1, fourscore => 1, 
+seven => 2 }

{
  a           => 1,
  ago         => 1,
  all         => 1,
  "and"       => 2,
  are         => 1,
  brought     => 1,
  dedicated   => 1,
  in          => 1,
  men         => 1,
  new         => 1,
  on          => 1,
  our         => 1,
  proposition => 1,
  repeat      => 1,
  that        => 1,
  the         => 1,
  this        => 1,
  to          => 1,
  years       => 1,
}
[download]

Update: In the make_regex() function, the lines
reverse sort
map quotemeta,
were swapped (now fixed); they should be
map quotemeta,
reverse sort
i.e., sort-ing, either lexically or by length, should be done on the raw strings before the quotemeta step.

Give a man a fish: <%-{-{-{-<

Comment on Re^3: problem count the number of words (updated) Select or Download Code

Replies are listed 'Best First'.
Re^4: problem count the number of words by GHMON (Novice) on Dec 28, 2018 at 09:37 UTC
Hi TNX bro i could find the my problem , my problem was about these section foreach (my $word = <$inwp>) { $countp{$word}++ ; } , foreach (my $word = <$nwt>) {$countn{$word}++ ; } because i split the words by /\n/ that my code didn't need to this separator my new code
#Hi Codder
# Splits Positive.txt and Negative.txt line by line into a training file
# (roughly the first 70% of the lines) and a test file (the remainder),
# then tallies lines read from the word files Positive2.txt / Negative2.txt
# and appends the tallies to wordsbagp.txt / wordsbagn.txt.
use warnings ;
use strict ;
#use DBI ;
use utf8 ;
use Encode ;

my $numlinep = 0 ;
my $traincountp = 0 ;
my $pt = '/root/Positive.txt' ;
my $ptt = '/root/positivetrain1.txt' ;
my $pwt = '/root/Positive2.txt' ;
my $ntw = '/root/Negative2.txt' ;
my $pttt = '/root/positivetest.txt' ;

# First pass: count the lines of the positive file.
open (my $in , "<:encoding(utf8)" , "$pt") or die "$pt: $!" ;
while (my $line = <$in>) {
    $numlinep++ ;
}
close $in ;

# Second pass: lines up to 70% of the total go to the training file,
# the rest go to the test file.
open ($in , "<:encoding(utf8)" , "$pt") or die "$pt: $!" ;
while (my $linep = <$in>) {
    my $inp ;
    # NOTE(review): $linep is not chomped, so appending "\n" writes an
    # extra blank line after every record.
    if ($traincountp <= (0.7 * $numlinep)){
        open ($inp , ">>" , "$ptt") or die "$ptt: $!" ;
        print $inp $linep , "\n" ;
    }
    if ($traincountp > (0.7 * $numlinep)){
        open (my $inpt , ">>" , "$pttt") or die "$pttt: $!" ;
        print $inpt $linep , "\n" ;
    }
    $traincountp++ ;
}
close $in ;

my $numlinet = 0 ;
my $traincountn = 0 ;
my $nt = '/root/Negative.txt' ;
my $ntt = '/root/negativetrain1.txt' ;
my $nttt = '/root/negativetest.txt' ;

# Same two passes for the negative file.
open (my $it , "<:encoding(utf8)" , "$nt") or die "$nt: $!" ;
while (my $line = <$it>) {
    $numlinet++ ;
}
close $it ;

open ($it , "<:encoding(utf8)" , "$nt") or die "$nt: $!" ;
while (my $linen = <$it>) {
    my $itn ;
    if ($traincountn <= (0.7 * $numlinet)){
        open ($itn , ">>" , "$ntt") or die "$ntt: $!" ;
        print $itn $linen , "\n" ;
    }
    if ($traincountn > (0.7 * $numlinet)) {
        open (my $ittn , ">>" , "$nttt") or die "$nttt: $!" ;
        print $ittn $linen , "\n" ;
    }
    $traincountn++ ;
}
close $it ;

my $numlinepw = 0 ;
my %countp = () ;
open (my $inw , "<:encoding(utf8)" , "$ptt") or die "$ptt: $!" ;
open (my $inwp , "<:encoding(utf8)" , "$pwt") or die "$pwt: $!" ;
# For every line of the positive training file ...
while (<$inw>) {
    my @pwords ;
    my @ptw ;
    my $elementp ;
    my $countp ;
    #@pwords = split (/\n/ , $inwp) ;
    #push @ptw , @pwords ;
    # NOTE(review): the scalar assignment reads ONE line from the word
    # file per pass, so this loop body runs exactly once per outer
    # iteration, and the key keeps its trailing newline (no chomp).
    foreach (my $word = <$inwp>) { $countp{$word}++ ; }
    # ... append the whole tally gathered so far to the output file.
    while ( ( my $kp , my $vp) = each %countp ) {
        # Checked like the wordsbagn.txt open further down.
        open (my $hashp , ">>" , 'wordsbagp.txt') or die "wordsbagp.txt: $!" ;
        print $hashp "$kp = $vp\n" ;
        #print "$kp => $vp\n" ;
        #print "$kp" , "\n" ;
        #print "$vp" , "\n" ;
        #print "$kp" , "\n" , "$vp" , "\n" ;
    }
    $numlinepw++ ;
}
#print "$numlinepw" , "\n" ;

my $numlinenw = 0 ;
my %countn = () ;
open (my $itw , "<:encoding(utf8)" , "$ntt") or die "$ntt: $!" ;
open (my $nwt , "<:encoding(utf8)" , "$ntw" ) or die "$ntw: $!" ;
# Same procedure for the negative training file and word file.
while (<$itw>) {
    $numlinenw++ ;
    my @nwords ;
    my @ntw ;
    my $elementn ;
    my $countn ;
    #@nwords = split (/\n/ , $nwt) ;
    #push @ntw , @nwords ;
    # NOTE(review): reads one line of the word file per pass, as above.
    foreach (my $word = <$nwt>) { $countn{$word}++ ; }
    while ( ( my $kn , my $vn ) = each %countn ) {
        open (my $hashn , '>>' , 'wordsbagn.txt') or die $! ;
        print $hashn "$kn = $vn\n" ;
        #print "$kn => $vn\n" ;
        #print "$kn" , "\n" ;
        #print "$vn" , "\n" ;
        #print "$kn" , "\n" , "$vn" , "\n" ;
    }
}
print 'Finish First Section' , "\n" ;
[download]	[reply] [d/l]
Re^5: problem count the number of words by poj (Abbot) on Jan 01, 2019 at 17:06 UTC
`foreach (my $word = <$inwp>) { $countp{$word}++ ; }` [download] You appear to be just counting the words in the words file `Positive2.txt` and I can't see where you are spitting the sentences in `Positive.txt` into words. Perhaps this will help you progess
#!/usr/bin/perl
# Counts, for each training sentence in Positive.txt, how often each word
# listed in Positive2.txt occurs, and writes the per-line counts to
# wordsbagp.txt.  The first 70% of the sentences form the training set.
use strict;
use warnings;

# create positive word hash
my $pos_words = '/root/Positive2.txt';
open my $fh_in,'<:encoding(utf8)',$pos_words
  or die "Could not open $pos_words : $!";
# Trim leading and trailing whitespace (including the line ending) so the
# keys compare equal to the words split out of the sentences below.
my %pos_word = map { s/^\s+|\s+$//g; $_=>1 } <$fh_in>;
close $fh_in;

# open positive word count output file
# The encoding layer matches the input files, whose decoded text is
# written back out below.
my $pos_outfile = '/root/wordsbagp.txt';
open my $fh_out,'>:encoding(utf8)',$pos_outfile
  or die "Could not open $pos_outfile : $!";

# read positive sentence file (reuses the handle declared above)
my $pos_text = '/root/Positive.txt';
open $fh_in,'<:encoding(utf8)',$pos_text
  or die "Could not open $pos_text : $!";
my @train = <$fh_in>;
close $fh_in;

# split into 2 arrays 70% / 30%
my $offset = int 0.7 * @train;
push my @test, splice @train,$offset;

my $lineno = 0;
for my $line (@train){
  ++$lineno;

  # split sentence into words
  chomp($line);
  my @words = split /\b/,$line;

  # count positive words only (the comparison is case-sensitive)
  my %count = ();
  for my $word (@words){
    ++$count{$word} if exists $pos_word{$word};
  }

  # print results
  print $fh_out "\nline $lineno: $line\n";
  for my $word (sort keys %count){
    printf $fh_out " %-10s => %d\n",$word,$count{$word};
  }
};

# Buffered write errors only surface when the handle is closed.
close $fh_out
  or die "Could not close $pos_outfile : $!";
[download] poj	[reply] [d/l] [select]