comment on

This is what I ended up with and am using (incorporating fizbin's points).

I've changed the definition of a word.

Hyphenated words are now separated and treated separately
Accented words are now ascii-ised

The other difference is that HTML entities are decoded while extracting text from the HTML.
All the text (apart from curly punctuation) is Latin 1.

#!/usr/bin/perl

use strict;
use warnings;
use locale;

my $config = {
  stop => 'data/stop.txt',
  minw => 3,
  maxw => 20,
};

my $words = q(
  and cat's O'Rielly counter-productive crèche 
  2000 1980s 777777 x0000
  repeat repeat repeat
);

my %keywords = get_word(lc $words);

for (keys %keywords){
  print "$_\n";
}

sub get_word{
  
  my ($text) = @_;
  
  my ($min, $max) = ($config->{minw}, $config->{maxw});
  my %stop = get_stop($config->{stop});
  my %acc  = get_accent();
  my %pat  = get_patterns();
  my %keywords;
  for ($text){
    s/$pat{rsqu}/'/g;
    s/$pat{hyph}/ /g;
    s/(.)/$acc{$1}?$acc{$1}:$1/eg;
  }
  my @words = ($text =~ /$pat{word}/g);
  my %seen;  
  for (@words){
    s/$pat{posv}//;
    s/$pat{apos}//g;
    next if length() < $min or length() > $max;
    next if /$pat{numb}/ and not /$pat{date}/;
    s/$pat{tail}// if /$pat{numb}/;
    next if exists $stop{$_};

    unless ($seen{$_}){
      $seen{$_}++;
      $keywords{$_} = undef;
    }
  }
  return %keywords;
}    

sub get_patterns{
  my %pat = (
    posv => q/'s$/,            # possessive s
    apos => qr/'/,             # apostrophe
    hyph => qr/-/,             # hyphen
    numb => qr/\d/,            # number
    date => qr/^[12]\d{3}s?$/, # date like 1960, 1990s
    rsqu => qr/\x{2019}/,      # right single quote
    tail => qr/s$/,            # trailing s for stripping off numbers
    word => qr/([\w']+)/,      # word that may contain apostrophe
  );
  return %pat;
}

sub get_accent{
  return qw(
    À A Á A Â A Ã A Ä A Å A Æ AE 
    Ç C 
    È E É E Ê E Ë E 
    Ì I Í I Î I Ï I 
    Ð TH Ñ N 
    Ò O Ó O Ô O Õ O Ö O Ø O 
    Ù U Ú U Û U Ü U 
    Ý U Þ TH ß ss 
    à a á a â a ã a ä a å a æ ae 
    ç c 
    è e é e ê e ë e 
    ì i í i î i ï i 
    ð th ñ n 
    ò o ó o ô o õ o ö o ø o 
    ù u ú u û u ü u 
    ý y þ th ÿ y
  );
}

sub get_stop {
  # load stop word file
  # sample
  my %stop = (
    and => undef,
    the => undef,
    any => undef,
  );
  return %stop;
}
[download]

output:

---------- Capture Output ----------
> "C:\Perl\bin\perl.exe" clean_words.pl
1980
orielly
productive
creche
cat
counter
2000
repeat

> Terminated with exit code 0.
[download]

In reply to Re: Extracting keywords from HTML by wfsp
in thread Extracting keywords from HTML by wfsp

Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.


Welcome to the Monastery
	PerlMonks