Re: How can I use Clusterize

Replies are listed 'Best First'.
Re^2: How can I use Clusterize by joni (Initiate) on Apr 21, 2011 at 13:32 UTC
Yes, I would appreciate your help. Actually, I'm trying to get news articles from different sources and cluster them according to their content (similar to GoogleNews). This is my code for getting the contents of news articles: use WWW::Newsgrabber; use LWP::Simple; use HTML::ContentExtractor; use LWP::UserAgent; $kot_tekst='HASH\|albania.htmlcomments\|lajmet\|free_web_stats\|index.html +$\|ne.html$\|arkivi.html$\|IMG\|html#c'; my $extractor = HTML::ContentExtractor->new(); my $agent=LWP::UserAgent->new; $dirname = "C:\\Users\\Administrator\\Desktop\\corpus"; my $j=1; $obj[0]= WWW::Newsgrabber->new( url => 'http://www.shekulli.com.al/biz +nes/', regex => '\.html' ); $obj[1] = WWW::Newsgrabber->new( url => 'http://www.gazeta-shqip.com/# +/ekonomi', regex => '\.html' ); foreach $item (@obj){ my $ResultHashRef = $item->getNews(); while ( my ($url,$name)=each(%{$ResultHashRef})){ if ($url !~ /$kot_tekst/){ $counter++; my $res=$agent->get($url); my $HTML = $res->decoded_content(); $extractor->extract($url,$HTML); $c= $extractor->as_text(); $c =~ m/ KOMENTE/g; $c=substr($c,1,pos($c)-7); $hash_biznes{$url}=$c; }}}; [download] This works fine, but as I said, I now want to cluster different articles, to find the similar ones.	[reply] [d/l]

Replies are listed 'Best First'.

Re^2: How can I use Clusterize
by joni (Initiate) on Apr 21, 2011 at 13:32 UTC

use WWW::Newsgrabber; 
use LWP::Simple;
use HTML::ContentExtractor;
use LWP::UserAgent;

use strict;
use warnings;

# Fetch articles from a list of news-site index pages and store the
# extracted text of each one in %hash_biznes, keyed by article URL.

# Alternation of URL fragments; any URL matching it is skipped.
# (Presumably these are non-article links such as archives, images and
# comment anchors -- confirm against the sites.)
# The dots are left unescaped, as in the original pattern, so they match any
# character. The forum's "+" line-wrap marker that had been pasted into the
# middle of 'index.html$' is removed here.
my $kot_tekst = join '|',
    'HASH', 'albania.htmlcomments', 'lajmet', 'free_web_stats',
    'index.html$', 'ne.html$', 'arkivi.html$', 'IMG', 'html#c';

# Compile once instead of re-interpolating the string on every iteration.
my $skip_url_re = qr/$kot_tekst/;

my $extractor = HTML::ContentExtractor->new();
my $agent     = LWP::UserAgent->new;

# Not used in this part of the script; kept for code that may follow.
my $dirname = "C:\\Users\\Administrator\\Desktop\\corpus";
my $j       = 1;

# One WWW::Newsgrabber object per source page. The URLs are written on a
# single line each: the wrapped form embedded a newline and a "+" in them.
my @obj = (
    WWW::Newsgrabber->new(
        url   => 'http://www.shekulli.com.al/biznes/',
        regex => '\.html',
    ),
    WWW::Newsgrabber->new(
        url   => 'http://www.gazeta-shqip.com/#/ekonomi',
        regex => '\.html',
    ),
);

my $counter = 0;    # number of URLs that passed the skip filter
my %hash_biznes;    # article URL => extracted article text

for my $item (@obj) {
    my $ResultHashRef = $item->getNews();

    while ( my ( $url, $name ) = each %{$ResultHashRef} ) {
        next if $url =~ $skip_url_re;
        $counter++;

        # Do not feed error pages (404, timeouts, ...) to the extractor.
        my $res = $agent->get($url);
        if ( !$res->is_success ) {
            warn "Skipping $url: ", $res->status_line, "\n";
            next;
        }

        # decoded_content() returns undef when the body cannot be decoded.
        my $HTML = $res->decoded_content();
        if ( !defined $HTML ) {
            warn "Skipping $url: could not decode response body\n";
            next;
        }

        $extractor->extract( $url, $HTML );
        my $c = $extractor->as_text();

        # Cut the text off at the first " KOMENTE" marker (presumably the
        # start of the reader-comments section -- confirm).
        # Only trim when the marker is actually present: without a match
        # pos() is undef, and the resulting negative length used to chop the
        # first character and the last seven characters of the article.
        # NOTE(review): the offsets are unchanged from the original; they
        # drop the first character and keep the marker's leading " K".
        # Confirm that this is what is wanted.
        if ( $c =~ m/ KOMENTE/g ) {
            $c = substr( $c, 1, pos($c) - 7 );
        }

        $hash_biznes{$url} = $c;
    }
}
[download]

[reply]
[d/l]