#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use URI;
use File::Path;
use LWP::UserAgent;
use HTML::TokeParser;
my %opts;
getopt('dk', \%opts);
unless ($opts{d} && $opts{k}) {
error("Usage = $0 -d url -k keyword");
}
my $retrieved_document = retrieve_document($opts{d});
archive_document($opts{d}, $retrieved_document);
my $doc_rating = analyze_document($retrieved_document, $opts{k});
print "Document rating for keyword: ", $opts{k}, " = $doc_rating\n";
sub archive_document {
# creates a hierarchy of directories to
# store data in based on domain/file path
my ($url, $doc) = @_;
my $path = url_to_path($url);
mkpath([$path], 1, 0755);
open DATA, ">$path/data" or error("Can't write data: $!\n");
print DATA $doc;
close DATA;
}
sub error {
# before you ask, this is here so I can easily change
# the formatting of the error messages later on (think HTML)
my $error = shift;
print "Error: $error\n";
exit;
}
sub analyze_document {
# takes an html document and performs a (very) crude
# analysis to determine relevance to the given keyword
# returns an integer relevance rating
my ($doc, $keyword) = @_;
my $p = HTML::TokeParser->new(\$doc) || die "$!";
my %tag_weights = (
a => {
text => 2,
},
title => {
text => 5,
},
p => {
text => 1,
}
);
my $rating = 0;
# This ain't pretty, suggestions for improvements
# are greatly appreciated
while (my $token = $p->get_token) {
my $token_type = shift @{$token};
if ($token_type eq "S") {
my ($tag, $attr, $attrseq, $rawtxt) = @{$token};
for (keys %tag_weights) {
if ($tag eq $_) {
if ($p->get_text("/$tag") =~ /\Q$keyword\E/i) {
$rating += $tag_weights{$tag}{text};
}
}
}
}
}
return $rating;
}
sub retrieve_document {
my $url = shift;
my $ua = LWP::UserAgent->new;
$ua->agent("cjf/0.0.1");
my $req = HTTP::Request->new(GET => $url);
$req->header('Accept' => 'text/html');
my $res = $ua->request($req);
if ($res->is_success) {
return $res->content;
} else {
error($res->status_line);
}
# should probably add a check on the size of the document
# not a huge concern yet because it's locally submitted
}
sub url_to_path {
my $url = URI->new(shift);
print $url, "\n";
my $path = $url->host;
$path =~ tr[.][/];
$path .= $url->path;
unless (substr($url->path, -1) eq "/") {
$path .= '/';
}
return $path;
}
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.