comment on

#!/usr/bin/perl -w

use strict;

use Getopt::Std;
use URI;
use File::Path;
use LWP::UserAgent;
use HTML::TokeParser;

my %opts;

getopt('dk', \%opts);

unless ($opts{d} && $opts{k}) {
    error("Usage = $0 -d url -k keyword");
}

my $retrieved_document = retrieve_document($opts{d});

archive_document($opts{d}, $retrieved_document);

my $doc_rating = analyze_document($retrieved_document, $opts{k});

print "Document rating for keyword: ", $opts{k}, " = $doc_rating\n";

sub archive_document {

    # creates a hierarchy of directories to
    # store data in based on domain/file path

    my ($url, $doc) = @_;

    my $path = url_to_path($url);

    mkpath([$path], 1, 0755);

    open DATA, ">$path/data" or error("Can't write data: $!\n");
        print DATA $doc;
    close DATA;

}


sub error {

    # before you ask, this is here so I can easily change
    # the formatting of the error messages later on (think HTML)

    my $error = shift;
    print "Error: $error\n";
    exit;
}

sub analyze_document {

    # takes an html document and performs a (very) crude
    # analysis to determine relevance to the given keyword
    # returns an integer relevance rating

    my ($doc, $keyword) = @_;

    my $p = HTML::TokeParser->new(\$doc) || die "$!";

    my %tag_weights = (
        a => {
            text => 2,
        },
        title => {
            text => 5,
        },
        p => {
            text => 1,
        }
    );


    my $rating = 0;

    # This ain't pretty, suggestions for improvements
    # are greatly appreciated

    while (my $token = $p->get_token) {

        my $token_type = shift @{$token};

        if ($token_type eq "S") {

            my ($tag, $attr, $attrseq, $rawtxt) = @{$token};

            for (keys %tag_weights) {

                if ($tag eq $_) {

                    if ($p->get_text("/$tag") =~ /\Q$keyword\E/i) {
                        $rating += $tag_weights{$tag}{text};
                    }
                }

            }
        }
    }

    return $rating;

}


sub retrieve_document {

    my $url = shift;

    my $ua = LWP::UserAgent->new;

    $ua->agent("cjf/0.0.1");

    my $req = HTTP::Request->new(GET => $url);

    $req->header('Accept' => 'text/html');

    my $res = $ua->request($req);

    if ($res->is_success) {
        return $res->content;
    } else {
        error($res->status_line);
    }

    # should probably add a check on the size of the document
    # not a huge concern yet because it's locally submitted

}

sub url_to_path {

    my $url  = URI->new(shift);

    print $url, "\n";

    my $path = $url->host;

    $path  =~ tr[.][/];

    $path .= $url->path;

    unless (substr($url->path, -1) eq "/") {
        $path .= '/';
    }

    return $path;

}
[download]

In reply to Simple HTML doc retrieval and analysis script by cjf

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.