#!/usr/bin/perl -w
########################################################
# Program written by Tom Freeman 2003
# -----------------------------------
# Purpose: To collect information about the technologies
# being used on websites, particularly client side
# technologies. As well as the server the sites are using.
#
# Improvements
# v1.8: Added support for the META exclusion protocol
# i.e. <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
#
# v3.3: now retains a list of new domains on disk in
# domains.dat file
#
########################################################
require LWP::Parallel::UserAgent;
use HTTP::Request;
#use LWP::Debug qw(+); # display tons of debugging messages. See 'perl
+doc LWP::Debug'
use URI;
use URI::URL;
use HTML::LinkExtor;
use LWP::Simple qw(get);
use Time::SoFar qw( runtime runinterval figuretimes );
use Fcntl;
use DBI;
my $domainsfile = 'domains.dat';
####### VARIABLES ######################################
my $pua = LWP::Parallel::UserAgent->new();
$pua->in_order (1); # handle requests in order of registration
$pua->duplicates(0); # ignore duplicates
$pua->timeout (2); # in seconds
$pua->redirect (1); # follow redirects
my @domains=();
push @domains, 'http://www.bbc.co.uk'; #load first value onto the doma
+ins array
my $pass_counter = '0';
$num_domains = scalar @domains;
while ($num_domains > '0') { # Start Loopy Code
######## Load up the old domains file from disk ######
open DOMAINSFILE, "< $domainsfile" || print "Failed to open logfile
+for writing: $!";
@domains = (<DOMAINSFILE>);
close(DOMAINSFILE);
######################################################
$pass_counter++;
my @selected_domains=();
while (scalar @selected_domains <= 10) {
my $new_domain = shift @domains;
my $visited = &checkExists($new_domain);
if (!$visited) {
push @selected_domains, $new_domain;
}
}
my $reqs = [
HTTP::Request->new('GET', $selected_domains[0]),
HTTP::Request->new('GET', $selected_domains[1]),
HTTP::Request->new('GET', $selected_domains[2]),
HTTP::Request->new('GET', $selected_domains[3]),
HTTP::Request->new('GET', $selected_domains[4]),
HTTP::Request->new('GET', $selected_domains[5]),
HTTP::Request->new('GET', $selected_domains[6]),
HTTP::Request->new('GET', $selected_domains[7]),
HTTP::Request->new('GET', $selected_domains[8]),
HTTP::Request->new('GET', $selected_domains[9]),
];
foreach my $req (@$reqs) {
if ($req->url) {
print "Registering ".$req->url."\n";
if ( my $res = $pua->register ($req) ) {
print STDERR $res->error_as_HTML;
}
}
}
$entries = $pua->wait();
foreach (keys %$entries) {
my $res = $entries->{$_}->response;
my $domain = $res->request->url;
if ($res->is_success) {
# Do lots of interesting stuff with the
# content and write out to the database
}
}
$num_domains = scalar @domains; # get the size of array
######### Write out the new domains file ######
open DOMAINSFILE, "> $domainsfile" || print "Failed to open logfile
+for writing: $!";
foreach my $domain (@domains) {
print DOMAINSFILE "$domain\n";
}
close(DOMAINSFILE);
###############################################
(@domains)=();
undef %$entries;
}
I hope this isn't too much code to annoy people, i'd be interested to hear any of your thoughts, ideas and comments?
Many Thanks, Tom |