#!/usr/bin/perl
package Metabot;

# Metabot - crawl a site with WWW::SimpleRobot and, for every page visited,
# print its URL and depth, a short summary (HTML::Summary) and a keyword
# list (Lingua::EN::Keywords) built from the summary plus the page text.
#
# Usage: perl metabot4.pl <start-url>
#
# The package doubles as an HTML::Parser subclass: its text() method
# collects the text chunks of one document into $self->{TEXT}.

use strict;
use warnings;

use WWW::SimpleRobot;
use HTML::Entities;
require HTML::Parser;
use HTML::Summary;
use HTML::TreeBuilder;
use Lingua::EN::Keywords;

our @ISA = qw(HTML::Parser);

my $start_url = $ARGV[0];
die "Usage: $0 <start-url>\n"
    unless defined $start_url && length $start_url;

my $robot = WWW::SimpleRobot->new(
    URLS                 => [ $start_url ],
    # quotemeta: the URL is literal text, so '.', '?', '+' etc. in it must
    # not be interpreted as regex metacharacters.
    FOLLOW_REGEX         => '^' . quotemeta($start_url),
    DEPTH                => 2,
    TRAVERSAL            => 'depth',
    VISIT_CALLBACK       => \&report_page,
    BROKEN_LINK_CALLBACK => \&report_broken_link,
);
$robot->traverse;

# VISIT_CALLBACK: print URL/depth, summary and keywords for one page.
#   $url   - address of the page just fetched
#   $depth - crawl depth at which it was found
#   $html  - page content
#   $links - links found on the page (unused here)
sub report_page {
    my ( $url, $depth, $html, $links ) = @_;
    print "$url - depth $depth\n";

    $html = '' unless defined $html;

    # NOTE(review): entities are decoded *before* the markup is parsed, so
    # an escaped "&lt;" turns into a real "<" and may be read as a tag.
    # Kept as in the original; confirm whether this is intended.
    $html = decode_entities($html);
    $html =~ s/document\.write\(.+?\)\;//g;    # drop inline document.write(...) calls
    $html =~ s/\&\#.+?\;//g;                   # drop leftover numeric entities

    my $summary = summarize($html);
    print "Summary: $summary\n";

    my $text     = page_text($html);
    my @keywords = keywords( $summary . $text );
    print "Keywords: " . join( ", ", @keywords ) . "\n\n";
}

# BROKEN_LINK_CALLBACK: report a link that could not be fetched on STDERR.
sub report_broken_link {
    my ( $url, $linked_from, $depth ) = @_;
    print STDERR "$url looks like a broken link on $linked_from\n";
    print STDERR "Depth = $depth\n";
}

# Return a summary of at most 250 characters for $html, with every run of
# whitespace collapsed to a single space. Returns '' if no summary is made.
sub summarize {
    my ($html) = @_;

    my $tree = HTML::TreeBuilder->new;
    $tree->parse($html);
    $tree->eof;    # tell the builder the document is complete

    my $summarizer = HTML::Summary->new(
        LENGTH   => 250,
        USE_META => 1,
    );
    my $summary = $summarizer->generate($tree);

    $tree->delete;    # release the element tree explicitly

    $summary = '' unless defined $summary;
    $summary =~ s/\s+/ /gs;
    return $summary;
}

# Return the concatenated text content of $html.
# A fresh parser is created for every document: the text() handler appends
# to $self->{TEXT}, so reusing one parser would mix the text of all earlier
# pages into the keywords of the current page.
sub page_text {
    my ($html) = @_;

    my $parser = Metabot->new;
    $parser->parse($html);
    $parser->eof;    # flush any text still buffered by the parser

    return defined $parser->{TEXT} ? $parser->{TEXT} : '';
}

# HTML::Parser text callback: append one chunk of document text to
# $self->{TEXT}.
sub text {
    my ( $self, $text ) = @_;
    $self->{TEXT} .= $text;
}

__END__

Sample run, recorded with the original script (one parser shared by all
pages, so the keywords of later pages include text from earlier pages):

####
[Ganesha:~/Desktop] davistv% perl metabot4.pl http://cincypg.org
http://cincypg.org/ - depth 0
Summary: The Cincinnati Programmers' Guild is founded on the premise that the art of software design is best practiced with a sense of craftsmanship and personal responsibility. 2003 February 11 - Member Tom Wulf going on Safari CPG Member Tom Wulf has volunt
Keywords: art of software design, member tom wulf, guild, tom wulf, safari cpg member tom wulf, cincinnati programmers

http://cincypg.org/legal/Bylaws.html - depth 1
Summary: BYLAWS OF THE CINCINNATI PROGRAMMERS GUILD April 16, 2002 ARTICLE I. ARTICLE II.
Section 1. ARTICLE III. Section 1. Section 2. Section 3. Section 4. Section 5. ARTICLE IV. Section 1. Section 2. Section 3. Section 4. Section 5. Section 6. Section 7. S Keywords: section, meetings, council, member, offices, councilors http://cincypg.org/contact.shtml - depth 1 Summary: Cincinnati Programmers' Guild: Contacts General Guild Information Troy Davis Foundertroy@glyss.com Secretary Jason Paul Secretaryjason@adaptiveinfosystems.com Webmaster Jeremy Phelps Guild Webmasterwebmaster@cincyp Keywords: meetings, section, council, member, offices, guild http://cincypg.org/cgi-bin/links.pl - depth 1 Summary: Submit a link for inclusion on this page: Title: URL: Select a categoryTutorials and Online DocumentationOther Computer GroupsInformation Technology NewsOtherUSENET NewsgroupsAlgorithm sitesComputer humorOpinionated tripeVendor WebsitesOther Guilds Keywords: meeting, section, council, guild, member, offices http://cincypg.org/directions.shtml - depth 1 Summary: Cincinnati Programmers' Guild This page has moved. Keywords: meeting, section, council, member, guild, offices http://cincypg.org/events.shtml - depth 1 Summary: Many thanks to our host: Future Events Event TypeWhenWhere(Click for directions.)TopicPresenter(s) Monthly MeetingJune 17th, 2003 18:30 (6:30PM)KiZAN TechnologiesInvitation to CVSMr. Possible Future Topics: ActionScript: Flash isn't just for designer Keywords: meeting, monthly meeting, section, ), 6:00pm, council http://cincypg.org/subscribe.shtml - depth 1 Summary: Enter your e-mail address below. Check here if you are unsubscribing. Keywords: meeting, monthly meeting, section, ), council, 6:00pm http://cincypg.org/join.shtml - depth 1 Summary: Join the Cincinnati Programmers' Guild We are not currently keeping an Official Membership List, nor is there a formal definition of who is and is not a Guild member. Just show up at our next meeting. Keywords: meeting, monthly meeting, members, section, ), guild