#!/usr/local/bin/perl use warnings; use strict; use WWW::Mechanize; use URI::Escape; use HTML::Entities 'decode_entities'; use HTML::TokeParser::Simple; use Getopt::Long; no warnings 'defined'; # simple hack to avoid warnings for pages that weren't fetched $|++; # so we can see the results as they are printed GetOptions( 'help|?' => \&usage, 'user=s' => \my $TARGET_USER, 'verbose=i' => \my $VERBOSE, 'syndicated' => \my $INCLUDE_SYNDICATED, 'communities' => \my $INCLUDE_COMMUNITIES, 'minimum=i' => \my $minimum, ); $minimum ||= 1; $VERBOSE ||= 0; # bad regexes. Need to improve them! use constant FRIENDS => { regex => qr{href='http://www\.livejournal\.com/(?:community|users)/[^/]+/friends'}, label => 'friends', instance => qr{^/userinfo.bml\?user=(.*)$}, }; use constant INTERESTS => { regex => qr{href='/interests\.bml'}, label => 'interests', instance => qr{^/interests.bml\?int=(.*)$}, }; my $MECH = WWW::Mechanize->new; $TARGET_USER ||= die "You must supply an LJ username"; print "Fetching user info for ($TARGET_USER) ...\n" if $VERBOSE; my $html = get_user_info($TARGET_USER); print "Fetching friends list for ($TARGET_USER)...\n" if $VERBOSE; my $users = get_list($html,FRIENDS); my $current = 1; my $count = @$users; my %sections; foreach my $user (@$users) { print "Fetching $user: $current out of $count\n" if $VERBOSE; sleep 1; # be nice to their server $current++; print "Fetching user info for ($user) ...\n" if $VERBOSE > 1; my $html = get_user_info($user); next unless $html; print "Fetching interests for ($user) ...\n" if $VERBOSE > 1; my $interests = get_list($html, INTERESTS); foreach my $interest (@$interests) { $interest = uri_unescape($interest); $interest =~ tr/+/ /; $sections{$interest}++; } } my @results = sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] } map { [ $_, $sections{$_}, ($sections{$_} / $count) * 100] } grep { $sections{$_} >= $minimum } keys %sections; foreach my $interest (@results) { printf "%30s %d %.1d%%\n", @$interest; } sub get_list { my ($html,$section) = @_; my $parser = HTML::TokeParser::Simple->new(\$html); while (my $token = $parser->get_token) { next unless $token->as_is =~ /$section->{regex}/; last; } $parser->get_tag('td'); # advance to first td tag my @sections; while (my $token = $parser->get_token) { last if $token->is_end_tag('td'); # we're at the end of the member # table data element next unless $token->is_start_tag('a'); if ($token->return_attr->{href} =~ $section->{instance}) { push @sections => decode_entities($1); } } printf("\t%d %s found\n", scalar @sections, $section->{label}) if $VERBOSE > 1; return \@sections; } sub get_user_info { my $user = shift; my $info = sprintf "http://www.livejournal.com/userinfo.bml?user=%s&mode=full" => uri_escape($user); my $page = $MECH->get($info); my $html = $MECH->content; if ('Error' eq $MECH->title && $html =~ /Unknown user/) { # this isn't perfect, but it's reasonable since LJ does # not return error codes warn "User ($user) not found"; return; } if ($user ne $TARGET_USER && $MECH->title =~ /Syndicated Account/ && ! $INCLUDE_SYNDICATED) { print "\tSkipping syndicated account ($user)\n" if $VERBOSE; return; } if ($user ne $TARGET_USER && $MECH->title =~ /Community Info/ && ! $INCLUDE_COMMUNITIES) { print "\tSkipping community ($user)\n" if $VERBOSE; return; } return $html; } sub usage { print <<" END_USAGE"; $0 will calculate the number of interests that the friends for a given user has. $0 --user [options] --help Display this information and exit --? Same as '--help' --user Mandatory. This is the user we will fetch friends for. --syndicated Include syndicated "friends" (e.g., doonesbury) --communities Include communities --minimum Takes and integer, \$x. Does not report on interests shared by only \$x or fewer people. --verbose Takes an integer (0, 1, or 2). If zero, will print nothing but the interest list (this is the default). 1 and 2 will print more and more information. These are useful to let you know the program has not "hung" if you're working with a large list or over a slow connection. Example: $0 --user publius_ovidius --verbose 2 --communities That will calculate the common interests for friends of publius_ovidius, displaying verbose information and includes community interests (but does not include syndicated feed interests). Note that arguments may be abbreviated to the first letter. The above command may be written as: $0 -u publius_ovidius -v 2 -c END_USAGE exit; }