Adjust the way the get_sids() sub is called:
# sid of the last item that has already been processed; get_sids() stops
# collecting as soon as it meets a link carrying this sid
my $lproc = 621557;
# collect the sids found before $lproc is reached
my @sids = get_sids($url, $pid, $lproc);
Change the sub:
# get_sids($url, $pid, $lproc)
#
# Fetches the listing at $url page by page and collects the "sid" query
# parameter of every link found, in document order. Collection stops at the
# first link whose sid equals $lproc; that sid itself is not returned.
#
# Parameters:
#   $url   - base URL; its query is set to "page" and "pid" for each request
#   $pid   - value sent as the "pid" query parameter
#   $lproc - last processed sid (compared numerically)
#
# Returns the list of collected sids.
#
# Dies if a page cannot be fetched or parsed, if a page contains no anchors,
# or if more than $max_pages pages would have to be requested.
sub get_sids {
    my ($url, $pid, $lproc) = @_;

    # avoid accidental indefinite loops hammering the server, adjust to suit
    my $max_pages = 7;

    my $page    = 1;
    my $fetched = 0;
    my $uri     = URI->new($url);
    my @sids;

    while ($page) {
        # check the limit before the request so the server is never hit
        # more than $max_pages times
        die qq{more than $max_pages pages requested, giving up\n}
            if ++$fetched > $max_pages;

        # build the uri
        $uri->query_form(page => $page, pid => $pid);
        my $uri_string = $uri->as_string;

        # get the content, check for success
        # ($! is not set by LWP::Simple, so report the uri instead)
        my $content = get($uri_string);
        die qq{LWP get failed: $uri_string\n}
            unless defined $content && length $content;

        # build the tree
        my $t = HTML::TreeBuilder->new_from_content($content)
            or die qq{new from content failed: $uri_string\n};

        # get a list of all anchor tags
        my @anchors = $t->look_down(_tag => q{a});
        if (!@anchors) {
            $t->delete;
            die qq{no anchors found in: $uri_string\n};
        }

        # look at each anchor
        my $more = 1;    # cleared once the last processed sid is seen
        ANCHOR:
        for my $anchor (@anchors) {
            my $href = $anchor->attr(q{href});
            next ANCHOR unless $href;

            # test for a sid in the query part of the link
            my %q   = URI->new($href)->query_form;
            my $sid = $q{sid};
            next ANCHOR unless $sid;

            # stop collecting at the last processed sid
            if ($sid == $lproc) {
                $more = 0;
                last ANCHOR;
            }

            # otherwise save it
            push @sids, $sid;
        }

        # see if there is another page; a false value ends the while loop
        $page = $more ? get_next_page($t) : 0;

        # release the parse tree explicitly once it is no longer needed
        $t->delete;
    }

    # send 'em back
    return @sids;
}
Have a look at the URI docs to see what $uri->query_form does; it is very useful.
Update: corrected the sub.