in reply to Re^9: collect data from web pages and insert into mysql
in thread collect data from web pages and insert into mysql
OK, I've looked it over and think I understand most of it fairly well now. I've adapted it to work through the whole list, as well as to read in the PID/Lproc pairs for processing.
There's a bug somewhere, though, that makes it abort if a PID has 0 new SIDs. Instead of moving on to the next PID for processing, it simply ends.
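(Looking at it again, I wonder if the unconditional die after get_sids in the main loop is the culprit; swapping it for something like this, untested, might let it carry on to the next PID instead:)

    # untested idea: warn and skip this PID instead of dying
    # when get_sids comes back empty
    unless (@sids) {
        warn qq{no new sids for PID $cpid, skipping\n};
        next;
    }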
Here's what I have so far (I added some print stuff so I can see it progressing):
#! c:/perl/bin/perl
use strict;
use warnings;
use LWP::Simple qw(get);
use HTML::TreeBuilder;
use Data::Dumper;
use URI;

# Open the list of pid/lproc combos and set up for processing
my $sfile = q{c:/scr/settings.txt};
open my $settings, q{<}, $sfile
    or die qq{Could not open $sfile: $!\n};

# discard the header line
my $pidlist = <$settings>;

# Get the list of pids and process them all
while ( $pidlist = <$settings> ) {
    chomp $pidlist;
    my ( $cpid, $lproc ) = split /\t/, $pidlist;
    my $url = q{http://csr.wwiionline.com/scripts/services/persona/sorties.jsp};
    my @sids = get_sids( $url, $cpid, $lproc );
    die qq{no sids found\n} unless @sids;
    print qq{Processing PID: $cpid, Last processed SID: $lproc\n};
    print Dumper \@sids;
}
close $settings
    or die qq{Failed to close file: $!\n};
print qq{\nAll done!\n};

sub get_sids {
    my ( $url, $cpid, $lproc ) = @_;
    my $page = 1;
    my $uri  = URI->new($url);
    my @sids;
    my $i = 0;    # page counter, guards against runaway loops
    while ($page) {

        # build the uri
        $uri->query_form( page => $page, pid => $cpid );

        # get the content, check for success
        my $content = get $uri->as_string;
        die qq{LWP get failed: $!\n} unless $content;

        # build the tree
        my $t = HTML::TreeBuilder->new_from_content($content)
            or die qq{new from content failed: $!\n};

        # get a list of all anchor tags
        my @anchors = $t->look_down( _tag => q{a} )
            or die qq{no anchors found\n};

        # look at each anchor
        my $more = 1;    # flag
        for my $anchor (@anchors) {

            # get the href
            my $href = $anchor->attr(q{href});
            if ($href) {

                # test for a sid in the query fragment
                my $uri = URI->new($href);
                my %q   = $uri->query_form;
                my $sid = $q{sid};
                next unless $sid;

                # exit the for loop (and flag the while loop)
                # if it is the last processed sid
                $more--, last if $sid == $lproc;

                # otherwise save it
                push @sids, $sid;
            }
        }
        last unless $more;

        # see if there is another page
        $page = get_next_page_number($t);

        # avoid accidental indefinite loops
        # hammering the server, adjust to suit
        die qq{too many pages\n} if $i++ > 7;
    }

    # send 'em back
    return @sids;
}

sub get_next_page_number {
    my ($t) = @_;

    # we want table 9
    my @tables = $t->look_down( _tag => q{table} );
    my $table  = $tables[8];

    # first row
    my @trs = $table->look_down( _tag => q{tr} );
    my $tr  = $trs[0];

    # second column
    my @tds = $tr->look_down( _tag => q{td} );
    my $td  = $tds[1];

    # get any text
    my $page_number_txt = $td->as_text;

    # and test if it is a page number
    # $page will be undef otherwise
    my ($page) = $page_number_txt =~ /PAGE (\d) >/;
    return $page;
}
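For reference, settings.txt is just tab-separated PID and last-processed-SID pairs, one per line, with a header line that the first read throws away. Something like this (made-up values):

    PID	LPROC
    123456	1234567
    234568	2345678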
Re^11: collect data from web pages and insert into mysql
by wfsp (Abbot) on Aug 03, 2010 at 15:36 UTC