turbolofi has asked for the wisdom of the Perl Monks concerning the following question:
#!/usr/bin/perl -w # Get urls from result page use warnings; use strict; use LWP::Simple; use HTML::Entities; # for htmldecode my ( $resultpage, $resultpage_url, $i, $new_url, $new_title, $title, @urls, + $n, $host, $article_title, $authors, $source_journal, $source_volume, $source_iss +ue, $source_pages, $source_publish_date ); $host = "http://apps.isiknowledge.com"; $resultpage_url = 'http://apps.isiknowledge.com/summary.do?product=UA& +search_mode=GeneralSearch&qid=2&SID=S1NIfp9Koh1L9D1D5I4&page=1&action +=changePageSize&pageSize=10'; $n = 0; main(); sub main{ $resultpage = get ("$resultpage_url") or die "couldn't retrieve"; while ($resultpage =~ m{<a class=\"smallV110\" href=\"(.*?)\">}gis +) { push(@urls, "$1\n"); for $i ($urls[$n]) { $new_url = get ($host . decode_entities($urls[$n])); sleep 1; # be nice to the server $new_url =~ m{<title>(.*?)<\/title>}gis; # capture page ti +tle $new_title = $1; print "$new_title\n"; $new_url =~ m{\<td class\=\"FullRecTitle\">(.*?)</td>}gis; + # Capture article title $article_title = $1; print "$article_title\n"; $new_url =~ m{(journal of hazardous)}gis; # should capture + journal name - still struggling with this regexp. Is repeated! $source_journal = $1; print "$source_journal\n"; }; $n++; }; };
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: How to prevent a value from being repeated?
by almut (Canon) on May 12, 2009 at 13:52 UTC | |
by turbolofi (Acolyte) on May 12, 2009 at 13:55 UTC |