(.*?)<\/title>}gis; # capture page title $new_title = $1; print "$new_title\n"; $new_url =~ m{\<td class\=\"FullRecTitle\">(.*?)</td>}gis; # Capture article title $article_title = $1; print "$article_title\n"; $new_url =~ m{(journal of hazardous)}gis; # should capture journal name - still struggling with this regexp. Is repeated! $source_journal = $1; print "$source_journal\n"; }; $n++; }; };

#!/usr/bin/perl -w
# Get urls from result page
use warnings;
use strict;
use LWP::Simple;
use HTML::Entities; # for htmldecode

# Base address that the record links found on the result page are appended to.
my $host = "http://apps.isiknowledge.com";

# Search-result page whose record links are followed by main().
# NOTE(review): the query string carries an SID value that looks session-bound -
# confirm it is still valid before running.
my $resultpage_url = 'http://apps.isiknowledge.com/summary.do?product=UA&search_mode=GeneralSearch&qid=2&SID=S1NIfp9Koh1L9D1D5I4&page=1&action=changePageSize&pageSize=10';

# Index into @urls of the record link main() is currently working on.
my $n = 0;

# Working variables shared with main(): downloaded pages, collected links and
# the page title.
my ($resultpage, $i, $new_url, $new_title, $title, @urls);

# Bibliographic fields of a record; declared here so main() can fill them in.
my (
	$article_title, $authors,
	$source_journal, $source_volume, $source_issue, $source_pages,
	$source_publish_date,
);

# Run the scraper.
main();

# Scrape the search-result page and print three lines per linked record.
#
# Downloads $resultpage_url, finds every <a class="smallV110"> link in it,
# fetches each linked record page from $host and prints, one per line, the
# page <title>, the "FullRecTitle" table cell and the journal-name match.
# A field whose pattern does not match is printed as an empty line.
#
# Takes no arguments; the return value is not meaningful.  Works on the
# file-level variables: reads $resultpage_url and $host; sets $resultpage,
# @urls, $n, $new_url, $new_title, $article_title and $source_journal.
#
# Dies with "couldn't retrieve" if the result page cannot be downloaded.  A
# record page that cannot be downloaded is reported via warn() and skipped.
sub main{
	$resultpage = get ("$resultpage_url") or die "couldn't retrieve";

	while ($resultpage =~ m{<a class=\"smallV110\" href=\"(.*?)\">}gis) {
		# Copy the capture right away: every later successful match overwrites $1.
		my $href = $1;

		# @urls keeps the newline-terminated form; the request below uses the
		# clean $href so no stray newline ends up in the URL.
		push(@urls, "$href\n");

		# Links in the HTML are entity-encoded (e.g. &amp;), so decode before use.
		$new_url = get ($host . decode_entities($href));
		sleep 1; # be nice to the server

		if (defined $new_url) {
			# Each match is tested before $1 is read.  A failed match leaves $1
			# holding the previous successful capture, which would otherwise be
			# printed again as if it belonged to this field.

			# Capture page title
			$new_title = ($new_url =~ m{<title>(.*?)<\/title>}is) ? $1 : '';
			print "$new_title\n";

			# Capture article title
			$article_title = ($new_url =~ m{\<td class\=\"FullRecTitle\">(.*?)</td>}is) ? $1 : '';
			print "$article_title\n";

			# Capture journal name.  The pattern is still the literal placeholder
			# from the original script and only recognizes this one phrase.
			$source_journal = ($new_url =~ m{(journal of hazardous)}is) ? $1 : '';
			print "$source_journal\n";
		}
		else {
			# get() returns undef on failure; skip this record instead of
			# running the patterns against an undefined value.
			warn "couldn't retrieve record page: $href\n";
		}

		$n++;
	};
};