in reply to Re^3: Combining Excel Parser with Google Scholar Scraper
in thread Combining Excel Parser with Google Scholar Scraper
#!/usr/bin/perl
#
# Reads paper titles from an Excel worksheet and, for each one, runs a
# Google Scholar advanced search (exact title + fixed author).  For every
# search hit that carries a "Cited by N" link, the hit's title and citation
# count are printed to STDOUT.
#
# Requires Windows with Excel installed (Win32::OLE) and WWW::Mechanize.
# The result-page regexes below are tied to Scholar's HTML at the time the
# script was written.

use strict;
use warnings;

use WWW::Mechanize;
use Win32::OLE qw(in with);
use Win32::OLE::Const 'Microsoft Excel';

$Win32::OLE::Warn = 3;    # die on errors...

# Search settings shared by every lookup.
my $URL       = 'http://scholar.google.com/advanced_scholar_search';
my $FORM_NAME = 'f';
my $AUTHOR    = "MD Li";
# To prompt for the author instead:
#   print "Author Name: ";
#   chomp($AUTHOR = <>);

# get already active Excel application or open new
my $Excel = Win32::OLE->GetActiveObject('Excel.Application')
    || Win32::OLE->new('Excel.Application', 'Quit');

# open Excel file
my $Book = $Excel->Workbooks->Open(
    "C:/Documents and Settings/rto5u/My Documents/CV.xls");

# select worksheet number 1 (you can also select a worksheet by name)
my $Sheet = $Book->Worksheets(1);

# One user agent is enough for all lookups.  autocheck is disabled so that
# a missing "Next" link makes follow_link() return undef (ending the paging
# loop) instead of throwing; failures are checked explicitly below.
my $mech = WWW::Mechanize->new(stack_depth => 10, autocheck => 0);

foreach my $row (2 .. 4) {
    foreach my $col (1 .. 1) {

        # The cell value is the paper title to search for, e.g.
        # "Region-specific transcriptional response to chronic nicotine
        # in rat brain".
        my $TITLE = $Sheet->Cells($row, $col)->{'Value'};

        # skip empty cells
        next unless defined $TITLE;

        print "$TITLE";

        # get() always returns a response object, so its truth value says
        # nothing about success; inspect the response instead.
        my $page = $mech->get($URL);
        die("Could not connect to $URL.\n")
            unless $page && $page->is_success();

        my $res = $mech->submit_form(
            form_name => $FORM_NAME,
            fields    => {
                'num'         => 100,
                'as_epq'      => $TITLE,
                'as_occt'     => 'title',
                'as_sauthors' => $AUTHOR,
                'as_allsubj'  => 'all',
            },
        );

        # Walk every result page, following "Next" until there is none.
        while ($res && $res->is_success()) {
            my $content = $res->content;

            # Each search hit is a <p class=g> ... </font> section.
            while ($content =~ /<p class=g>(.*?)<\/font>\s\s\s/gs) {
                my $section = $1;

                # get title, stripped of markup
                my $title = getTitle($section);
                $title =~ s/<.*?>//g;
                $title =~ s/&hellip;|…/.../g;

                # get citedby
                # NOTE(review): this call appeared commented out in the
                # posted script, which left the count at 0 and therefore
                # suppressed all output -- confirm enabling it is intended.
                my $citedby = getCitedBy($section) || 0;

                if ($citedby) {
                    print "\"$title\"\nCited by: $citedby\n\n";
                }
            }

            $res = $mech->follow_link(text_regex => qr/Next/i);
        }
    }
}

$Book->Close;

#############################################################################

# getTitle($section)
#   Extracts the (still HTML-decorated) paper title from one search-hit
#   section.  Returns the empty string when neither known layout matches,
#   so callers can always treat the result as a defined string.
sub getTitle {
    my ($section) = @_;

    # papers with a link
    if ($section =~ /<span class="w">.*?<a href.*?>(.*?)<\/a><\/span>/s) {
        return $1;
    }

    # papers w/o a link
    # NOTE(review): the leading blank in this pattern may originally have
    # been an HTML entity such as &nbsp; -- verify against real page source.
    if ($section =~ / (.*?)<font size=-1>/s) {
        return $1;
    }

    return '';
}

#----------------------------------------------------------------------------

# getCitedBy($section)
#   Returns the number from a ">Cited by N<" fragment in the section, or
#   undef (empty list in list context) when the section has none.
sub getCitedBy {
    my ($section) = @_;

    if ($section =~ />Cited by (\d+)</s) {
        return $1;
    }

    return;
}

#----------------------------------------------------------------------------
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^5: Combining Excel Parser with Google Scholar Scraper
by kennethk (Abbot) on Apr 14, 2009 at 16:53 UTC |