greenjr has asked for the wisdom of the Perl Monks concerning the following question:
#set up #!c:/perl/bin/perl -w use WWW::Mechanize; cookie_jar => {}; use HTML::TokeParser; $mech = WWW::Mechanize->new(); $mech->agent_alias('Windows IE 6'); #open CEO list and create array for the list sub search { foreach $exec (@exec) { chop($exec); if ($exec=~m{(\d\d)[- /.](\d\d)[- /.](\d\d\d\d)}) { $datee="$1/$2/$3"; $dateyb=$3-1; $datemb=$1+1; $dateb="$datemb/01/$dateyb"; } if ($exec=~m/((\w+)\s*\(w+)/) { $name=$1 $2; $lname=$2; } #check the array #print "@exec"; #get to the web page $mech->get("http://public.kenan-flagler.unc.edu/applications/resources +/user/search.cfm?searchtype=alphasearch¤talpha=F"); $mech->follow("Factiva"); #this gets me to the search page #I have to go to each link because it forwards me to the next webpage #and checks for authorization #this was trial and error for me $mech->submit(); $mech->get("http://global.factiva.com/en/sess/login.asp?"); $mech->get("http://global.factiva.com/sb/default.aspx?NAPC=S&fcpil=en" +); #then I check to make sure I am on the right page $results=$mech->content(); print $results; #these are my search terms my $searchterm1="(rst=(BW OR FB OR FORTU OR NYTF OR TIMAG OR J OR EC O +R FTFT)) AND (($name) AND (CEO OR Chief OR Executive)) AND ($lname)/n +20/(optimis* OR confident OR confidence) NOT (litigat* OR investigat* + OR bankrupt* OR merge* OR fraud) AND (date from $dateb to $datee)"; my $searchterm2="(rst=(BW OR FB OR FORTU OR NYTF OR TIMAG OR J OR EC O +R FTFT)) AND (($name) AND (CEO OR Chief OR Executive)) AND ($lname)/n +20/(reliable OR cautious OR conservative OR practical OR frugal OR st +eady) NOT (litigat* OR investigat* OR bankrupt* OR merge* OR fraud) A +ND (date from $dateb to $datee)"; my $searchterm3="(rst=(BW OR FB OR FORTU OR NYTF OR TIMAG OR J OR EC O +R FTFT)) AND (($name) AND (CEO OR Chief OR Executive)) AND (litigat* +OR investigat* OR bankrupt* OR merge* OR fraud) AND (date from $dateb + to $datee)"; my $searchterm4="(rst=(BW OR FB OR FORTU OR NYTF OR TIMAG OR J OR EC O +R FTFT)) AND (($name) AND (CEO OR Chief OR Executive)) AND (date fro +m $dateb to $datee)"; #run a sub routine for the search terms and output files #search sub routine requires two arguments ($searchterm#,$datafile) &search($searchterm1,"c:/scripts/datao.txt"); &search($searchterm2,"c:/scripts/datano.txt"); &search($searchterm3,"c:/scripts/dataprob.txt"); &search($searchterm4,"c:/scripts/datatot.txt"); sub search { foreach $exec (@exec) { if ($exec=~m{(\d\d)[- /.](\d\d)[- /.](\d\d\d\d)}) { $datee="$1/$2/$3"; $dateyb=$3-1; $datemb=$1+1; $dateb="$datemb/01/$dateyb"; } if ($exec=~m/((\w+)\s*\(w+)/) { $name=$1 $2; $lname=$2; } # # #this is where I am having the problem #because the page is in javascript the mechanize #module doesn't recognize the form #I need some way to work around this #or some suggestions for other approaches # # $mech->form_name("PageBaseForm"); $mech->select("dr","_Unspecified"); $mech->select("sfd",""); $mech->tick("istensfn_bool","True"); $mech->tick("ister_bool","True"); $mech->tick("isteo_bool","True"); $mech->field("Run Search",$_[0]); $mech->submit(); #if I could get here I think it would work to get the #results from the search $results=$mech->content(); if ($results=~m/Headlines\s*\d+\s*'-'\s*\d+\s*of\s*(\d+)/i) { $count=$1; } else { $count='0'; } #print the output to a file open(DAT,">>$datafile"); print DAT "$name $datee $count"; } #close the file when I am done with this search close(DAT); #end the subroutine }
Here is a small part of the code where I think I should be getting wha +t I need, but you probably are all familiar with this type of code (w +hile I am not)... . . . <form name="PageBaseForm" method="post" action="/ha/default.aspx" id=" +PageBaseForm" onsubmit="return validateSb()"> <td class="sbFld" valign="bottom" colspan="2"><textarea name="ftx" id= +"ftx" rows="2" cols="50" onkeypress="return freeTxt_keypress(event)"> +</textarea><input type="submit" value="Run Search" class="majorBtn" o +nclick="if ( validateSb() ) doLinkSubmit('../ha/default.aspx');return + false;"/></td></tr><tr class="sbMid"> <td class="sbLbl" valign="middle"><label for="dr"><b>D +ate</b></label></td> <td class="sbFld" colspan="2"><select name="dr" id="dr +" onchange="tglDtRng(this)"> <option value="LastDay">In the last day</option> <option value="LastWeek">In the last week</option> <option value="LastMonth">In the last month</option> <option selected="selected" value="Last3Months">In the last 3 mont +hs</option> <option value="Last6Months">In the last 6 months</option> <option value="LastYear">In the last year</option> <option value="Last2Years">In the last 2 years</option> <option value="_Unspecified">All Dates</option> <option value="Custom">Enter date range...</option>
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: Webbot with Perl for Factiva data collection
by derby (Abbot) on Apr 06, 2007 at 13:01 UTC | |
by greenjr (Initiate) on Apr 06, 2007 at 13:14 UTC | |
|
Re: Webbot with Perl for Factiva data collection
by Joost (Canon) on Apr 06, 2007 at 13:04 UTC | |
by greenjr (Initiate) on Apr 06, 2007 at 13:17 UTC | |
|
Re: Webbot with Perl for Factiva data collection
by roboticus (Chancellor) on Apr 06, 2007 at 13:09 UTC | |
by greenjr (Initiate) on Apr 06, 2007 at 13:19 UTC | |
by roboticus (Chancellor) on Apr 06, 2007 at 13:38 UTC | |
by greenjr (Initiate) on Apr 06, 2007 at 15:18 UTC |