*** BREAKAGE! On 9/16/02, dice.com changed their search and result format,
*** breaking this script. I'll post a fix once I've reverse-engineered
*** their new format.

#!/usr/bin/perl -w

# newdice -- What's new on dice.com
#
# Spider dice.com looking for new job postings of interest, and
# emit a single HTML page with all results. "New" is complicated
# by recruiters who withdraw/repost a job description daily, so
# that it appears at the top of the list. To avoid this, we keep
# md5 hashes of the job descriptions we've already seen.
#
# To avoid having to work through jobs that are uninteresting,
# we exclude any post that contains an "ignore" term.
#
# N.B. To adapt this for your use, you'll need to reverse engineer
# the dice.com search form that you care about, and update
# $searchform below. You'll also need to carefully examine __DATA__
# to remove terms that might interest you. Run this once, examine
# the results, and amend __DATA__ as needed.
#
# This is all rather brute force, but it's "good enough" for
# my needs. If you improve it, please send me a copy (or diffs).
#
# Dave W. Smith <dws@postcognitive.com>
my $VERSION = "0.2 06 Sep 2002";

use strict;

use HTTP::Request::Common qw(GET POST);
use HTTP::Cookies;
use LWP::UserAgent;
use Digest::MD5 qw(md5_hex);

my $cachefile = "dice.cache";

my $search = "http://jobsearch.dice.com/jobsearch/jobsearch.cgi";
my $searchform = [
    iquery => "#or(#1(AREA 209) #1(AREA 369) #1(AREA 408)"
            . " #1(AREA 415) #1(AREA 510) #1(AREA 650) #1(AREA 707)"
            . " #1(AREA 831) #1(AREA 925) #1(AREA 559))",
    banner => 1,
    query => "",
    method => "and",
    acode => 408, acode => 510, acode => 650,   # repeated keys => repeated form fields
    taxterm => "",
    daysback => -f $cachefile ? 2 : 10,         # look back further on a first, cache-less run
    num_per_page => 50,
    num_to_retrieve => 2000
];


# Set up the ignore regexp. (There are faster ways, but this works.)

my @ignore;
while ( <DATA> ) {
    chomp;
    s/^\s+//; s/\s+$//;
    next if /^(?:;|$)/;
    push @ignore, $_;
}
my $ignore = join('|', @ignore);
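# (One alternative, not used here: precompile the alternation once with
# qr//, e.g. my $ignore_re = qr/\b(?:$ignore)\b/i; and match $title
# against $ignore_re below, instead of relying on /o at the match site.)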

# Set up a hash of md5s of the descriptions we've already seen.

my %seen;
open(IN, "<", $cachefile) and do {
    while ( <IN> ) {
        chomp;
        $seen{$_}++;
    }
    close(IN);
};

my $ua = LWP::UserAgent->new();
$ua->agent("Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)");

# Submit a search form. We expect to get a 302 response with a Location:
# for the real search results.

my $req = POST $search, $searchform;
my $response = $ua->request($req);
my $resultpage = $response->headers()->header("location");
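# N.B. If dice.com changes the form again, the redirect may not show up.
# A defensive check here would be something like:
#   die "No Location: header in search response\n" unless defined $resultpage;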

my %jobByUrl;
my %jobTitleByUrl;

$|++; print "<!-- "; #DEBUG

# Get the first search result page

$req = GET $resultpage;
$response = $ua->request($req);

while ( 1 ) {
    last if not $response->is_success();

    print ".";   #DEBUG

    my $body = $response->content();   # just the body, not the full response
    foreach my $line ( split "\n", $body ) {

        if ( $line =~ m#^Position.*?href="(.*?)">(.*?)</a>(.*)# ) {
            my($url,$title,$description) = ($1,$2,$3);

            # Skip stuff we don't care about
            next if $title =~ /\b(?:$ignore)\b/io;

            # Skip stuff we've already seen
            my $md5 = md5_hex($description);
            next if defined $seen{$md5};
            $seen{$md5}++;

            $jobByUrl{$url} = $line;
            $jobTitleByUrl{$url} = $title;
        }
    }

    # If there's a next page, get it. Otherwise we're done.
    last if $body !~ /href="([^"]+)">Show next/;

    $req = GET $1;
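    # N.B. This assumes the "Show next" href is an absolute URL; if dice.com
    # ever emits a relative one, it would need resolving first (e.g. with
    # URI->new_abs($1, $resultpage) from the URI module).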
    sleep(2);  # Don't hammer the server
    $response = $ua->request($req);
}

print "-->\n"; #DEBUG

print <<"";
<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head><title>New jobs of interest on dice.com</title></head>
<body bgcolor="#FFFFFF" text="#000000"
      link="#FF0000" vlink="#990000" alink="#FF0000">

print "No new jobs of interest\n" if 0 == keys %jobByUrl;

# Sorting by URL groups recruiters together, which tends to group like
# jobs together. YMMV.

foreach my $url ( sort keys %jobByUrl ) {
    print $jobByUrl{$url}, "\n";
}

print <<"";
</body>
</html>

# Update the "seen" cache
open(OUT, ">", $cachefile) and do {
    print OUT "$_\n" foreach ( keys %seen );
    close(OUT);
};
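# (The cache only ever grows -- one 32-character hex digest per line --
# which is cheap, but it could be pruned periodically if it gets large.)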


__DATA__
; Exclusion patterns. Any job title that contains any of these terms gets
; ignored, so take care not to add stuff that'll give you false negatives.

; ignore various uninteresting jobs

sales
underwriter
tester
qc
administrator
technician
biologist
chemist
accounting
accountant
account
accounts
business development
acount
acounts
acounting
payroll

; ... etc. my complete list elided in the interest of space

Re: Spider dice.com for new jobs of interest
by merlyn (Sage) on Sep 06, 2002 at 19:14 UTC
      How does this compare to WWW::Search::Dice?

      Drat. I didn't know about WWW::Search::Dice. It might have saved a bit of work, but it appears not to handle duplicate descriptions that originate from different URLs (the "withdraw/repost" problem). That could probably be hacked in. My approach yields a page that looks like one very large DICE page. That'd be a bit harder to do using WWW::Search::Dice, since it dissects the job description and hands you back pieces.
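      For what it's worth, here's a rough, untested sketch of how the md5
      de-duplication might sit on top of the generic WWW::Search interface.
      I haven't checked what query options the Dice backend takes, or whether
      it populates description() for each hit, so treat those bits as
      placeholders.

          use WWW::Search;
          use Digest::MD5 qw(md5_hex);

          # Untested sketch -- assumes the stock WWW::Search interface and
          # that the Dice backend fills in description() for each result.
          my $search = new WWW::Search('Dice');
          $search->native_query(WWW::Search::escape_query('perl'));

          my %seen;
          while ( my $result = $search->next_result() ) {
              my $md5 = md5_hex($result->description() || '');
              next if $seen{$md5}++;   # same description, different URL => skip
              printf qq{<a href="%s">%s</a><br>\n},
                  $result->url(), $result->title();
          }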

        Drat. I didn't know about WWW::Search::Dice.
        And neither did I, but when I see a handrolled solution here, I almost always wonder if it should be contributed to the CPAN, so I have the CPAN bookmarked and I simply typed "search dice" into the search box. The new improved CPAN search even looks into the documentation in a fairly fast way, so searching for keywords is even better than before.

        Use the CPAN. The CPAN is your friend.

        -- Randal L. Schwartz, Perl hacker

Re: Spider dice.com for new jobs of interest
by ignatz (Vicar) on Sep 16, 2002 at 16:40 UTC
    Looks like their new site design breaks this.
    ()-()
     \"/
      `