1: *** BREAKAGE! On 9/16/02, dice.com changed their search and result format,
2: *** breaking this script. I'll post a fix once I've reverse-engineered
3: *** their new format.
4:
5: #!/usr/bin/perl -w
6:
7: # newdice -- What's new on dice.com
8: #
9: # Spider dice.com looking for new job postings of interest, and
10: # emit a single HTML page with all results. "New" is complicated
11: # by recruiters who withdraw/repost a job description daily, so
12: # that it appears at the top of the list. To avoid this, we
13: # keep md5 hashes of the job descriptions we've already seen.
14: #
15: # To avoid having to work through jobs that are uninteresting,
16: # we exclude any post that contains an "ignore" term.
17: #
18: # N.B. To adapt this for your use, you'll need to reverse engineer
19: # the dice.com search form that you care about, and update
20: # $searchform below. You'll also need to carefully examine __DATA__
21: # to remove terms that might interest you. Run this once, examine
22: # the results, and amend __DATA__ as needed.
23: #
24: # This is all rather brute force, but it's "good enough" for
25: # my needs. If you improve it, please send me a copy (or diffs).
26: #
27: # Dave W. Smith <dws@postcognitive.com>
28: my $VERSION = "0.2 06 Sep 2002";
29:
30: use strict;
31:
32: use HTTP::Request::Common qw(GET POST);
33: use HTTP::Cookies;
34: use LWP::UserAgent;
35: use Digest::MD5 qw(md5_hex);
36:
# File that holds the md5 digests of the job descriptions already
# reported, so they can be recognised on later runs.
my $cachefile = "dice.cache";

# The CGI that the search form is POSTed to.
my $search = "http://jobsearch.dice.com/jobsearch/jobsearch.cgi";

# The form fields, as a flat list of name => value pairs.  This is an
# array reference, not a hash, so all three "acode" entries are kept
# (a hash would retain only the last one).
my $searchform = do {

    # Each number becomes a "#1(AREA n)" term inside the "#or(...)" query.
    my @areas = ( 209, 369, 408, 415, 510, 650, 707, 831, 925, 559 );
    my $area_query = "#or(" . join( " ", map { "#1(AREA $_)" } @areas ) . ")";

    # Ask for 2 days of postings when a cache file already exists,
    # otherwise for 10 days.
    my $days_back = ( -f $cachefile ) ? 2 : 10;

    [
        iquery          => $area_query,
        banner          => 1,
        query           => "",
        method          => "and",
        acode           => 408,
        acode           => 510,
        acode           => 650,
        taxterm         => "",
        daysback        => $days_back,
        num_per_page    => 50,
        num_to_retrieve => 2000,
    ];
};
53:
54:
# Set up the ignore regexp. (There are faster ways, but this works.)
#
# Each line after __DATA__ is one exclusion term.  Leading and trailing
# whitespace is stripped; blank lines and lines starting with ";" are
# skipped.  The surviving terms are joined with "|" so that $ignore can
# be dropped into a regexp alternation.

my @ignore;
while ( my $term = <DATA> ) {
    chomp $term;
    $term =~ s/^\s+//;
    $term =~ s/\s+$//;
    next if $term eq '' or $term =~ /^;/;
    push @ignore, $term;
}

# With no terms at all, an empty alternation matches everywhere, so a
# test such as /\b(?:$ignore)\b/ would reject every title.  Fall back
# to "(?!)", which can never match, so an empty list ignores nothing.
my $ignore = @ignore ? join( '|', @ignore ) : '(?!)';
65:
# Set up a hash of md5s of the descriptions we've already seen.
#
# The cache file holds one digest per line.  A missing file is not
# reported (there is nothing to load yet), but a file that exists and
# cannot be opened is, since treating it as empty would hide the fact
# that nothing was loaded.

my %seen;
if ( open( my $cache_in, "<", $cachefile ) ) {
    while ( my $digest = <$cache_in> ) {
        chomp $digest;
        $seen{$digest}++;
    }
    close($cache_in);
}
else {
    my $open_error = $!;    # save before the -e test can disturb $!
    warn "newdice: can't read $cachefile: $open_error\n" if -e $cachefile;
}
76:
# Create the user agent and identify it with a browser-style
# User-Agent string.
my $ua = LWP::UserAgent->new();
$ua->agent("Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)");

# Submit a search form. We expect to get a 302 response with a Location:
# for the real search results.

my $req = POST $search, $searchform;
my $response = $ua->request($req);
my $resultpage = $response->headers()->header("location");

# Without a Location: header there is no result page to fetch.  Stop
# here with the server's status rather than carrying on with an
# undefined URL.
defined $resultpage
    or die "newdice: search at $search returned no Location: header ("
         . $response->status_line() . ")\n";
86:
# Postings that survive filtering, keyed by the URL in the posting's link.
# %jobByUrl holds the whole matching line of the result page;
# %jobTitleByUrl holds the link text.
my %jobByUrl;
my %jobTitleByUrl;

# Compile the title filter once, up front, instead of interpolating
# $ignore with /o inside the loop.
my $ignore_re = qr/\b(?:$ignore)\b/i;

$|++; print "<!-- "; #DEBUG

# Get the first search result page

$req = GET $resultpage;
$response = $ua->request($req);

PAGE: while ( 1 ) {

    # A failed fetch ends the crawl.  Say so on STDERR; otherwise a
    # failure looks exactly like a search that found nothing new.
    if ( not $response->is_success() ) {
        my $failed_url = $req->uri();
        $failed_url = "(no URL)" unless defined $failed_url;
        warn "newdice: fetch of $failed_url failed: ",
             $response->status_line(), "\n";
        last PAGE;
    }

    print "."; #DEBUG

    # Only the message body is scanned; the response headers are not.
    my $body = $response->content();

    LINE: foreach my $line ( split "\n", $body ) {

        # A posting is a line that starts with "Position" and contains
        # a link; whatever follows the link is taken as the description.
        next LINE unless $line =~ m#^Position.*?href="(.*?)">(.*?)</a>(.*)#;
        my ( $url, $title, $description ) = ( $1, $2, $3 );

        # Skip stuff we don't care about
        next LINE if $title =~ $ignore_re;

        # Skip stuff we've already seen
        my $md5 = md5_hex($description);
        next LINE if exists $seen{$md5};
        $seen{$md5}++;

        $jobByUrl{$url}      = $line;
        $jobTitleByUrl{$url} = $title;
    }

    # If there's a next page, get it. Otherwise we're done.
    last PAGE unless $body =~ /href="([^"]+)">Show next/;
    my $next_url = $1;

    $req = GET $next_url;
    sleep(2); # Don't hammer the server
    $response = $ua->request($req);
}

print "-->\n"; #DEBUG
130:
# Emit the report: page header, then either a "nothing new" note or
# the collected postings, then the page footer.
print <<"END_OF_HEADER";
<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head><title>New jobs of interest on dice.com</title></head>
<body bgcolor="#FFFFFF" text="#000000"
link="#FF0000" vlink="#990000" alink="#FF0000">
END_OF_HEADER

unless ( %jobByUrl ) {
    print "No new jobs of interest\n";
}

# Sorting by URL groups recruiters together, which tends to group like
# jobs together. YMMV.

for my $job_url ( sort keys %jobByUrl ) {
    print "$jobByUrl{$job_url}\n";
}

print <<"END_OF_FOOTER";
</body>
</html>
END_OF_FOOTER

# Update the "seen" cache: rewrite the cache file with every digest
# in %seen, one per line.  A failure to open or to finish writing the
# file is reported on STDERR, because digests that are not saved here
# cannot be recognised on the next run.
if ( open( my $cache_out, ">", $cachefile ) ) {
    for my $digest ( keys %seen ) {
        print {$cache_out} "$digest\n";
    }

    # Buffered write errors only show up when the handle is closed.
    close($cache_out)
        or warn "newdice: error writing $cachefile: $!\n";
}
else {
    warn "newdice: can't write $cachefile: $!\n";
}
156:
157:
158: __DATA__
159: ; Exclusion patterns. Any job title that contains any of these terms gets
160: ; ignored, so take care not to add stuff that'll give you false negatives.
161:
162: ; ignore various uninteresting jobs
163:
164: sales
165: underwriter
166: tester
167: qc
168: administrator
169: technician
170: biologist
171: chemist
172: accounting
173: accountant
174: account
175: accounts
176: business development
177: acount
178: acounts
179: acounting
180: payroll
181:
182: ; ... etc. my complete list elided in the interest of space
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
•Re: Spider dice.com for new jobs of interest
by merlyn (Sage) on Sep 06, 2002 at 19:14 UTC | |
by dws (Chancellor) on Sep 06, 2002 at 19:28 UTC | |
by merlyn (Sage) on Sep 06, 2002 at 20:36 UTC | |
|
Re: Spider dice.com for new jobs of interest
by ignatz (Vicar) on Sep 16, 2002 at 16:40 UTC |