0: *** BREAKAGE! On 9/16/02, dice.com changed their search and result format,
1: *** breaking this script. I'll post a fix once I've reverse-engineered
2: *** their new format.
3:
#!/usr/bin/perl -w

# newdice -- What's new on dice.com
#
# Spider dice.com looking for new job postings of interest, and
# emit a single HTML page with all results. "New" is complicated
# by recruiters who withdraw/repost a job description daily, so
# that it appears at the top of the list. To avoid this, we keep
# md5 hashes of the job descriptions we've already seen.
#
# To avoid having to work through jobs that are uninteresting,
# we exclude any post that contains an "ignore" term.
#
# N.B. To adapt this for your use, you'll need to reverse engineer
# the dice.com search form that you care about, and update
# $searchform below. You'll also need to carefully examine __DATA__
# to remove terms that might interest you. Run this once, examine
# the results, and amend __DATA__ as needed.
#
# Output: the HTML page goes to STDOUT; diagnostics go to STDERR.
# State:  $cachefile holds one md5 hex digest per line and is rewritten
#         at the end of every run.
#
# This is all rather brute force, but it's "good enough" for
# my needs. If you improve it, please send me a copy (or diffs).
#
# Dave W. Smith <dws@postcognitive.com>

use strict;
use warnings;

use HTTP::Request::Common qw(GET POST);
use HTTP::Cookies;
use LWP::UserAgent;
use Digest::MD5 qw(md5_hex);

my $VERSION = "0.2 06 Sep 2002";

my $cachefile = "dice.cache";

my $search     = "http://jobsearch.dice.com/jobsearch/jobsearch.cgi";
my $searchform = [
    iquery => "#or(#1(AREA 209) #1(AREA 369) #1(AREA 408)"
            . " #1(AREA 415) #1(AREA 510) #1(AREA 650) #1(AREA 707)"
            . " #1(AREA 831) #1(AREA 925) #1(AREA 559))",
    banner => 1,
    query  => "",
    method => "and",
    acode  => 408, acode => 510, acode => 650,
    taxterm => "",
    # Look further back on the first run, i.e. when no cache file exists yet.
    daysback        => ( -f $cachefile ? 2 : 10 ),
    num_per_page    => 50,
    num_to_retrieve => 2000,
];


# Set up the ignore regexp. (There are faster ways, but this works.)
# Each non-blank line of __DATA__ that does not start with ';' becomes one
# alternative. Terms are interpolated as-is, so they are regexp fragments.

my @ignore;
while ( my $term = <DATA> ) {
    chomp $term;
    $term =~ s/^\s+//;
    $term =~ s/\s+$//;
    next if $term =~ /^(?:;|$)/;
    push @ignore, $term;
}

# Only build a pattern when there is at least one term: an empty alternation
# would match at every word boundary and silently discard every job.
my $ignore_re;
if (@ignore) {
    my $alternation = join '|', @ignore;
    $ignore_re = qr/\b(?:$alternation)\b/i;
}

# Set up a hash of md5s of the descriptions we've already seen.
# A missing cache file just means this is the first run.

my %seen;
if ( open my $in, '<', $cachefile ) {
    while ( my $digest = <$in> ) {
        chomp $digest;
        $seen{$digest}++;
    }
    close $in;
}
elsif ( -e $cachefile ) {
    warn "newdice: cannot read $cachefile: $!\n";
}

my $ua = LWP::UserAgent->new();
$ua->agent("Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)");

# Submit a search form. We expect to get a 302 response with a Location:
# for the real search results.

my $response   = $ua->request( POST $search, $searchform );
my $resultpage = $response->headers()->header("location");

# Without a Location there is nothing to fetch. Stop before emitting any
# HTML rather than producing a misleading "No new jobs" page.
defined $resultpage
    or die "newdice: search did not redirect to a result page (",
           $response->status_line(), ")\n";

my %jobByUrl;

# Progress dots are wrapped in an HTML comment so they don't show in the page.
$| = 1;
print "<!-- ";    #DEBUG

# Get the first search result page

$response = $ua->request( GET $resultpage );

PAGE:
while (1) {
    if ( !$response->is_success() ) {
        warn "newdice: fetching results failed: ",
             $response->status_line(), "\n";
        last PAGE;
    }

    print ".";    #DEBUG

    my $body = $response->as_string();

    LINE:
    foreach my $line ( split "\n", $body ) {

        # A posting line yields the link target, the link text (title) and
        # whatever follows the closing </a> (used as the description).
        my ( $url, $title, $description )
            = $line =~ m#^Position.*?href="(.*?)">(.*?)</a>(.*)#
            or next LINE;

        # Skip stuff we don't care about
        next LINE if defined $ignore_re and $title =~ $ignore_re;

        # Skip stuff we've already seen (and remember what we see now)
        next LINE if $seen{ md5_hex($description) }++;

        $jobByUrl{$url} = $line;
    }

    # If there's a next page, get it. Otherwise we're done.
    last PAGE if $body !~ /href="([^"]+)">Show next/;
    my $next_url = $1;

    sleep(2);    # Don't hammer the server
    $response = $ua->request( GET $next_url );
}

print "-->\n";    #DEBUG

print <<'END_HEADER';
<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head><title>New jobs of interest on dice.com</title></head>
<body bgcolor="#FFFFFF" text="#000000"
link="#FF0000" vlink="#990000" alink="#FF0000">
END_HEADER

print "No new jobs of interest\n" if !%jobByUrl;

# Sorting by URL groups recruiters together, which tends to group like
# jobs together. YMMV.

foreach my $url ( sort keys %jobByUrl ) {
    print $jobByUrl{$url}, "\n";
}

print <<'END_FOOTER';
</body>
</html>
END_FOOTER

# Update the "seen" cache. A failure here means already-reported jobs will
# show up again on the next run, so say so instead of failing silently.
if ( open my $out, '>', $cachefile ) {
    print {$out} "$_\n" foreach keys %seen;
    close $out
        or warn "newdice: error writing $cachefile: $!\n";
}
else {
    warn "newdice: cannot write $cachefile: $!\n";
}


__DATA__
; Exclusion patterns. Any job title that contains any of these terms gets
; ignored, so take care not to add stuff that'll give you false negatives.

; ignore various uninteresting jobs

sales
underwriter
tester
qc
administrator
technician
biologist
chemist
accounting
accountant
account
accounts
business development
acount
acounts
acounting
payroll

; ... etc. my complete list elided in the interest of space
In reply to Spider dice.com for new jobs of interest by dws
| For: | Use: |
|------|------|
| & | `&amp;` |
| < | `&lt;` |
| > | `&gt;` |
| [ | `&#91;` |
| ] | `&#93;` |