comment on

0: *** BREAKAGE! On 9/16/02, dice.com changed their search and result format,
1: *** breaking this script. I'll post a fix once I've reverse-engineered
2: *** their new format.
3: 
4: #!/usr/bin/perl -w
5: 
6: # newdice -- What's new on dice.com
7: #
8: # Spider dice.com looking for new job postings of interest, and
9: # emit a single HTML page with all results. "New" is complicated
10: # by  recruiters who withdraw/repost a job description daily, so
11: # that it appears at the top of the list. To avoid this, we
12: # keep md5 hashes of the job descriptions we've already seen.
13: #
14: # To avoid having to work through jobs that are uninteresting,
15: # we exclude any post that contains an "ignore" term.
16: #
17: # N.B. To adapt this for your use, you'll need to reverse engineer
18: # the dice.com search form that you care about, and update
19: # $searchform below. You'll also need to carefully examine __DATA__
20: # to remove terms that might interest you. Run this once, examine
21: # the results, and amend __DATA__ as needed.
22: #
23: # This is all rather brute force, but it's "good enough" for
24: # my needs. If you improve it, please send me a copy (or diffs).
25: #
26: # Dave W. Smith <dws@postcognitive.com>
27: my $VERSION = "0.2 06 Sep 2002";   # version string; not referenced elsewhere in this listing
28: 
29: use strict;
30: 
31: use HTTP::Request::Common qw(GET POST);
32: use HTTP::Cookies;
33: use LWP::UserAgent;
34: use Digest::MD5 qw(md5_hex);
35: 
36: my $cachefile = "dice.cache";   # one md5 hex digest per line; read at startup, rewritten at the end
37: # URL the search form is POSTed to, followed by the form fields themselves.
38: my $search = "http://jobsearch.dice.com/jobsearch/jobsearch.cgi";
39: my $searchform = [   # an array ref rather than a hash, so the repeated acode key below is kept three times
40:     iquery => "#or(#1(AREA 209) #1(AREA 369) #1(AREA 408)" 
41:             . " #1(AREA 415) #1(AREA 510) #1(AREA 650) #1(AREA 707)"
42:             . " #1(AREA 831) #1(AREA 925) #1(AREA 559))",
43:     banner => 1,
44:     query => "",
45:     method => "and",
46:     acode => 408, acode => 510, acode => 650,   # NOTE(review): these and the AREA numbers look like telephone area codes -- confirm
47:     taxterm => "",
48:     daysback => -f $cachefile ? 2 : 10,   # 2 when the cache file already exists, otherwise 10
49:     num_per_page => 50,
50:     num_to_retrieve => 2000
51: ];
52: 
53: 
54: # Build the ignore regexp source from __DATA__: one term per line, trimmed; blank lines and lines starting with ';' are skipped. (There are faster ways, but this works.)
55: # If no terms remain, fall back to (?!), which never matches: an empty alternation would make \b(?:)\b match, and so discard, every title.
56: my @ignore;
57: while ( my $term = <DATA> ) {
58:     chomp $term;
59:     $term =~ s/^\s+//; $term =~ s/\s+$//;
60:     next if $term =~ /^(?:;|$)/;
61:     push @ignore, $term;
62: }
63: my $ignore = @ignore ? join('|', @ignore) : '(?!)';
64: 
65: # Load the md5s of descriptions seen on earlier runs, one per line. The cache
66: # does not exist before the first run, so a failed open just leaves %seen empty.
67: my %seen;
68: if ( open(my $cache_in, "<", $cachefile) ) {
69:     while ( my $md5 = <$cache_in> ) {
70:         chomp $md5;
71:         $seen{$md5}++;
72:     }
73:     close($cache_in);
74: }
75: 
76: my $ua = LWP::UserAgent->new();
77: $ua->agent("Mozilla/4.0 (compatible; MSIE 5.01; Windows 98)");   # present ourselves as a browser
78: 
79: # Submit the search form. We expect to get a 302 response with a Location:
80: # header for the real search results. Without one there is nothing to fetch, so stop with the status line rather than report "no new jobs".
81: 
82: my $req = POST $search, $searchform;
83: my $response = $ua->request($req);
84: my $resultpage = $response->headers()->header("location")
85:     or die "No Location: header from $search (" . $response->status_line() . ")\n";
86: my %jobByUrl;        # matching result line, keyed by job URL
87: my %jobTitleByUrl;   # job title, keyed by job URL
88: 
89: $|++; print "<!-- "; #DEBUG
90: 
91: # Fetch the first page of results, then keep following "Show next" links.
92: # The loop ends on the first failed fetch or on a page with no such link.
93: 
94: $req = GET $resultpage;
95: $response = $ua->request($req);
96: PAGE: while ( $response->is_success() ) {
97:     print ".";   #DEBUG
98: 
99:     my $page = $response->as_string();
100: 
101:     ROW: foreach my $row ( split "\n", $page ) {
102: 
103:         # Only lines that look like a job posting are of interest.
104:         next ROW unless $row =~ m#^Position.*?href="(.*?)">(.*?)</a>(.*)#;
105:         my ($link, $heading, $blurb) = ($1, $2, $3);
106: 
107:         # Drop postings whose title contains an ignore term.
108:         next ROW if $heading =~ /\b(?:$ignore)\b/io;
109: 
110:         # Drop postings whose description has been seen before, and
111:         # remember the digest of every description that gets through.
112:         my $digest = md5_hex($blurb);
113:         next ROW if defined $seen{$digest};
114:         $seen{$digest}++;
115: 
116:         $jobByUrl{$link}      = $row;
117:         $jobTitleByUrl{$link} = $heading;
118:     }
119: 
120:     # Stop when the page offers no "Show next" link.
121:     last PAGE unless $page =~ /href="([^"]+)">Show next/;
122:     my $next = $1;
123: 
124:     sleep(2);  # Don't hammer the server
125:     $response = $ua->request(GET $next);
126: }
127: 
128: print "-->\n"; #DEBUG
129: 
130: print <<"END_OF_HEADER";   # page prologue
131: <!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN">
132: <html>
133: <head><title>New jobs of interest on dice.com</title></head>
134: <body bgcolor="#FFFFFF" text="#000000"
135:       link="#FF0000" vlink="#990000" alink="#FF0000">
136: END_OF_HEADER
137: unless ( %jobByUrl ) { print "No new jobs of interest\n"; }
138: 
139: # Emit the saved result lines in URL order. That keeps each recruiter's
140: # postings together, which tends to group like jobs together. YMMV.
141: 
142: for my $posting ( @jobByUrl{ sort keys %jobByUrl } ) {
143:     print $posting, "\n";
144: }
145: 
146: print <<"END_OF_FOOTER";   # page epilogue
147: </body>
148: </html>
149: END_OF_FOOTER
150: # Update the "seen" cache with every digest in %seen. The report has already been printed, so a write failure only warns; without the cache the same jobs would show up again next run.
151: if ( open(my $cache_out, ">", $cachefile) ) {
152:     print {$cache_out} "$_\n" foreach ( keys %seen );
153:     close($cache_out) or warn "Error writing $cachefile: $!\n";
154: }
155: else { warn "Can't write $cachefile: $!\n"; }
156: 
157: __DATA__
158: ; Exclusion patterns. Any job title that contains any of these terms gets
159: ; ignored, so take care not to add stuff that'll give you false negatives.
160: 
161: ; ignore various uninteresting jobs
162: 
163: sales
164: underwriter
165: tester
166: qc
167: administrator
168: technician
169: biologist
170: chemist
171: accounting
172: accountant
173: account
174: accounts
175: business development
176: acount
177: acounts
178: acounting
179: payroll
180: 
181: ; ... etc. my complete list elided in the interest of space

In reply to Spider dice.com for new jobs of interest by dws

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.