Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

jobs.perl.org watcher

by diotalevi (Canon)
on Jun 07, 2005 at 20:23 UTC ( [id://464439]=sourcecode: print w/replies, xml ) Need Help??
Category: Miscellaneous
Author/Contact Info "Joshua ben Jore" <jjore@cpan.org>
Description:

This monitors jobs.perl.org (and eventually other sites) for postings of interest. There is separate code for watching the Minnesota Job Bank but since that's very specific, I left it out.

crontab entry to run once an hour, at 32 minutes past.

32 * * * * /home/.../bin/JobWatcher
#!perl
use strict;
use warnings;
use Cache::FileCache;
# use Data::Dump::Streamer;
use DBM::Deep;
use HTML::FormatText;
use Mail::Sendmail 'sendmail';
use Params::Validate ':all';
use WWW::Mechanize;
use XML::RSS;
use vars ( '$DB', '$CACHE', '%CONFIG' );

main( sendmail => { smtp => '...',
                    From => '...',
                    To => '...' } );
exit;

sub main {
    my %p = @_;
    
    initialize( %p );
    do_jobs_perl_org( category => 'MN perl',
                      url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
                      constraints => [ sub { /minnesota/i } ] );
    do_jobs_perl_org( category => 'Tele perl',
                      url => 'http://jobs.perl.org/rss/telecommute.rss
+?limit=20' );
    do_jobs_perl_org( category => 'Domino perl',
                      url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
                      constraints => [ sub { /\b(?:domino|lotus|notes)
+\b/i } ] );
    # do_mn_job_bank( category => 'MN perl',
    #           constraints => [ sub { /\bperl\b/i } ],
    #           username => '...',
    #           password => '...',
    #           miles => 25,
    #           zip => ...,
    #           keyword => 'perl' );
    #do_mn_job_bank( category => 'MN Domino',
    #           constraints => [ sub { /\b(?:domino|lotus\s+notes)\b/i
+ } ],
    #           username => '...',
    #           password => '...' );
    
    return;
}

sub initialize {
    my %p = validate( @_,
                      { test => { type => SCALAR,
                                  optional => 1,
                                  default => 0 },
                        cache_namespace => { type => SCALAR,
                                             optional => 1,
                                             default => '1 week' },
                        cache_autopurge_interval => { type => SCALAR,
                                                      optional => 1,
                                                      default => '1 ho
+ur' },
                        db_filename => { type => SCALAR,
                                         optional => 1,
                                         default => "$ENV{HOME}/.jobwa
+tcher.db" },
                        wget_expiration => { type => SCALAR,
                                             optional => 1,
                                             default => '1 hour' },
                        sendmail => { type => HASHREF } } );
    
    $DB = DBM::Deep->new( $p{db_filename} );
    $CACHE = Cache::FileCache->new( { namespace => delete $p{cache_nam
+espace},
                                      autopurge_interval => delete $p{
+cache_autopurge_interval} } );
    
    @CONFIG{keys %p} = values %p;
    
    return;
}

sub do_mn_job_bank {
    my %p = validate( @_,
                      { category => { type => SCALAR },
                        constraints => { type => ARRAYREF,
                                         default => [ sub { 1 } ] },
                        username => { type => SCALAR },
                        password => { type => SCALAR },
                        zip => { type => SCALAR },
                        miles => { type => SCALAR },
                        keyword => { type => SCALAR },} );
    my $www = WWW::Mechanize->new;

    # Log in
    $www->get( 'http://www.mnworks.org/jsli.cfm' );
    $www->submit_form( fields => { li_user_name => $p{username},
                                   li_password => $p{password} },
                       button => 'Submit' );

    # Search
    $www->get( 'http://www1.mnworks.org/jobseeker/js_srch_pg.cfm' );
    $www->submit_form( fields => { i_geo_search_flag => 'zip_yes',
                                   zip_miles => $p{miles},
                                   zip_code => $p{zip},
                                   Submit => 'More Search Options' } )
+;
    print $www->content;
    $www->submit_form( fields => { # i_geo_search_flags => 'zip_yes',
                                   # zip_miles => $p{miles},
                                   # zip_code => $p{zip},
                                   i_job_description => $p{keyword} } 
+);
}

sub do_jobs_perl_org {
    my %p = validate( @_,
                      { category => { type => SCALAR },
                        url => { type => SCALAR },
                        constraints => { type => ARRAYREF,
                                         default => [ sub { 1 } ] } } 
+);
    my $rssurl = $p{url};
    
    my $feed = rss_feed_from_url( url => $rssurl );
    for my $job ( @{$feed->{items}} ) {
        my $url = $job->{link};
        my $title = $job->{title};
        my $desc = html2text( html => get( url => $url ) );
        
        # Remove spaces @ the start of lines
        {
            my %spaces;
            ++ $spaces{$_}
              for
                map length(),
                  $desc =~ /^( +)/gm;
            
            my $spaces_to_remove
              = ( sort { $spaces{$b} <=> $spaces{$a} }
                  keys %spaces )[0];
            my $space_to_remove = " " x $spaces_to_remove;
            $desc =~ s/^$space_to_remove//mg;
        }
        
        # Remove boilerplate & navigation at the top
        $desc =~ s/
            (?s:.+?)
            (.+)[\r\n]+
            [ \t]*=+[ \t]*[\r\n]+
        /$1\n/x;
        
        # Now collapse the description fields
        $desc =~ s((.+)(?=^Description:)){
            # Now I have a handle on just the key/value area ahead of 
+the
            # description
            local $_ = $1;
            
            # Collapse field lines together
            s/^(.+:)[ \t]*[\r\n]+
               (.+)/$1 $2/mgx;
            
            # Remove empty lines
            s/^[ \t]*[\r\n]+//gm;
            
            # Remove trailing / leading whitespace
            s/^\s+//;
            s/\s+$//;
            
            # Add in the \n I want.
            "$_\n";
        }sem;
        
        # Finally remove the boilerplate from the botton.
        $desc = reverse $desc;
        $desc =~ s/(?s:.+?)
                   ^ [ \t]* -+  [ \t]* [\r\n]+
                   ^ [ \t]*            [\r\n]+
        //xsm;
#       $desc =~ s/.+?^[ \t]*-{30,}[ \t]*[\r\n]+^\s+//s;
        $desc = reverse $desc;
        $desc =~ s/\s+$//;
        $desc .= "\n";
        
        if ( passes_constraints( constraints => $p{constraints},
                                 title => $title,
                                 desc => $desc ) ) {
            notify( id => $url,
                    title => $title,
                    desc  => "$p{category} $desc" );
        }
    }

    return;
}

sub passes_constraints {
    my %p = validate( @_,
                      { constraints => { type => ARRAYREF },
                        title => { type => SCALAR },
                        desc => { type => SCALAR } } );
    my @constraints = @{delete $p{constraints}};
    if ( not @constraints ) {
        @constraints = sub { 1 };
    }
    
    local $_ = "$p{title}\n$p{desc}";
    
    # When 
    if ( /Job no longer in database/ ) {
        return 0;
    }
    for my $constraint ( @constraints ) {
        if ( $constraint->() ) {
            return 1;
        }
    }
    return 0;
}

sub notify {
    my %p = validate( @_,
                      { id => { type => SCALAR },
                        title => { type => SCALAR },
                        desc => { type => SCALAR } } );
    
    return if $DB->{notified}{$p{id}};
    
    if ( $CONFIG{test} ) {
        print "NOTIFY $p{id}\n    $p{title}\n";
        return;
    }
    
    $DB->{notified}{$p{id}} = 1;

    sendmail( %{$CONFIG{sendmail}},
              Subject => $p{title},
              Text => $p{desc} );
}

sub rss_feed_from_url {
    my %p = validate( @_,
                      { url => { type => SCALAR } } );
    my $url = delete $p{url};
    
    my $parser = XML::RSS->new;
    $parser->parse( get( url => $url ) );
    return $parser;
}

sub html2text {
    my %p = validate( @_,
                      { html => { type => SCALAR } } );
    return HTML::FormatText->format_string( $p{html} );
}

sub get {
    my %p = validate( @_,
                      { url => { type => SCALAR } } );
    my $url = delete $p{url};

    my $content = $CACHE->get( $url );
    if ( not defined $content ) {
        my $www = WWW::Mechanize->new;
        $www->get( $url );
        $content = $www->content;
        $CACHE->set( $url, $content, $CONFIG{wget_expiration} );
    }

    return $content;
}
Replies are listed 'Best First'.
Re: jobs.perl.org watcher
by salva (Canon) on Jun 07, 2005 at 20:50 UTC
      It might be - but then I'd have to write more code. As-is, I already have to scrape web pages so it makes some sense to just keep it in the same domain.

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://464439]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others musing on the Monastery: (4)
As of 2024-03-29 05:50 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found