http://qs1969.pair.com?node_id=464439
Category: Miscellaneous
Author/Contact Info "Joshua ben Jore" <jjore@cpan.org>
Description:

This monitors jobs.perl.org (and eventually other sites) for postings of interest. There is separate code for watching the Minnesota Job Bank but since that's very specific, I left it out.

crontab entry to run once an hour, at 32 minutes past.

32 * * * * /home/.../bin/JobWatcher
#!perl
use strict;
use warnings;
use Cache::FileCache;
# use Data::Dump::Streamer;
use DBM::Deep;
use HTML::FormatText;
use Mail::Sendmail 'sendmail';
use Params::Validate ':all';
use WWW::Mechanize;
use XML::RSS;
use vars ( '$DB', '$CACHE', '%CONFIG' );

main( sendmail => { smtp => '...',
                    From => '...',
                    To => '...' } );
exit;

sub main {
    my %p = @_;
    
    initialize( %p );
    do_jobs_perl_org( category => 'MN perl',
                      url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
                      constraints => [ sub { /minnesota/i } ] );
    do_jobs_perl_org( category => 'Tele perl',
                      url => 'http://jobs.perl.org/rss/telecommute.rss
+?limit=20' );
    do_jobs_perl_org( category => 'Domino perl',
                      url => 'http://jobs.perl.org/rss/standard.rss?li
+mit=20',
                      constraints => [ sub { /\b(?:domino|lotus|notes)
+\b/i } ] );
    # do_mn_job_bank( category => 'MN perl',
    #           constraints => [ sub { /\bperl\b/i } ],
    #           username => '...',
    #           password => '...',
    #           miles => 25,
    #           zip => ...,
    #           keyword => 'perl' );
    #do_mn_job_bank( category => 'MN Domino',
    #           constraints => [ sub { /\b(?:domino|lotus\s+notes)\b/i
+ } ],
    #           username => '...',
    #           password => '...' );
    
    return;
}

sub initialize {
    my %p = validate( @_,
                      { test => { type => SCALAR,
                                  optional => 1,
                                  default => 0 },
                        cache_namespace => { type => SCALAR,
                                             optional => 1,
                                             default => '1 week' },
                        cache_autopurge_interval => { type => SCALAR,
                                                      optional => 1,
                                                      default => '1 ho
+ur' },
                        db_filename => { type => SCALAR,
                                         optional => 1,
                                         default => "$ENV{HOME}/.jobwa
+tcher.db" },
                        wget_expiration => { type => SCALAR,
                                             optional => 1,
                                             default => '1 hour' },
                        sendmail => { type => HASHREF } } );
    
    $DB = DBM::Deep->new( $p{db_filename} );
    $CACHE = Cache::FileCache->new( { namespace => delete $p{cache_nam
+espace},
                                      autopurge_interval => delete $p{
+cache_autopurge_interval} } );
    
    @CONFIG{keys %p} = values %p;
    
    return;
}

sub do_mn_job_bank {
    my %p = validate( @_,
                      { category => { type => SCALAR },
                        constraints => { type => ARRAYREF,
                                         default => [ sub { 1 } ] },
                        username => { type => SCALAR },
                        password => { type => SCALAR },
                        zip => { type => SCALAR },
                        miles => { type => SCALAR },
                        keyword => { type => SCALAR },} );
    my $www = WWW::Mechanize->new;

    # Log in
    $www->get( 'http://www.mnworks.org/jsli.cfm' );
    $www->submit_form( fields => { li_user_name => $p{username},
                                   li_password => $p{password} },
                       button => 'Submit' );

    # Search
    $www->get( 'http://www1.mnworks.org/jobseeker/js_srch_pg.cfm' );
    $www->submit_form( fields => { i_geo_search_flag => 'zip_yes',
                                   zip_miles => $p{miles},
                                   zip_code => $p{zip},
                                   Submit => 'More Search Options' } )
+;
    print $www->content;
    $www->submit_form( fields => { # i_geo_search_flags => 'zip_yes',
                                   # zip_miles => $p{miles},
                                   # zip_code => $p{zip},
                                   i_job_description => $p{keyword} } 
+);
}

sub do_jobs_perl_org {
    my %p = validate( @_,
                      { category => { type => SCALAR },
                        url => { type => SCALAR },
                        constraints => { type => ARRAYREF,
                                         default => [ sub { 1 } ] } } 
+);
    my $rssurl = $p{url};
    
    my $feed = rss_feed_from_url( url => $rssurl );
    for my $job ( @{$feed->{items}} ) {
        my $url = $job->{link};
        my $title = $job->{title};
        my $desc = html2text( html => get( url => $url ) );
        
        # Remove spaces @ the start of lines
        {
            my %spaces;
            ++ $spaces{$_}
              for
                map length(),
                  $desc =~ /^( +)/gm;
            
            my $spaces_to_remove
              = ( sort { $spaces{$b} <=> $spaces{$a} }
                  keys %spaces )[0];
            my $space_to_remove = " " x $spaces_to_remove;
            $desc =~ s/^$space_to_remove//mg;
        }
        
        # Remove boilerplate & navigation at the top
        $desc =~ s/
            (?s:.+?)
            (.+)[\r\n]+
            [ \t]*=+[ \t]*[\r\n]+
        /$1\n/x;
        
        # Now collapse the description fields
        $desc =~ s((.+)(?=^Description:)){
            # Now I have a handle on just the key/value area ahead of 
+the
            # description
            local $_ = $1;
            
            # Collapse field lines together
            s/^(.+:)[ \t]*[\r\n]+
               (.+)/$1 $2/mgx;
            
            # Remove empty lines
            s/^[ \t]*[\r\n]+//gm;
            
            # Remove trailing / leading whitespace
            s/^\s+//;
            s/\s+$//;
            
            # Add in the \n I want.
            "$_\n";
        }sem;
        
        # Finally remove the boilerplate from the botton.
        $desc = reverse $desc;
        $desc =~ s/(?s:.+?)
                   ^ [ \t]* -+  [ \t]* [\r\n]+
                   ^ [ \t]*            [\r\n]+
        //xsm;
#       $desc =~ s/.+?^[ \t]*-{30,}[ \t]*[\r\n]+^\s+//s;
        $desc = reverse $desc;
        $desc =~ s/\s+$//;
        $desc .= "\n";
        
        if ( passes_constraints( constraints => $p{constraints},
                                 title => $title,
                                 desc => $desc ) ) {
            notify( id => $url,
                    title => $title,
                    desc  => "$p{category} $desc" );
        }
    }

    return;
}

sub passes_constraints {
    my %p = validate( @_,
                      { constraints => { type => ARRAYREF },
                        title => { type => SCALAR },
                        desc => { type => SCALAR } } );
    my @constraints = @{delete $p{constraints}};
    if ( not @constraints ) {
        @constraints = sub { 1 };
    }
    
    local $_ = "$p{title}\n$p{desc}";
    
    # When 
    if ( /Job no longer in database/ ) {
        return 0;
    }
    for my $constraint ( @constraints ) {
        if ( $constraint->() ) {
            return 1;
        }
    }
    return 0;
}

sub notify {
    my %p = validate( @_,
                      { id => { type => SCALAR },
                        title => { type => SCALAR },
                        desc => { type => SCALAR } } );
    
    return if $DB->{notified}{$p{id}};
    
    if ( $CONFIG{test} ) {
        print "NOTIFY $p{id}\n    $p{title}\n";
        return;
    }
    
    $DB->{notified}{$p{id}} = 1;

    sendmail( %{$CONFIG{sendmail}},
              Subject => $p{title},
              Text => $p{desc} );
}

sub rss_feed_from_url {
    my %p = validate( @_,
                      { url => { type => SCALAR } } );
    my $url = delete $p{url};
    
    my $parser = XML::RSS->new;
    $parser->parse( get( url => $url ) );
    return $parser;
}

sub html2text {
    my %p = validate( @_,
                      { html => { type => SCALAR } } );
    return HTML::FormatText->format_string( $p{html} );
}

sub get {
    my %p = validate( @_,
                      { url => { type => SCALAR } } );
    my $url = delete $p{url};

    my $content = $CACHE->get( $url );
    if ( not defined $content ) {
        my $www = WWW::Mechanize->new;
        $www->get( $url );
        $content = $www->content;
        $CACHE->set( $url, $content, $CONFIG{wget_expiration} );
    }

    return $content;
}
Replies are listed 'Best First'.
Re: jobs.perl.org watcher
by salva (Canon) on Jun 07, 2005 at 20:50 UTC
      It might be - but then I'd have to write more code. As-is, I already have to scrape web pages so it makes some sense to just keep it in the same domain.