Category:
Author/Contact Info Josh Jore
Description:

This is a convenient way to download some classes from MIT's OpenCourseware website.

#!/opt/perl-5.10.0/bin/perl
use strict;
use warnings;
use feature ':5.10';
use File::Path 'mkpath';
BEGIN { $INC{'OpenCourseware/Mech.pm'} = 1 }

my $mech = OpenCourseware::Mechanize->new;
for my $pattern ( @ARGV ) {
  for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ 
+) ) {
    $mech->get( $url );

    my $path = $mech->course_filepath;
    mkpath( $path );
  
    $mech->mirror_syllabus;
    $mech->mirror_readings;
    $mech->mirror_assignments;
    $mech->mirror_exams;
    $mech->mirror_videos;
  }
}

package OpenCourseware::Mechanize;
use strict;
use warnings;
use feature ':5.10';
use WWW::Mechanize     ();
use HTML::Tidy         ();
use XML::LibXML        ();
use HTML::TableExtract ();
use File::Slurp        ();
use LWP::Simple        ();

use constant {
  XML_PARSER => XML::LibXML->new,
  HTML_TIDY  => HTML::Tidy->new
};

BEGIN {
  @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize';

  XML_PARSER->no_network( 1 );

  for ( [syllabus => qr/^Syllabus\z/, 'syllabus.txt' ],
        [readings => qr/^Readings\z/, 'readings.txt' ], ) {
    my ( $name, $text_regex, $file ) = @$_;
    eval <<"AAA";
       sub mirror_$name {
         my ( \$self ) = \@_;
         my \$path = \$self->course_filepath;

         \$self->follow_link( text_regex => \$text_regex );
         eval {
           File::Slurp::write_file( "\$path/\$file", \$self->content( 
+format => 'text' ) );
         };
         my \$e = \$\@;
         \$self->back;
         die \$e if \$e;
       }
AAA
  }

  for ( [assignments => ['ASSIGNMENTS','SOLUTIONS TO CHALLENGE PROBLEM
+S'], qr/^Assignments\z/],
        [exams => ['EXAMS','SOLUTIONS'], qr/^Exams/], ) {
    my ( $name, $headers, $text_regex ) = @$_;

    eval <<"BBB";
      sub mirror_$name {
        my ( \$self ) = \@_;
        my \$path = \$self->course_filepath;
        my \$te = HTML::TableExtract->new( headers => \$headers,
                                           keep_html => 1 );

        \$self->follow_link( text_regex => \$text_regex );
        eval {
          \$te->parse( \$self->content );
          for ( \$te->tables ) {
            for ( \$_->rows ) {
              for ( grep { defined } \@\$_ ) {
                my ( \$url ) = m{"([^"]+)};
                my ( \$file ) = \$url =~ m{/([^/]+)\\z};

                LWP::Simple::mirror( "http://ocw.mit.edu\$url", "\$pat
+h/\$file" );
              }
            }
          }
        };
        my \$e = \$\@;
        \$self->back;
        die \$e if \$e;
      }
BBB
  }
}

sub mirror_videos {
  my ( $self ) = @_;
  my $path = $self->course_filepath;
  $self->follow_link( text_regex => qr/^Video Lectures\z/ );
  eval {
    my $doc = $xml_parser->parse_html_string( $html_tidy->clean( $self
+->content ) );
    my $nth = 1;
    for my $tr ( $doc->findnodes( '//tr' ) ) {
      my ( $name ) =
        grep { /\S/ }
        map { $_->data }
        $tr->findnodes( 'td[ position() = 1 ]/text()' );

      my @videos =
        grep { /\S/ }
        map { $_->value }
        $tr->findnodes( 'td[ position() = 2 ]/a/attribute::href' );

      if ( $videos[-1] ) {
        # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/15
+99/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi
        my ( $url ) = LWP::Simple::get( $videos[-1] ) =~ m{(?<=mitstor
+age.download.akamai.com/)(.+)};
        $url = "http://ocw.mit.edu/ans$url";
        my $file = sprintf "$path/%02d - $name.rm", $nth++;
        LWP::Simple::mirror( $url, $file );
      }
    }
  };
  my $e = $@;
  $self->back;
  die $e if $e;
}

sub course_filepath {
  my ( $self ) = @_;
  return
    join '/',
    grep { length }
    map {
      s[^[[:punct:]]+][];
      s[[[:punct:]]+\z][];
      $_;
    }
    $self->title =~ /([^|\s]+(?:\s+[^|\s+]+))/g;
}

sub find_all_courses {
  my ( $self, %p ) = @_;

  my $te = HTML::TableExtract->new( headers => [ 'Course Title' ],
                                    keep_html => 1 );

  $mech->get( 'http://ocw.mit.edu/OcwWeb/web/courses/courses/index.htm
+' );
  $te->parse( $mech->content );

  my @urls;
  for ( $te->tables ) {
    for ( $_->rows ) {
      when ($_ ~~ $p{title_regexp}) {
      push @urls, $_ ~~ /<a href="([^"]+)/ ? "http://ocw.mit.edu$1" : 
+();
      }
    }
  }

  return unique( @urls );
}

sub unique {
  my %seen;
  return
    grep { not $seen{$_}++ }
    @_;
}