#!/opt/perl-5.10.0/bin/perl
#
# Mirror MIT OpenCourseWare course material to the local disk.
#
# Usage: <script> PATTERN [PATTERN ...]
#
# Every PATTERN is compiled as a regular expression and matched against the
# course index table.  For each matching course the syllabus, readings,
# assignments, exams and video lectures are fetched into a directory whose
# name is derived from the course page title.
use strict;
use warnings;
use feature ':5.10';
use File::Path 'mkpath';

# NOTE(review): this marks 'OpenCourseware/Mech.pm' as loaded, although the
# package defined below is OpenCourseware::Mechanize and nothing in this file
# requires either name -- confirm whether the key was meant to be
# 'OpenCourseware/Mechanize.pm'.
BEGIN { $INC{'OpenCourseware/Mech.pm'} = 1 }

my $mech = OpenCourseware::Mechanize->new;

for my $pattern (@ARGV) {
    for my $url ( $mech->find_all_courses( title_regexp => qr/$pattern/ ) ) {
        $mech->get($url);

        my $path = $mech->course_filepath;
        if ( !length $path ) {
            # An empty path would make every "$path/$file" below resolve to
            # the filesystem root.
            warn "Cannot derive a directory name for $url; skipping\n";
            next;
        }
        mkpath($path);

        $mech->mirror_syllabus;
        $mech->mirror_readings;
        $mech->mirror_assignments;
        $mech->mirror_exams;
        $mech->mirror_videos;
    }
}

package OpenCourseware::Mechanize;
use strict;
use warnings;
use feature ':5.10';
use WWW::Mechanize     ();
use HTML::Tidy         ();
use XML::LibXML        ();
use HTML::TableExtract ();
use File::Slurp        ();
use LWP::Simple        ();
use constant {
    XML_PARSER => XML::LibXML->new,
    HTML_TIDY  => HTML::Tidy->new,
    SITE_ROOT  => 'http://ocw.mit.edu',
};

# The mirror_* methods are installed inside BEGIN because the main script at
# the top of the file runs before any run-time statement of this package
# would be reached.
BEGIN {
    @OpenCourseware::Mechanize::ISA = 'WWW::Mechanize';
    XML_PARSER->no_network(1);

    # mirror_syllabus / mirror_readings: follow the section link and save
    # the page, rendered as plain text, into the course directory.
    for my $spec (
        [ syllabus => qr/^Syllabus\z/, 'syllabus.txt' ],
        [ readings => qr/^Readings\z/, 'readings.txt' ],
      )
    {
        my ( $name, $text_regex, $file ) = @$spec;

        my $method = sub {
            my ($self) = @_;

            # Must be computed on the course home page, before the section
            # page (with its own title) is loaded.
            my $path = $self->course_filepath;

            return $self->_visit_section(
                $text_regex,
                sub {
                    File::Slurp::write_file( "$path/$file",
                        $self->content( format => 'text' ) );
                },
            );
        };

        no strict 'refs';
        *{ __PACKAGE__ . "::mirror_$name" } = $method;
    }

    # mirror_assignments / mirror_exams: follow the section link, locate the
    # table with the given column headers and mirror the first quoted URL of
    # every table cell into the course directory.
    for my $spec (
        [
            assignments =>
              [ 'ASSIGNMENTS', 'SOLUTIONS TO CHALLENGE PROBLEMS' ],
            qr/^Assignments\z/,
        ],
        [ exams => [ 'EXAMS', 'SOLUTIONS' ], qr/^Exams/ ],
      )
    {
        my ( $name, $headers, $text_regex ) = @$spec;

        my $method = sub {
            my ($self) = @_;
            my $path   = $self->course_filepath;
            my $te     = HTML::TableExtract->new(
                headers   => $headers,
                keep_html => 1,
            );

            return $self->_visit_section(
                $text_regex,
                sub {
                    $te->parse( $self->content );
                    for my $table ( $te->tables ) {
                        for my $row ( $table->rows ) {
                            for my $cell ( grep { defined } @$row ) {

                                # First double-quoted string in the cell's
                                # HTML; cells without one are skipped.
                                my ($url) = $cell =~ m{"([^"]+)};
                                next if !defined $url;

                                my ($file) = $url =~ m{/([^/]+)\z};
                                next if !defined $file;

                                _mirror_file( SITE_ROOT . $url,
                                    "$path/$file" );
                            }
                        }
                    }
                },
            );
        };

        no strict 'refs';
        *{ __PACKAGE__ . "::mirror_$name" } = $method;
    }
}

# Private helper: follow the link whose text matches $text_regex, run
# $action->() on the resulting page, then always step back to the page we
# came from.  An exception thrown by $action is re-thrown after stepping back.
# Returns 1 when the section was visited, nothing when the current page has
# no matching link.
sub _visit_section {
    my ( $self, $text_regex, $action ) = @_;

    # Without a matching link nothing would be pushed onto the page history,
    # so the back() below would leave the course page instead of the section.
    return if !$self->find_link( text_regex => $text_regex );

    $self->follow_link( text_regex => $text_regex );

    my $ok    = eval { $action->(); 1 };
    my $error = $@;
    $self->back;
    die( $error || "Unknown error while mirroring section\n" ) if !$ok;

    return 1;
}

# Private helper (plain function): mirror $url into $file and warn when the
# server answers with an error status.  Returns the HTTP status code.
sub _mirror_file {
    my ( $url, $file ) = @_;

    my $status = LWP::Simple::mirror( $url, $file );
    warn "Failed to mirror $url to $file (HTTP status $status)\n"
      if $status >= 400;

    return $status;
}

# Follow the "Video Lectures" link and mirror one file per table row that
# carries links in its second column.  The last link of the row is fetched
# and the path following "mitstorage.download.akamai.com/" in its content is
# downloaded from SITE_ROOT/ans<path> as "<NN> - <name>.rm", where <name> is
# the first non-blank text node of the row's first column.
sub mirror_videos {
    my ($self) = @_;
    my $path = $self->course_filepath;

    return $self->_visit_section(
        qr/^Video Lectures\z/,
        sub {
            my $doc = XML_PARSER->parse_html_string(
                HTML_TIDY->clean( $self->content ) );
            my $nth = 1;

            for my $tr ( $doc->findnodes('//tr') ) {
                my ($name) =
                  grep { /\S/ }
                  map  { $_->data }
                  $tr->findnodes('td[ position() = 1 ]/text()');
                my @videos =
                  grep { /\S/ }
                  map  { $_->value }
                  $tr->findnodes('td[ position() = 2 ]/a/attribute::href');
                next if !@videos;

                # Example of the fetched content:
                # pnm://a1599.v78709.c7870.g.vr.akamaistream.net/ondemand/7/1599/7870/v0001/mitstorage.download.akamai.com/7870/18/18.06/vi
                my $playlist = LWP::Simple::get( $videos[-1] );
                if ( !defined $playlist ) {
                    warn "Could not fetch $videos[-1]\n";
                    next;
                }

                my ($stream_path) = $playlist
                  =~ m{(?<=mitstorage\.download\.akamai\.com/)(.+)};
                next if !defined $stream_path;

                $name = 'untitled' if !defined $name;

                # $path and $name are passed as arguments so that a literal
                # '%' in either cannot corrupt the format string.
                my $file = sprintf '%s/%02d - %s.rm', $path, $nth++, $name;
                _mirror_file( SITE_ROOT . "/ans$stream_path", $file );
            }
        },
    );
}

# Build a relative directory path from the current page title: the title is
# cut into pieces by the regex below, leading and trailing punctuation is
# stripped from each piece, empty pieces are dropped and the rest is joined
# with '/'.  Returns '' when the page has no title.
sub course_filepath {
    my ($self) = @_;

    my $title = $self->title;
    return '' if !defined $title;

    # NOTE(review): as written each piece is exactly two whitespace-separated
    # words, so single-word title segments are dropped and longer ones are
    # split; the pattern may have lost a quantifier such as '*' after the
    # (?:...) group -- confirm against real page titles before changing it,
    # since that alters the directory layout.
    my @parts;
    for my $piece ( $title =~ /([^|\s]+(?:\s+[^|\s+]+))/g ) {
        my $part = $piece;
        $part =~ s[^[[:punct:]]+][];
        $part =~ s[[[:punct:]]+\z][];
        push @parts, $part if length $part;
    }

    return join '/', @parts;
}

# Fetch the course index and return the de-duplicated list of absolute course
# URLs whose "Course Title" table row matches the given pattern.
#
# Named parameters:
#   title_regexp - pattern a cell of the row must match; when omitted every
#                  row is accepted.
sub find_all_courses {
    my ( $self, %p ) = @_;

    my $title_re = $p{title_regexp} // qr/(?s:.*)/;
    my $te       = HTML::TableExtract->new(
        headers   => ['Course Title'],
        keep_html => 1,
    );

    $self->get( SITE_ROOT . '/OcwWeb/web/courses/courses/index.htm' );
    $te->parse( $self->content );

    my @urls;
    for my $table ( $te->tables ) {
        for my $row ( $table->rows ) {
            my @cells = grep { defined } @$row;
            next if !grep { $_ =~ $title_re } @cells;

            # First link found in the row, if any.
            my ($href) = map { /<a href="([^"]+)/ ? $1 : () } @cells;
            push @urls, SITE_ROOT . $href if defined $href;
        }
    }

    return unique(@urls);
}

# Return the arguments with duplicates removed, keeping first occurrences in
# their original order.
sub unique {
    my %seen;
    return grep { not $seen{$_}++ } @_;
}

# Everything after this marker is text copied from the forum page the script
# was posted on, not Perl.
__END__

In reply to Download MIT OpenCourseware by diotalevi

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.