This snippet can fetch section code,poetry, obfu, cufp, and snippets from Perlmonks, and then save them in an organized structure.
Thanks
#!/usr/bin/perl use strict; use IO::All; use LWP::Simple; use Data::Dumper; use HTML::Entities; use Template::Extract; my $base = 'http://perlmonks.org/'; my $extor = Template::Extract->new; my $default_next_template = qr'<td align=right width=100><a href="(.+? +)">Next entries--></a> </td></tr></table>'; my $default_desc_template = qr'<data>.+?<field name="doctext">(.+?)</f +ield>.+?</data>'s; my $default_template = <<'.'; [% FOREACH record %][% ... %] <tr class = "post_head">[% ... %] <td>[% ... %] <a HREF="?node_id=[% node_id %]">[% title %]</a><br />[% ... %] on[% ... %] by <a HREF="[% ... %]">[% author %]</a>[% ... %] </td>[% ... %] </tr> [% END %] . my %profile = ( code => { url => $base."?node=Code Catacombs", desc_template => qr'<field name="codedescription">(.+?)</field +>'s, }, poetry => { url => $base.'?node=Perl Poetry', }, cooluses => { url => $base.'?node=Cool Uses for Perl', }, obfu => { url => $base.'?node=Obfuscated Code', }, snippets => { url => $base.'?node=Snippets Section', next_template => qr' <A HREF="(.+?)">Next 20--></A>' +, desc_template => qr'<field name="snippetdesc">(.+?)</field>'s, template => <<'.', <UL>[% FOREACH record %]<li><a HREF="?node_id=[% node_id %]">[% title +%]</a>[% ... %]by[% ... %]<a HREF="[% ... %]">[% author %]</a>[% END +%]<BR><BR> . }, ); sub clean_data { my $data = shift; $$data =~ s/\r\n/\n/sg; $$data =~ s/^\n+//s; $$data =~ s/\n+$/\n/s; $$data = decode_entities($$data); } mkdir "./perlmonks"; mkdir "./perlmonks/$_" for 0..9; foreach my $p (keys %profile){ while(1){ print "<< $p >>\n"; print $profile{$p}->{url},$/; my $doc = get($profile{$p}{url}); my $rec = $extor->extract( (defined($profile{$p}->{template}) ? $profile{$p}->{template} : $default_template) => $doc ); if(defined($profile{$p}->{next_template})){ $doc =~ m,$profile{$p}->{next_template},; $profile{$p}->{url} = $1 ? $base . $1 : undef; } else { $doc =~ m,$default_next_template,; $profile{$p}->{url} = $1 ? $base . $1 : undef; } my $dup_count = 0; foreach my $r (@{$rec->{record}}){ my $lc = substr($r->{node_id}, length($r->{node_id})-1); if( -e "./perlmonks/$lc/$r->{node_id}" ){ $dup_count++; next; } print '-- ', join(q/ /, @{$r}{qw(title author node_id)}), $/; # CODE my $data = get($base.'?node_id='.$r->{node_id}.';displaytype=dis +playcode'); clean_data(\$data); next unless $data; mkdir "./perlmonks/$lc/$r->{node_id}"; io("./perlmonks/$lc/$r->{node_id}/code")->print($data); # META io("./perlmonks/$lc/$r->{node_id}/meta") ->print(decode_entities($r->{title}).$/.$r->{author}); # DESCRIPTIONS $data = get($base.'?displaytype=xml;node_id='.$r->{node_id}); if($profile{$p}->{desc_template}){ ($data) = ($data =~ m($profile{$p}->{desc_template})); } else { ($data) = ($data =~ m($default_desc_template)); } clean_data(\$data); io("./perlmonks/$lc/$r->{node_id}/desc")->print($data); # last; } if($dup_count > @{$rec->{record}}/2){ print "No further updates from $p\n"; last; } } } __END__
In reply to MonkMirror.pl by xern
| For: | Use: | ||
| & | & | ||
| < | < | ||
| > | > | ||
| [ | [ | ||
| ] | ] |