This snippet can fetch section code,poetry, obfu, cufp, and snippets from Perlmonks, and then save them in an organized structure.

Thanks

#!/usr/bin/perl use strict; use IO::All; use LWP::Simple; use Data::Dumper; use HTML::Entities; use Template::Extract; my $base = 'http://perlmonks.org/'; my $extor = Template::Extract->new; my $default_next_template = qr'<td align=right width=100><a href="(.+? +)">Next entries--></a>&nbsp;</td></tr></table>'; my $default_desc_template = qr'<data>.+?<field name="doctext">(.+?)</f +ield>.+?</data>'s; my $default_template = <<'.'; [% FOREACH record %][% ... %] <tr class = "post_head">[% ... %] <td>[% ... %] <a HREF="?node_id=[% node_id %]">[% title %]</a><br />[% ... %] on[% ... %] by <a HREF="[% ... %]">[% author %]</a>[% ... %] </td>[% ... %] </tr> [% END %] . my %profile = ( code => { url => $base."?node=Code Catacombs", desc_template => qr'<field name="codedescription">(.+?)</field +>'s, }, poetry => { url => $base.'?node=Perl Poetry', }, cooluses => { url => $base.'?node=Cool Uses for Perl', }, obfu => { url => $base.'?node=Obfuscated Code', }, snippets => { url => $base.'?node=Snippets Section', next_template => qr'&nbsp;&nbsp;<A HREF="(.+?)">Next 20--&gt;</A>' +, desc_template => qr'<field name="snippetdesc">(.+?)</field>'s, template => <<'.', <UL>[% FOREACH record %]<li><a HREF="?node_id=[% node_id %]">[% title +%]</a>[% ... %]by[% ... %]<a HREF="[% ... %]">[% author %]</a>[% END +%]<BR><BR> . }, ); sub clean_data { my $data = shift; $$data =~ s/\r\n/\n/sg; $$data =~ s/^\n+//s; $$data =~ s/\n+$/\n/s; $$data = decode_entities($$data); } mkdir "./perlmonks"; mkdir "./perlmonks/$_" for 0..9; foreach my $p (keys %profile){ while(1){ print "<< $p >>\n"; print $profile{$p}->{url},$/; my $doc = get($profile{$p}{url}); my $rec = $extor->extract( (defined($profile{$p}->{template}) ? $profile{$p}->{template} : $default_template) => $doc ); if(defined($profile{$p}->{next_template})){ $doc =~ m,$profile{$p}->{next_template},; $profile{$p}->{url} = $1 ? $base . $1 : undef; } else { $doc =~ m,$default_next_template,; $profile{$p}->{url} = $1 ? $base . $1 : undef; } my $dup_count = 0; foreach my $r (@{$rec->{record}}){ my $lc = substr($r->{node_id}, length($r->{node_id})-1); if( -e "./perlmonks/$lc/$r->{node_id}" ){ $dup_count++; next; } print '-- ', join(q/ /, @{$r}{qw(title author node_id)}), $/; # CODE my $data = get($base.'?node_id='.$r->{node_id}.';displaytype=dis +playcode'); clean_data(\$data); next unless $data; mkdir "./perlmonks/$lc/$r->{node_id}"; io("./perlmonks/$lc/$r->{node_id}/code")->print($data); # META io("./perlmonks/$lc/$r->{node_id}/meta") ->print(decode_entities($r->{title}).$/.$r->{author}); # DESCRIPTIONS $data = get($base.'?displaytype=xml;node_id='.$r->{node_id}); if($profile{$p}->{desc_template}){ ($data) = ($data =~ m($profile{$p}->{desc_template})); } else { ($data) = ($data =~ m($default_desc_template)); } clean_data(\$data); io("./perlmonks/$lc/$r->{node_id}/desc")->print($data); # last; } if($dup_count > @{$rec->{record}}/2){ print "No further updates from $p\n"; last; } } } __END__

Replies are listed 'Best First'.
Re: MonkMirror.pl
by tye (Sage) on Sep 20, 2004 at 15:32 UTC

    Please don't spider PerlMonks (nor any web site) without adding delays to reduce the impact on the site servers.

    - tye        

Re: MonkMirror.pl
by Aristotle (Chancellor) on Sep 20, 2004 at 10:04 UTC
      You are right. I found it probably so just after having posted it. grr.......
Re: MonkMirror.pl
by theorbtwo (Prior) on Sep 21, 2004 at 10:46 UTC

    Like tye said, please, use delays. In addition to that, the "xml" link at the top of every page will give you XML carrying the data on that page, so you don't have to parse the HTML. Changes to the XML are done with careful consideration of how they effect existing users. Changes to the HTML we only consider changes in the visual rendering of web browsers, not accessability to spiders, such as this one.


    Warning: Unless otherwise stated, code is untested. Do not use without understanding. Code is posted in the hopes it is useful, but without warranty. All copyrights are relinquished into the public domain unless otherwise stated. I am not an angel. I am capable of error, and err on a fairly regular basis. If I made a mistake, please let me know (such as by replying to this node).