in reply to Run your own perlmonks!

As promised, the following is the code for getnode.pl. This will take a node out of pm and into your local perlmonks. Note that it's only as good as the data that it can actualy get -- it won't allow you to see things that you otherwise could not.

To use, first replace pmusername, pmpassword, dbusername, and dbpassword with the approprate values (preferably for a pmdev or god on the PM side, and with a user that has create table privileges in the named database on the database side). Then call it with a nodeid, a nodename, or a nodename and type. I suggest you first start with the pmmodule nodes, then the dbtable nodes, then the nodetype nodes (which you have to do partaly manualy for the time being, mostly because I'm lazy and haven't written it).

There are also many things you have to patch or set up manualy to get a semi-working PM. I'll document those at a later point.

Note that this code is horribly written, horribly innefficent, horribly incomplete, and generaly horrible. Please, watch this space for updates.

#!/usr/local/bin/perl5.8.0 # -*- cperl -*- # Version 0.02 released Mon Sep 29 09:32:41 CEST 2003 # -- Fixed stupid syntax errors # Version 0.01 released Mon Sep 29 09:14:41 CEST 2003 use warnings; use strict; use LWP::Simple ('get', '$ua'); use URI::Escape; use XML::Simple; use Data::Dumper; use HTML::TableExtract; use HTML::TreeBuilder; use DBI; + my $dbh=DBI->connect('dbi:mysql:perlmonks', 'dbusername', 'dbpassword' +); my $pmuser=uri_escape('pmusername'); my $pmpasswd=uri_escape('pmpassword'); my $node=uri_escape(shift); my $nodetype=shift; + $ua->cookie_jar({}); $ua->cookie_jar() or die "Couldn't create cookie jar: $!"; + get("http://perlmonks.org/?op=login;user=$pmuser;passwd=$pmpasswd") or die "Couldn't log in to perlmonks: $!"; + #warn "http://perlmonks.org/?node=$node;displaytype=xml" . # (defined $nodetype?";type=$nodetype":''); my $xml = get("http://perlmonks.org/?node=$node;displaytype=xml" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; + $xml = XMLin($xml); print Dumper $xml; + my $type = nowhitespace($xml->{type}{content}); + my $NODE; + $NODE->{node}{node_id}= $xml->{id}; $NODE->{node}{type_nodetype}=$xml->{type}{id}; $NODE->{node}{title}= $xml->{title}; $NODE->{node}{author_user}= $xml->{author}{id}; $NODE->{node}{createtime}= $xml->{created}; $NODE->{node}{lastupdate}= $xml->{updated}; # $NODE->{node}{hits} # $NODE->{node}{reputation} # $NODE->{node}{votescast} # $NODE->{node}{lockedby_user} # $NODE->{node}{locktime} # $NODE->{node}{core} # $NODE->{node}{package} # $NODE->{node}{postbonus} # $NODE->{node}{ucreatetime} # $NODE->{node}{node_iip} + + if ($type eq 'dbtable') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $table = HTML::TableExtract->new( headers => [qw(Field Type Null Default Key Extr +a)] ); $table->parse($html); + $table = $table->first_table_state_found(); if (!$table) { die "Couldn't extract HTML table"; } + my $statement = "CREATE TABLE $node ("; + foreach ($table->rows()) { my %col; @col{qw(field type null default key extra)} = map {$_ eq chr(0xA +0) ? undef : $_} @$_; $statement .= " $col{field} $col{type}"; if (!$col{null}) { $statement .= " NOT"; } $statement .= " NULL"; if (defined $col{default}) { $statement .= " DEFAULT '$col{default}'"; } if (defined($col{extra}) && $col{extra} =~ /auto_increment/) { $statement .= " AUTO_INCREMENT"; } # if (defined($col{key}) && $col{key} eq 'PRI') { # $statement .= " PRIMARY KEY"; # } $statement .= ","; } + ## Now do it all again for the indeces. $html = get("http://perlmonks.org/?node=$node;type=dbtable;display +type=index") or die "Couldnt't get node $node: $!"; + $table = HTML::TableExtract->new (headers => [qw(Name Dup Seq Column Coll Card SubPt Packed Comme +nt)]); $table->parse($html); + $table = $table->first_table_state_found(); if (!$table) { die "Couldn't extract HTML table for indeces"; } + my %indexes; foreach ($table->rows()) { my %col; @col{qw(name dup seq column coll card subpt packed comment)} = m +ap {$_ eq chr(0xA0) ? undef : $_} @$_; $indexes{$col{name}}{dup} = $col{dup}; # seq starts at 1, but we want to map to a perl array. $indexes{$col{name}}{cols}[$col{seq}-1] = $col{column}; } + foreach my $idxname (keys %indexes) { # PRIMARY KEY: Named PRIMARY # KEY: "Norm. a synonym for INDEX..." # INDEX: # UNIQUE INDEX: if ($idxname eq 'PRIMARY') { $statement .= " PRIMARY KEY"; } else { $statement .= " UNIQUE" if (!$indexes{dup}); $statement .= " INDEX"; } $statement .= " ("; $statement .= join(', ', @{$indexes{$idxname}{cols}}); $statement .= "), "; } #FIXME: Join $statement =~ s?,\s*$?);?; print $statement, "\n"; my $sth = $dbh->prepare($statement) or die "Preparing: $DBI::errst +r"; $sth->execute or die "Executing: $DBI::errstr"; } elsif ($type eq 'linktype') { # There's nothing here; these are just node nodes by another name. } elsif ($type eq 'strangedoc') { my $html = get("http://perlmonks.org/?node=$node;displaytype=viewc +ode" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'td', 'class', 'main_c +ontent'); my $pre = $main_content->look_down('_tag', 'pre'); my $code = $pre->as_text; $code=~s/^[ 0-9]{4}: //mg; $NODE->{document}{doctext} = $code; } elsif ($type eq 'note') { # Don't have note.rank or document.lastedit $NODE->{note}{parent_node} = nowhitespace($xml->{data}{field}{pare +nt_node}{content}); $NODE->{note}{root_node} = nowhitespace($xml->{data}{field}{root +_node}{content}); $NODE->{document}{doctext} = $xml->{data}{field}{doctext}{content} +; } elsif ($type eq 'opcode' or $type eq 'strangedoc') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content' +); print $main_content->dump; my $pre = $main_content->look_down('_tag', 'pre'); my $code = $pre->as_text; $code=~s/^[ 0-9]{4}: //mg; #htmlcode $NODE->{htmlcode}{code} = $code; } elsif ($type eq 'nodetype') { # Handle these by hand for now, please. } elsif ($type eq 'image') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content' +); my $img = $main_content->look_down('_tag', 'img'); my $p = $main_content->look_down('_tag', 'p'); $NODE->{image}{src} = $img->attr('src'); $NODE->{image}{alt} = $img->attr('alt'); # thumbsrc $NODE->{image}{description} = $p->as_text; } elsif ($type eq 'pmmodule') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $elem = $tree->look_down('_tag', 'font', 'size', '-1', sub { $_[0]->is_inside('tt'); }); my $file = IO::File->new(">".$NODE->{node}{title}) or die "Can't o +pen ".$NODE->{node}{title}." for writing"; print $file $elem->as_text; } elsif ($type eq 'nodelet') { #Table: nodelet # nltext $NODE->{nodelet}{nlcode} = $xml->{data}{field}{nlcode}{content}; # updateinterval # nlgoto $NODE->{nodelet}{parent_container} = $xml->{data}{field}{parent_co +ntainer}{content}+0; # lastupdate } elsif ($type eq 'htmlpage') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content' +); my %data; my @bolds = $main_content->look_down('_tag', 'b'); foreach (@bolds) { $_->normalize_content; my $key=$_->as_text; my $value= ($_->right)[1]; #Two to the right; there's a ": " t +ext element, then the A or I tag. $data{$key}=$value; print "$key: $value\n"; } #Table: htmlpage $NODE->{htmlpage}{pagetype_nodetype} = a_element_to_nodeid($data{p +agetype}); $NODE->{htmlpage}{displaytype} = nowhitespace($xml->{data}{field}{ +displaytype}{content}); $NODE->{htmlpage}{page} = $xml->{data}{field}{page}{content}; $NODE->{htmlpage}{parent_container} = a_element_to_nodeid($data{'p +arent container'}); if ($data{theme}->attr('_tag') eq 'i') { # /this htmlpage does not belong to a theme/ $NODE->{htmlpage}{ownedby_theme} = 0; } else { $NODE->{htmlpage}{ownedby_theme} = a_element_to_nodeid($data{t +heme}); } # mimetype -- no way to find? $NODE->{htmlpage}{mimetype} = 'text/html'; } elsif ($type eq 'htmlcode') { $NODE->{htmlcode}{code} = $xml->{data}{field}{content}; } elsif ($type eq 'setting' or $type eq 'theme') { use URI::Escape; my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my @vars; } elsif ($type eq 'setting' or $type eq 'theme') { use URI::Escape; my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my @vars; + my $table = HTML::TableExtract->new(headers => ['Setting', 'Value' +]); $table->parse($html); foreach my $row ($table->rows) { my $key = uri_escape($row->[0]); my $val = uri_escape($row->[1]); $val='+' if ($val eq ''); push @vars, "$key=$val"; } $NODE->{setting}{vars} = join '&', @vars; } elsif ($type eq 'nodegroup' or $type eq 'usergroup' or $type eq 'nod +eletgroup') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content' +); my @nodes = $main_content->look_down('_tag', 'li'); my $groupsth = $dbh->prepare("INSERT nodegroup(nodegroup_id, rank, + node_id, orderby) VALUES ($NODE->{node}{node_id}, ?, ?, ?)"); my $n=0; foreach my $node (@nodes) { my $a=$node->look_down('_tag', 'a'); my $href = $a->attr('href'); $href =~ m|(\d+)|; my $id = $1; print "$n: $id\n"; $groupsth->execute($n, $id, $n); $n++; } } elsif ($type eq 'data') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print +" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content' +); $NODE->{document}{doctext} = nowhitespace($main_content->as_text); } elsif ($type eq 'rawpage' or $type eq 'css') { my $doctext = get("http://perlmonks.org/?node=$node" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; #Table document: $NODE->{document}{doctext}=$doctext; #Table rawpage: $NODE->{rawpage}={}; $NODE->{rawpage}{datatype} = 'text/css' if ($type eq 'css'); } elsif ($type eq 'pmdevsuperdoc' or $type eq 'superdoc' or $type eq ' +document' or $type eq 'fullpage' or $type eq 'perlquestion' or $NODE->{document}{doctext} = $xml->{data}{field}{content}; # $NODE->{document}{lastedit} } elsif ($type eq 'user') { # Table: user # nick # passwd # realname # email $NODE->{user}{lasttime} = nowhitespace($xml->{data}{field}{lastt +ime}{content}); # karma $NODE->{user}{experience} = nowhitespace($xml->{data}{field}{exper +ience}{content}); # votesleft # votes # imgsrc # lastupdate # scratchpad # givevotes $NODE->{setting} = {}; $NODE->{document} = {}; $NODE->{document}{doctext} = $xml->{data}{field}{doctext}{content} + if (exists $xml->{data}{field}{doctext}{content}); } elsif ($type eq 'container') { #Table container: # context # parent_container $NODE->{container}{parent_container} = nowhitespace($xml->{data}{f +ield}{parent_container}{content}); $NODE->{container}{context} = nowhitespace($xml->{data}{field}{con +text}{content}); } else { die "Unknown type $type"; } foreach my $table (keys %$NODE) { my @keys; my @values; if (not exists $NODE->{$table}{$table.'_id'}) { $NODE->{$table}{$table.'_id'} = $NODE->{node}{node_id}; } print "$table:\n"; foreach my $key (keys %{$NODE->{$table}}) { print "\t $key: $NODE->{$table}{$key}\n"; push @keys, $key; push @values, $NODE->{$table}{$key}; } # We don't care about the success of this line. $dbh->do("delete from $table where ".$table."_id=".$NODE->{node}{n +ode_id}); my $sth = $dbh->prepare("INSERT INTO $table (".join(',', @keys).") + values (".join(',', ('?')x@values).')'); $sth->execute(@values) or die; } sub a_element_to_nodeid { my $a=shift; my $href = $a->attr('href'); $href =~ m|(\d+)|; return "$1"; } # Strips leading and trailing whitespace (but not whitespace in the mi +ddle. sub nowhitespace { local $_=shift; s/^\s+//; s/\s+$//; return $_; }


Warning: Unless otherwise stated, code is untested. Do not use without understanding. Code is posted in the hopes it is useful, but without warranty. All copyrights are relinquished into the public domain unless otherwise stated. I am not an angel. I am capable of error, and err on a fairly regular basis. If I made a mistake, please let me know (such as by replying to this node).