#!/usr/bin/perl
# scrape_wiki_anatomy.pl
#
# Extracts all the anatomy pages from a MediaWiki XML dump and re-emits
# them as a smaller, well-formed MediaWiki export file on STDOUT.
# A page counts as an "anatomy" page when its wikitext contains the
# "{{Infobox Anatomy" template.
#
# Usage:  scrape_wiki_anatomy.pl pages-dump.xml > anatomy-dump.xml
# Progress dots and start/end timestamps go to STDERR.
#
# NOTE(review): this file was recovered from a copy whose XML markup and
# heredoc had been destroyed by an HTML-tag filter.  The export skeleton
# below (header, <siteinfo>, per-page/per-revision elements) is
# reconstructed from the standard MediaWiki export-0.4 format; the
# namespace names match the residue left in the damaged copy.  Confirm
# the exact output against version control or a sample dump.

use strict;
use warnings;

use MediaWiki::DumpFile::Compat;    # provides the Parse::MediaWikiDump API

my $now_string = localtime;
print STDERR "start time: $now_string\n";

my $file = shift @ARGV
    or die "must specify a MediaWiki dump of the current pages";

my $pmwd = Parse::MediaWikiDump->new;
my $dump = $pmwd->revisions($file);

# The dump (and our output) are UTF-8; avoid "wide character" warnings.
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

# Emit the export header and site information.  The generator version and
# namespace keys mirror the English Wikipedia dump this script was
# written against (MediaWiki 1.16wmf4).
print <<'EOF';
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.4/ http://www.mediawiki.org/xml/export-0.4.xsd" version="0.4" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <base>http://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.16wmf4</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="first-letter">Wikipedia</namespace>
      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
      <namespace key="6" case="first-letter">File</namespace>
      <namespace key="7" case="first-letter">File talk</namespace>
      <namespace key="8" case="first-letter">MediaWiki</namespace>
      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
      <namespace key="10" case="first-letter">Template</namespace>
      <namespace key="11" case="first-letter">Template talk</namespace>
      <namespace key="12" case="first-letter">Help</namespace>
      <namespace key="13" case="first-letter">Help talk</namespace>
      <namespace key="14" case="first-letter">Category</namespace>
      <namespace key="15" case="first-letter">Category talk</namespace>
      <namespace key="100" case="first-letter">Portal</namespace>
      <namespace key="101" case="first-letter">Portal talk</namespace>
      <namespace key="108" case="first-letter">Book</namespace>
      <namespace key="109" case="first-letter">Book talk</namespace>
    </namespaces>
  </siteinfo>
EOF

# 'first-letter' is the only currently known value, but there could be
# more in the future.
if ($dump->case ne 'first-letter') {
    die "unable to handle any case setting besides 'first-letter'";
}

my $i = 0;
while (my $page = $dump->next) {
    # $page->text returns a REFERENCE to the revision's wikitext.
    my $text = $page->text;

    next unless $$text =~ m/\{\{Infobox Anatomy/;

    # Light clean-up of the wikitext before re-emitting it.
    $$text =~ s/&ndash/-/g;
    # NOTE(review): the original performed a second substitution here
    # whose pattern began "s/\/\<..."; its exact contents were destroyed
    # by the same tag stripping that mangled this file.  Restore it from
    # version control if the cleaned output matters.

    print "  <page>\n";
    print "    <title>", $page->title, "</title>\n";
    print "    <id>", $page->id, "</id>\n";
    print "    <revision>\n";
    print "      <id>", $page->revision_id, "</id>\n";
    print "      <timestamp>", $page->timestamp, "</timestamp>\n";
    print "      <contributor>\n";
    # A revision carries either a username+id (registered editor) or a
    # bare IP (anonymous editor); emit whichever fields are present.
    if ($page->username) { print "        <username>", $page->username, "</username>\n"; }
    if ($page->userid)   { print "        <id>", $page->userid, "</id>\n"; }
    if ($page->userip)   { print "        <ip>", $page->userip, "</ip>\n"; }
    print "      </contributor>\n";
    if ($page->minor) { print "      <minor />\n"; }
    print "      <text xml:space=\"preserve\">", $$text, "</text>\n";
    print "    </revision>\n";
    print "  </page>\n";

    $i++;
    print STDERR '.' if ($i % 100) == 0;    # progress tick every 100 pages
}

print "</mediawiki>\n";

print STDERR "\n";
$now_string = localtime;
print STDERR "end time: $now_string\n";
print STDERR "$i records dumped\n";