#!/usr/bin/perl
############################################################
#scrape beppegrillo.it
use utf8;
use strict;
use warnings;
use File::Find;
use HTML::Tree;
use LWP::Simple;
############################################################
#set the working directory where the html files are
############################################################
# Corpus directory: taken from the first command-line argument when one is
# given, otherwise the original default location is used.
my $dir = @ARGV ? $ARGV[0] : "CORPUS/2005/01/";
my $url;    # NOTE(review): not used by any code visible here - confirm before removing
#############################################################
#stop early with a clear message when the directory is missing; File::Find
#would otherwise only emit a warning and process nothing
############################################################
-d $dir or die "Corpus directory '$dir' does not exist or is not a directory\n";
#############################################################
#walk the directory tree and call the edit subroutine on every entry found
############################################################
find(\&edit, $dir);
#
############################################################
#define what the edit subroutine does
############################################################
sub edit {
############################################################
# File::Find "wanted" callback: parse one HTML file and print the markup
# of its main post container.
#
# Input:   no explicit arguments; the entry to examine is read from $_,
#          which File::Find sets to the current file name.
# Output:  prints the HTML of <div id="post_principale">, followed by a
#          newline, to the currently selected output handle.
# Dies:    when the file cannot be parsed or the div is not present.
############################################################
    my $file = $_;
############################################################
#only regular files whose name ends in .htm or .html are processed; the dot
#is escaped and the pattern anchored so names that merely contain "htm"
#(e.g. "xhtml_notes.txt") are skipped
############################################################
    return unless -f $file && $file =~ /\.html?\z/;
############################################################
#build the tree or die; parse_file opens and reads the file itself, so no
#separate filehandle is needed
############################################################
    my $tree = HTML::Tree->new();
    unless ($tree->parse_file($file)) {
        my $error = $!;    # save errno before cleanup can overwrite it
        $tree->delete;
        die "Cannot parse '$file': $error";
    }
############################################################
#get the main div, the one that contains the post and print it as html
############################################################
    my $getmaindiv = $tree->look_down(_tag => "div", id => "post_principale");
    unless ($getmaindiv) {
        # $! carries no useful information here, so say what is actually wrong
        $tree->delete;
        die "No <div id=\"post_principale\"> found in '$file'";
    }
    print $getmaindiv->as_HTML, "\n";
############################################################
#release the tree explicitly: HTML::Element nodes reference each other, so
#without delete() the memory may not be reclaimed between files
############################################################
    $tree->delete;
    return;
}