#!/usr/bin/perl
#
# Reads test.html line by line and, for every line seen while the capture
# flag is set, strips the HTML markup via cleanThis() and writes the
# resulting fields as one tab-separated record to out.csv.
use warnings;
use strict;
use utf8;
use HTML::Strip;
use Text::CSV_XS qw(csv);

# binary => 1 lets non-ASCII bytes through; eol => "\n" terminates every
# record (without it print() emits no line ending at all).
my $csv = Text::CSV_XS->new({ sep_char => "\t", binary => 1, eol => "\n" })
    or die "Cannot create CSV writer: " . Text::CSV_XS->error_diag() . "\n";
my $hs   = HTML::Strip->new();
my $file = 'test.html';
my $out  = 'out.csv';

open my $fh, "<", $file or die "Failed to open $file!: $!\n";
open my $io, ">", $out  or die "Failed to open $out!: $!\n";

# $flag is switched on by a line containing </Table> and switched off by the
# second pattern below; only lines read while it is set are converted.
my $flag = 0;

LINE:
while (my $line = <$fh>) {
    chomp $line;

    # The pattern used to end in an escaped line break, which can never match
    # once the line has been chomped, so the flag was never set.
    if ($line =~ m{</Table>}) {
        $flag = 1;
    }
    # NOTE(review): as written this requires a literal backslash in front of
    # </A>; confirm that is intended and not an escaping artefact.
    elsif ($line =~ /\\<\/A\>/) {
        $flag = 0;
    }

    # NOTE(review): under "use utf8" this literal is a character, while the
    # input handle has no decoding layer, so this test presumably only fires
    # if the input is decoded as UTF-8 - confirm the file's encoding.
    next LINE if $line =~ /→/;
    next LINE unless $flag;

    # cleanThis() returns one tab-joined string; split it back into columns so
    # the CSV writer sees separate fields instead of quoting the whole record
    # as a single field that happens to contain the separator.
    my @fields = split /\t/, cleanThis($line), -1;

    # Lines that did not parse as a record come back with nothing but empty
    # fields; do not write those out as blank rows.
    next LINE unless grep { length } @fields;

    $csv->print($io, \@fields)
        or die "Failed to write to $out!: " . $csv->error_diag() . "\n";
}

close($fh);
# Buffered write errors only surface when the handle is closed.
close($io) or die "Failed to close $out!: $!\n";
# cleanThis($html_line)
#
# Strips the markup from one line of HTML (using the file-level HTML::Strip
# object $hs) and pulls four fields out of the remaining text: the as-of
# date, the filer, the filing and the agent.
#
# Returns the four fields, whitespace-trimmed, joined with tabs. When the
# text does not look like a record, all four fields are empty (the string
# consists of three tabs), and no warnings are raised.
sub cleanThis {
    my $string = shift;

    my $clean_text = $hs->parse($string);
    $clean_text = '' unless defined $clean_text;

    # Text that ends in the docs:size token (digits, a colon, then something
    # ending in M or K) has no agent after it, so add a placeholder for the
    # record pattern below to pick up. The class is [MK]; the previous
    # [M|K] also accepted a literal '|'.
    if ($clean_text =~ /^.+(\d+\:.+?[MK])$/) {
        $clean_text = "$clean_text\tNone listed";
    }

    # Same matching logic as the original positional pattern; the "for/on"
    # date and the docs:size token are matched but not captured because they
    # are not part of the output.
    my $record_re = qr{
        (?<asOf>   \d+/\d+/\d+ ) \s+
        (?<filer>  .+          ) \s+?
        (?<filing> \w.+        ) \s+
        (?:        \d+/\d+/\d+ )        # for/on date
        .+?
        (?:        \d+:\d.+?   ) \s+    # docs:size
        (?<agent>  .+          )
    }x;

    # Default to four empty fields so a non-matching line never feeds undef
    # into the trim loop or join().
    my @formated = ('') x 4;
    if ($clean_text =~ $record_re) {
        # Copy the captures out before running any further regex.
        @formated = @+{qw(asOf filer filing agent)};
    }

    for my $field (@formated) {
        $field =~ s/^\s+|\s+$//g;
    }

    return join("\t", @formated);
}