#!/usr/bin/perl
use warnings;
use strict;
use utf8;
use HTML::Strip;
use Text::CSV_XS qw(csv);

# Reads test.html line by line, and for every line inside the listing section
# strips the HTML markup, extracts the "as of" date, filer, filing type and
# agent, and writes them to out.csv as one tab-separated record per line.

my $csv = Text::CSV_XS->new({
    sep_char  => "\t",
    eol       => "\n",   # without eol, print() writes records back-to-back
    binary    => 1,      # allow non-ASCII characters in fields
    auto_diag => 1,      # report CSV errors instead of failing silently
});
my $hs = HTML::Strip->new();

my $file = 'test.html';
my $out  = 'out.csv';

# The source is under "use utf8", so the arrow in the filter below is a
# character, not a byte sequence; the input must be decoded for it to match.
# NOTE(review): assumes test.html is UTF-8 encoded -- confirm.
open my $fh, '<:encoding(UTF-8)', $file or die "Failed to open $file!: $!\n";
open my $io, '>:encoding(UTF-8)', $out  or die "Failed to open $out!: $!\n";

my $in_listing = 0;
while (my $line = <$fh>) {
    chomp $line;

    # The closing </Table> tag switches output on.
    if ($line =~ m{</Table>}) {
        $in_listing = 1;
    }
    # NOTE(review): this pattern requires a literal backslash before </A>.
    # It is kept exactly as it was because it is unclear whether a plain
    # </A> was intended -- confirm against test.html before changing it.
    elsif ($line =~ m{\\</A>}) {
        $in_listing = 0;
    }

    next if $line =~ /→/;
    next unless $in_listing;

    # Pass the fields as a list so each becomes its own column; a single
    # tab-joined string would be quoted as one field by Text::CSV_XS.
    my @fields = parse_fields($line);
    next unless @fields;    # line did not look like a listing row

    $csv->print($io, \@fields)
        or die "Failed to write to $out!: " . $csv->error_diag() . "\n";
}

close($fh);
close($io) or die "Failed to close $out!: $!\n";

# parse_fields($html_line)
#
# Strips markup from one line and returns the list
# (as-of date, filer, filing type, agent), each trimmed of surrounding
# whitespace. Returns an empty list when the line does not match the
# expected row layout.
sub parse_fields {
    my ($html) = @_;

    my $text = $hs->parse($html);
    $hs->eof;    # reset parser state so lines are treated independently
    return unless defined $text;

    $text =~ s/\s+$//;

    # A row that ends in the docs:size column (e.g. "...K" or "...M") has no
    # agent; add a placeholder so the main pattern below still matches.
    if ($text =~ /^.+\d+:.+?[MK]$/) {
        $text .= "\tNone listed";
    }

    my ($as_of, $filer, $filing, undef, undef, $agent) = $text =~ m{
        (\d+/\d+/\d+)          # as-of date
        \s+  (.+)              # filer
        \s+? (\w.+)            # filing type
        \s+  (\d+/\d+/\d+)     # for/on date (not written out)
        .+?  (\d+:\d.+?)       # docs:size   (not written out)
        \s+  (.+)              # agent
    }x;
    return unless defined $as_of;

    my @fields = ($as_of, $filer, $filing, $agent);
    s/^\s+|\s+$//g for @fields;
    return @fields;
}

# cleanThis($html_line)
#
# Returns the fields extracted by parse_fields() joined with tabs, or an
# empty string when the line does not match. Kept for callers that want the
# record as a single string; the main loop uses parse_fields() directly.
sub cleanThis {
    my ($string) = @_;
    return join("\t", parse_fields($string));
}

__END__

#### Sample data that followed the script in the original paste (presumably output of the earlier version, where every record is one quoted field and no line endings are written -- unconfirmed):
"1/27/16 Advanced Series Trust 497K Prudential Moneymar..Inc""1/27/16 Advisors Series Trust 497K US Bancorp Fund Svcs LLC""1/27/16 Advisors Series Trust 497K US Bancorp Fund Svcs LLC""1/27/16 Advisors Series Trust 497K US Bancorp Fund Svcs LLC""1/27/16 Ark ETF Trust 497K Vintage/FA""1/27/16 Ark ETF Trust 497K Vintage/FA""1/27/16 Delaware Group Cash Reserve 485BPOS DG3/FA""1/27/16 Federated Equity Income Fund Inc N-CSR Federated Admin..Svcs/FA""1/27/16 Federated Inv Series Funds Inc N-CSR Federated Admin..Svcs/FA""1/27/16 Fidelity Advisor Series I N-CSR Publishing Data...Inc/FA""1/27/16 Fidelity Commonwealth Trust N-CSR Publishing Data...Inc/FA""1/27/16 Fidelity Court Street Trust N-CSR Publishing Data...Inc/FA""1/27/16 Fidelity Court Street Trust II N-CSR Publishing Data...Inc/FA""1/27/16 Fidelity Financial Trust N-CSR Publishing Data...Inc/FA""1/27/16 Fidelity MT Vernon Street Trust N-CSR None listed""1/27/16 Fidelity Phillips Street Trust 
N-CSR Fidelity Aberdeen St..Tr""1/27/16 Fidelity Rutland Square Trust II 497K Fidelity Aberdeen St..Tr""1/27/16 Fidelity Rutland Square Trust II 497K Fidelity MT Vernon S..Tr""1/27/16 Fidelity Salem Street Trust N-CSR Publishing Data...Inc/FA""1/27/16 John Hancock ETF Trust 497K Data Communique Inc./FA"