use v6;
# Grammar used to pick a single table out of an HTML document.
#
# Rules defined here:
#   TOP     - skips leading lines via <.rubbish>, matches the table body,
#             then consumes the remaining lines via <.rubbish>.
#   header  - the heading line of the table.
#   field   - one heading cell; its text is positional capture 0, (.*?).
#   row     - one data line of the table.
#   data    - one data cell; its text is positional capture 0, (.*?).
#   rubbish - exactly one line: any non-newline characters, then a newline.
#
# NOTE(review): the quoted literals below are empty or unbalanced, and the
# quantifiers `*` / `+` have nothing in front of them. This looks like the
# markup tags and the subrule calls were stripped from this copy of the
# file (presumably by an HTML sanitizer) - TODO confirm against the
# authoritative source and restore them before relying on this grammar.
grammar HTML_table
{
token TOP {
<.rubbish>+?
<.ws> ''
'
<.ws> '
'
+
<.ws> '
'
<.rubbish>+
}
rule header {
> '' ~ '
' *
}
regex field {
<.ws> '' ~ ' | ' (.*?)
}
rule row {
> '' ~ '
' *
}
regex data {
<.ws> '' ~ ' | ' (.*?)
}
regex rubbish {
\N* \n
}
}
# Action class for the HTML_table grammar.
#
# Each method attaches a plain value to its match with `make`:
#   field / data - the trimmed text of the cell (positional capture 0),
#                  or the empty string when nothing was captured;
#   header / row - the list of values made by the contained cells.
class HTML_table_actions
{
    # A header is the list of values made by its <field> sub-matches.
    # The named capture is required: a bare `$>>.made` would hyper over an
    # anonymous variable, not over the matched fields.
    method header($/) {
        make $<field>>>.made;
    }

    # One heading cell: whitespace-trimmed text of capture 0.
    method field($/) {
        # make ~$/[0]; # verbatim
        make $/[0].defined ?? $/[0].Str.trim !! '';
    }

    # A row is the list of values made by its <data> sub-matches.
    method row($/) {
        make $<data>>>.made;
    }

    # One data cell: whitespace-trimmed text of capture 0.
    method data($/) {
        # make ~$/[0]; # verbatim
        make $/[0].defined ?? $/[0].Str.trim !! '';
    }
}
# Driver: parse every *.html file in the current directory with HTML_table
# and write the results as semicolon-separated lines into one .csv file.
my @file_list = dir(test => / :i '.' html $ /);

# Without input there is nothing to parse and no name to derive the
# output file from, so stop early instead of failing on an undefined value.
unless @file_list {
    note "No .html files found in the current directory";
    exit 1;
}

# The output name is the first 16 characters of the first input file name.
my $file_name   = @file_list[0].Str.substr(0, 16);
my $output_file = open $file_name ~ ".csv", :w;

for @file_list -> $html_file {
    my $file_content = slurp($html_file, enc => 'iso-8859-1');
    say "Parsing: $html_file";
    my $parser = HTML_table.parse($file_content, actions => HTML_table_actions.new);
    unless $parser {
        say "Unable to parse: $html_file";
        last;
    }
    # The made values live on the <header> and <row> sub-matches; the
    # top-level match has no action method, so its own .made is undefined.
    # NOTE(review): assumes TOP captures <header> once and <row> repeatedly
    # - confirm against the grammar.
    # The heading line is written only once, from the first parsed file.
    once { $output_file.print("{ $parser<header>.made.join: ';' }\n") };
    $output_file.print("{ .join: ';' }\n") for $parser<row>>>.made;
}
$output_file.close;