use v6;

# Three independent ways of turning the first HTML table of a report into
# semicolon-separated lines: (1) a line-by-line regex scan, (2) a grammar that
# collects cells through embedded code blocks, (3) a grammar with an actions
# class that converts every *.html file in the current directory to one CSV.
#
# NOTE(review): this text was recovered from a paste in which everything that
# looked like an HTML tag had been stripped: the quoted tag literals ('' in the
# paste), the named subrule calls (<head>, <hrow>, <row>, ...) and the named
# captures ($<data>, $<field>, ...). They are restored below from the token
# names, the rule structure and the sample output. Confirm each restored
# literal against the original scripts and the real input markup.
#
# Each solution lives in its own bare block so that the variables they all
# declare ($file_name, @table_header, @table_data) do not collide.

# ---------------------------------------------------------------------------
# Solution 1: line-oriented scan. Assumes one tag per line in the input.
# ---------------------------------------------------------------------------
{
    my $file_name    = "Z2020_G_004_202202161115.html";
    my @table_header = ();
    my @table_data   = ();
    my @file_lines   = slurp( $file_name ).split: / \n /;

    for @file_lines {
        # Stop at the end of the table body.
        # NOTE(review): closing tag presumed to be </tbody> -- confirm.
        last if / '</tbody>' /;

        # Header cell: collect its text.
        if / '<th>' (.+?) '</th>' $ / {
            @table_header.push: $0;
            next;
        }

        # Row start: open a new (possibly empty) row.
        if / '<tr>' $ / {
            @table_data.push: [];
            next;
        }

        # Data cell: append to the row opened last.
        if / '<td>' (.*?) '</td>' $ / {
            @table_data[*-1].push: $0;
        }
    }

    print( "{@table_header.join: ';'}\n" );
    for @table_data {
        # The header's own row start leaves an empty row behind; skip those.
        print( "{.join: ';'}\n" ) if .elems;
    }
}

#### Sample output of solution 1 (row breaks were lost in the paste):
# Clasification;Descrip;Cod Program;Descrip Program;Clasification Program;Credits;Payment ; ;1360; ;Services;150.000,00;62400 0,00; ;20.504,57;20.504,57;Services;0,00;-20.504,57 0,00; ;59.179,70;59.179,70;Services;6.254,79;-59.179,70 0,00; ;16.518,85;16.518,85;Services;0,00;33.481,15
####

# ---------------------------------------------------------------------------
# Solution 2: grammar whose rules push cells into two arrays as they match.
# ---------------------------------------------------------------------------
# use v6;   # already declared at the top of the file
# use lib $*PROGRAM.IO.parent.add: 'lib';
# use Grammar::Debugger;
# use Grammar::Tracer;
{
    my @table_header;
    my @table_data;

    grammar html_table {
        # Skip leading lines, parse the header and the body, skip the rest.
        token TOP { <.rubbish>+? <head> <body> <.rubbish>+ }

        rule head {
            <.ws> <.theadl> <.ws> <.trl> <hrow>* <.ws> <.trr> <.ws> <.theadr>
        }
        rule hrow {
            <.ws> <.thl> <data> <.thr> { @table_header.push: ~$<data> }
        }
        rule body {
            <.ws> <.tbodyl>
            [<.ws> <.trl> { @table_data.push: [] } <brow>* <.trr> ]*
            <.ws> <.tbodyr>
        }
        rule brow {
            <.ws> <.tdl> <data> <.tdr> { @table_data[*-1].push: ~$<data> }
        }

        # NOTE(review): tag literals reconstructed from the token names
        # (..l = opening tag, ..r = closing tag) -- confirm.
        token theadl { '<thead>' }
        token theadr { '</thead>' }
        token tbodyl { '<tbody>' }
        token tbodyr { '</tbody>' }
        token trl    { '<tr>' }
        token trr    { '</tr>' }
        token thl    { '<th>' }
        token thr    { '</th>' }
        token tdl    { '<td>' }
        token tdr    { '</td>' }

        # Cell text: as little as possible up to the closing tag that follows.
        regex data    { .*? }
        # One whole line, including its newline.
        regex rubbish { \N* \n }
    }

    my $file_name    = "Z2020_G_004_202202161115.html";
    my $file_content = slurp( $file_name );
    my $p = html_table.parse( $file_content );

    if $p.defined {
        print( "{@table_header.join: ';'}\n" );
        for @table_data {
            print( "{.join: ';'}\n" ) if .elems;
        }
    }
}

#### Sample output of solution 2 (row breaks were lost in the paste):
# Clasification;Descrip;Cod Program;Descrip Program;Clasification Program;Credits;Payment ;;1360;;Services;150.000,00;62400;0,00;;20.504,57;20.504,57;Services;0,00;-20.504,57;0,00;;59.179,70;59.179,70;Services;6.254,79;-59.179,70;0,00;;16.518,85;16.518,85;Services;0,00;33.481,15
####

# ---------------------------------------------------------------------------
# Solution 3: grammar + actions class; converts every *.html file in the
# current directory and writes all rows to a single CSV file.
# ---------------------------------------------------------------------------
# use v6;   # already declared at the top of the file
{
    grammar HTML_table {
        # NOTE(review): the four tag literals and the positions of <header>
        # and <row> are reconstructed -- confirm against the original.
        token TOP {
            <.rubbish>+?
            <.ws> '<thead>' <header>
            <.ws> '</thead>'
            <.ws> '<tbody>' <row>+
            <.ws> '</tbody>'
            <.rubbish>+
        }
        rule  header  { '<tr>' ~ '</tr>' <field>* }
        regex field   { <.ws> '<th>' ~ '</th>' (.*?) }
        rule  row     { '<tr>' ~ '</tr>' <data>* }
        regex data    { <.ws> '<td>' ~ '</td>' (.*?) }
        regex rubbish { \N* \n }
    }

    # Builds plain lists of trimmed cell strings from the parse tree.
    class HTML_table_actions {
        method header($/) { make $<field>>>.made; }
        method field($/) {
            # make ~$/[0];    # verbatim
            make $/[0].defined ?? $/[0].Str.trim !! '';
        }
        method row($/) { make $<data>>>.made; }
        method data($/) {
            # make ~$/[0];    # verbatim
            make $/[0].defined ?? $/[0].Str.trim !! '';
        }
    }

    my $parser;
    my @file_list = dir(test => / :i '.' html $ /);

    # The CSV is named after the first 16 characters of the first input file.
    my $file_name   = @file_list[0].substr: 0, 16;
    my $output_file = open $file_name ~ ".csv", :w;
    my $file_content;

    for @file_list {
        $file_content = slurp($_, enc => 'iso-8859-1');
        say "Parsing: $_";
        $parser = HTML_table.parse($file_content, actions => HTML_table_actions.new);
        unless $parser {
            say "Unable to parse: $_";
            last;
        };

        # The header line is written only once, from the first parsed file.
        once { $output_file.print("{ $parser<header>.made.join: ';'; }\n") };
        $output_file.print("{ .join: ';'; }\n") for $parser<row>>>.made;
    }

    $output_file.close;
}
####

europeanFormat

,

\.

\.

,

Clasification Descrip Cod Program Descrip Program Clasification Program Credits Payment
1360 Services 150.000,00 62400
0,00 20.504,57 20.504,57 Services 0,00 -20.504,57
0,00 59.179,70 59.179,70 Services 6.254,79 -59.179,70
0,00 16.518,85 16.518,85 Services 0,00 33.481,15
Total 89.478.403,32 32.751.626,25 122.230.029,57 102.342.399,26 89.476.722,29 84.657.323,46 4.819.398,83 32.753.307,28