in reply to HTML::TableExtract problem handling merged cells across rows
You have an off-by-one error in your data, so your expectations are wrong.
This is actually row 5, column 4, not column 1: <td class=3Dxl32 x:num=3D"7258831">R5Col1</td>
Thus this is column 7, so it's dropped: <td class=3Dxl29>TSH5555 this data gets dropped but should not</td>
So fix your data, work around it, or do something else entirely.
Here is how I found out:
#!/usr/bin/perl --
# Shows how HTML::TableExtract places cells in its row/column grid when a
# table mixes colspan and rowspan, and what happens when a rowspan value is
# malformed ("rowspan=3D2" instead of "rowspan=2").
use strict;
use warnings;
use Data::Dump qw/ dd /;
use HTML::TableExtract;

Main( @ARGV );
exit( 0 );

# Main: parse the sample document, then print the extracted table twice --
# once through tables_dump() and once as raw row data through dd().
# Command-line arguments are accepted but not used.
sub Main {
    my $te = HTML::TableExtract->new();
    $te->parse( SkewHtml() );
    $te->eof;
    $te->tables_dump( 'show_content', ',' );
    dd( $te->rows );
} ## end of Main

# SkewHtml: return the sample HTML document as a single string.
#
# Each <td>...</td> is deliberately kept on ONE line: the second substitution
# below has no /s modifier, so "." cannot cross a newline and the
# "<br> ... </td>" tail of a cell is only matched when it sits on one line.
#
# The <pre> section inside the document is reference text only (the layout
# expected for the unmodified table); it is outside the <table> element.
sub SkewHtml {
    my $html = q{
        <html><head><title>skew test 2</title></head>
        <body>
        <table border=1>
        <tr>
            <td>head0</td>
            <td>head1</td>
            <td>head2</td>
            <td>head3</td>
        </tr>
        <tr>
            <td colspan=4> <b>0/0</b> colspan=4 <br> 0/0 undef undef undef <br> "0/0" "0/0" "0/0" "0/0" </td>
        </tr>
        <tr>
            <td rowspan=2> <b>1/0</b> rowspan=2 <br> 1/0 1/1 1/2 1/3 </td>
            <td> 1/1 </td>
            <td> 1/2 </td>
            <td> 1/3 </td>
        </tr>
        <tr>
            <td colspan=2 rowspan=3> <b>2/1</b> colspan=2 rowspan=3 <br> undef 2/1 undef 2/3 <br> "1/0" "2/1" "2/1" "2/3" </td>
            <td> 2/3 </td>
        </tr>
        <tr>
            <td> <b>3/0</b> <br> 3/0 undef undef 3/3 <br> "3/0" "2/1" "2/1" "3/3" </td>
            <td> 3/3 </td>
        </tr>
        <tr>
            <td> <b>4/0</b> <br> 4/0 undef undef 4/3 <br> "4/0" "2/1" "2/1" "4/3" </td>
            <td> 4/3 </td>
        </tr>
        <tr>
            <td colspan=2> <b>5/0</b> colspan=2 <br> 5/0 undef 5/2 5/3 <br> "5/0" "5/0" "5/2" "5/3" </td>
            <td> 5/2 </td>
            <td> 5/3 </td>
        </tr>
        </table>
        <pre>
        TABLE(0, 0):
        head0,head1,head2,head3
        0/0 colspan=4 ,,,
        1/0 rowspan=2 , 1/1 , 1/2 , 1/3
        , 2/1 colspan=2 rowspan=3 ,, 2/3
        3/0 ,,, 3/3
        4/0 ,,, 4/3
        5/0 colspan=2 ,, 5/2 , 5/3
        (
        ["head0", "head1", "head2", "head3"],
        [" 0/0 colspan=4 ", undef, undef, undef],
        [" 1/0 rowspan=2 ", " 1/1 ", " 1/2 ", " 1/3 "],
        [undef, " 2/1 colspan=2 rowspan=3 ", undef, " 2/3 "],
        [" 3/0 ", undef, undef, " 3/3 "],
        [" 4/0 ", undef, undef, " 4/3 "],
        [" 5/0 colspan=2 ", undef, " 5/2 ", " 5/3 "],
        )
        </pre>
        </body>
        </html>
    };

    # Strip leading and trailing whitespace from every line of the document.
    $html =~ s{^\s+|\s+$}{}gm;

    # Wrap everything from the first <br> of a cell up to its </td> in an
    # HTML comment, so only the short label in front of it remains as text.
    $html =~ s{(<br>.+?)</td>}{<!-- $1 --> </td>}gm;

    # Deliberately corrupt every "rowspan=3" (attribute and label text alike)
    # into "rowspan=3D2". The recorded output below still shows undef in
    # columns 1 and 2 of the "4/0" row, i.e. the span is treated as 3 rows.
    # NOTE(review): presumably this mimics "=3D" (quoted-printable "=")
    # residue in the original poster's data -- confirm against that data.
    $html =~ s{rowspan=3}{rowspan=3D2}g;

    return $html;
} ## end of SkewHtml

# Output as posted with the script; whitespace inside the quoted strings may
# have been collapsed when the post was captured.
__END__
TABLE(0, 0):
head0,head1,head2,head3
 0/0 colspan=4 ,,,
 1/0 rowspan=2 , 1/1 , 1/2 , 1/3
, 2/1 colspan=2 rowspan=3D2 ,, 2/3
 3/0 ,,, 3/3
 4/0 ,,, 4/3
 5/0 colspan=2 ,, 5/2 , 5/3
(
  ["head0", "head1", "head2", "head3"],
  [" 0/0 colspan=4 ", undef, undef, undef],
  [" 1/0 rowspan=2 ", " 1/1 ", " 1/2 ", " 1/3 "],
  [undef, " 2/1 colspan=2 rowspan=3D2 ", undef, " 2/3 "],
  [" 3/0 ", undef, undef, " 3/3 "],
  [" 4/0 ", undef, undef, " 4/3 "],
  [" 5/0 colspan=2 ", undef, " 5/2 ", " 5/3 "],
)
| head0 | head1 | head2 | head3 |
| 0/0 colspan=4 0/0 undef undef undef "0/0" "0/0" "0/0" "0/0" |
|||
| 1/0 rowspan=2 1/0 1/1 1/2 1/3 |
1/1 | 1/2 | 1/3 |
| 2/1 colspan=2 rowspan=3 undef 2/1 undef 2/3 "1/0" "2/1" "2/1" "2/3" |
2/3 | ||
| 3/0 3/0 undef undef 3/3 "3/0" "2/1" "2/1" "3/3" |
3/3 | ||
| 4/0 4/0 undef undef 4/3 "4/0" "2/1" "2/1" "4/3" |
4/3 | ||
| 5/0 colspan=2 5/0 undef 5/2 5/3 "5/0" "5/0" "5/2" "5/3" |
5/2 | 5/3 | |
TABLE(0, 0): head0,head1,head2,head3 0/0 colspan=4 ,,, 1/0 rowspan=2 , 1/1 , 1/2 , 1/3 , 2/1 colspan=2 rowspan=3 ,, 2/3 3/0 ,,, 3/3 4/0 ,,, 4/3 5/0 colspan=2 ,, 5/2 , 5/3 ( ["head0", "head1", "head2", "head3"], [" 0/0 colspan=4 ", undef, undef, undef], [" 1/0 rowspan=2 ", " 1/1 ", " 1/2 ", " 1/3 "], [undef, " 2/1 colspan=2 rowspan=3 ", undef, " 2/3 "], [" 3/0 ", undef, undef, " 3/3 "], [" 4/0 ", undef, undef, " 4/3 "], [" 5/0 colspan=2 ", undef, " 5/2 ", " 5/3 "], )
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: HTML::TableExtract problem handling merged cells across rows (OBO rowspan colspan)
by jtravillian (Initiate) on Feb 27, 2015 at 15:30 UTC | |
by poj (Abbot) on Feb 27, 2015 at 16:46 UTC | |
by Anonymous Monk on Feb 28, 2015 at 00:59 UTC |