#!/usr/bin/perl use warnings; use strict; use Data::Dumper; use HTML::Parser (); my $html; { local $/; $html = } my ($h2, $link_item, $img_link, @db); my $p = HTML::Parser->new( api_version => 3, start_h => [\&find_right, 'self, tag, attr'], ) or die "can't create parser: $!"; $p->unbroken_text(1); $p->parse($html); { $, = "\n"; print @db; } sub find_right{ my ($self, $tag, $attr) = @_; if ( $tag eq 'div' and exists $attr->{class} and $attr->{class} =~ /right/ ) { $self->handler(start => \&right_start, 'self, tag, attr'); $self->handler(end => \&right_end, 'self, tag, attr'); } } sub right_start{ my ($self, $tag, $attr) = @_; if ( $tag eq 'p' and exists $attr->{class} and $attr->{class} eq 'header' ) { $self->handler(text => \&header_text, 'self, dtext'); } elsif ( $tag eq 'p' and exists $attr->{class} and $attr->{class} eq 'link_item' ) { $self->handler(start => \&link_item_href, 'self, tag, attr'); $self->handler(end => \&link_item_end, 'self, tag'); } elsif ($tag eq 'h2'){ $self->handler(start => \&h2_href, 'self, attr'); } elsif ($tag eq 'img'){ push @db, 'image:' . $attr->{src}; } elsif ($tag eq 'a'){ $img_link = 'image-link:' . $attr->{href}; $self->handler(start => \&img_link, 'self, tag, attr'); } } sub right_end{ my ($self, $tag) = @_; $self->eof if $tag eq '/div'; } sub img_link{ my ($self, $tag, $attr) = @_; $img_link .= ':' . $attr->{src}; push @db, $img_link; $img_link = ''; $self->handler(start => \&right_start, 'self, tag, attr'); } sub link_item_end{ my ($self, $tag) = @_; if ($tag eq '/p'){ $self->handler(start => \&right_start); $self->handler(end => \&right_end); push @db, $link_item; $link_item = ''; } } sub span_text{ my ($txt) = @_; for ($txt){ s/^\s+//; s/\s+$//; } $link_item .= ':' . $txt if $txt; } sub link_item_href{ my ($self, $tag, $attr) = @_; if ($tag eq 'a'){ $link_item = 'link-item:' . $attr->{href}; $self->handler(text => \&span_text, 'dtext'); } } sub h2_href{ my ($self, $attr) = @_; $h2 = 'link-h2:' . $attr->{href}; $self->handler(text => \&h2_text, 'self, dtext'); } sub h2_text{ my ($self, $txt) = @_; $h2 .= ':' . $txt; push @db, $h2; $h2 = ''; $self->handler(text => ''); $self->handler(start => \&right_start, 'self, tag, attr'); } sub header_text{ my ($self, $txt) = @_; push @db, "header:$txt"; $self->handler(start => \&right_start, 'self, tag, attr'); $self->handler(text => ''); } __DATA__ page with right column

see also

see something else

link to more