#!/usr/bin/perl use strict; use warnings; use HTML::TokeParser::Simple; my $html = join '', ; my $p = HTML::TokeParser::Simple->new(\$html); my ($title, $content, $keywords); while (my $t = $p->get_token){ last if $t->is_start_tag('body'); $title = $p->get_trimmed_text('/title') if $t->is_start_tag('title'); $content = $t->get_attr('content') if $t->is_start_tag('meta') and $t->get_attr('name') and $t->get_attr('name') eq 'Description'; $keywords = $t->get_attr('content') if $t->is_start_tag('meta') and $t->get_attr('name') and $t->get_attr('name') eq 'Keywords'; } print "title: $title\n"; print "content: $content\n"; my $tag; while (my $t = $p->get_token) { $tag = $t->get_tag if $t->is_start_tag(qr/^h[123456]|[biua]$/); if ($t->is_start_tag('img') and $t->get_attr('alt')){ my $attr = $t->get_attr('alt'); print "img attr: $attr\n"; $tag = ''; } elsif ($tag and $t->is_text){ my $txt = $t->as_is; print "$tag: $txt\n"; $tag = ''; } } __DATA__ henka's test page

header one

header two

header three

header four

header five
header siz

p tag paragraph

p tag containing underline and bold and a link

image alt text