Also, the title and meta tags appear in the head section of the HTML, and all the others appear later in the body section. In these cases I use two loops.
#!/usr/bin/perl
# Demonstrates extracting selected pieces of an HTML document with
# HTML::TokeParser::Simple, using two loops over a single token stream:
#
#   1. head section - the <title> text and the content of the
#      "description" and "keywords" <meta> tags;
#   2. body section - the first text token following each
#      h1-h6 / b / i / u / a start tag, and every non-empty img alt text.
#
# The sample document is read from the __DATA__ section below.
use strict;
use warnings;

use HTML::TokeParser::Simple;

# Slurp the whole sample document in one read.
my $html = do { local $/; <DATA> };
my $p    = HTML::TokeParser::Simple->new(\$html);

my ($title, $content, $keywords);

# First loop: consume tokens up to the opening <body> tag.
while (my $t = $p->get_token) {
    last if $t->is_start_tag('body');

    if ($t->is_start_tag('title')) {
        $title = $p->get_trimmed_text('/title');
        next;
    }

    next unless $t->is_start_tag('meta');

    my $name = $t->get_attr('name');
    next unless defined $name;

    # Compare case-insensitively: the sample uses both "Description" and
    # "keywords", and a case-sensitive test silently misses one of them.
    $name = lc $name;
    if ($name eq 'description') {
        $content = $t->get_attr('content');
    }
    elsif ($name eq 'keywords') {
        # Collected for completeness; this demo does not print it.
        $keywords = $t->get_attr('content');
    }
}

# Guard against documents that lack a title or description so that
# "use warnings" does not complain about undefined values.
print "title: ",   (defined $title   ? $title   : ''), "\n";
print "content: ", (defined $content ? $content : ''), "\n";

# The alternation must be grouped so that both anchors apply to both
# branches; written as /^h[123456]|[biua]$/ the pattern itself also
# matches any tag name that merely ends in b, i, u or a (li, sub, meta, ...).
my $wanted_tag = qr/^(?:h[1-6]|[biua])$/;

# Second loop: the parser continues from where the first loop stopped.
# $tag remembers the most recent wanted start tag until its text is seen.
my $tag = '';
while (my $t = $p->get_token) {
    $tag = $t->get_tag if $t->is_start_tag($wanted_tag);

    # Test alt for defined/non-empty rather than truth so alt="0" is kept.
    my $alt = $t->is_start_tag('img') ? $t->get_attr('alt') : undef;

    if (defined $alt and length $alt) {
        print "img attr: $alt\n";
        $tag = '';
    }
    elsif ($tag and $t->is_text) {
        my $txt = $t->as_is;
        print "$tag: $txt\n";
        $tag = '';
    }
}

__DATA__
<html>
<head>
<title>henka's test page</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="Description" content="the glories of HTML::TokeParser::Simple" />
<meta name="keywords" content="one two three four five six seven eight nine ten" />
<meta name="robots" content="noindex" />
<link rel="stylesheet" type="text/css" href="cwi.css" />
</head>
<body>
<h1>header one</h1>
<h2>header two</h2>
<h3>header three</h3>
<h4>header four</h4>
<h5>header five</h5>
<h6>header siz</h6>
<p>p tag paragraph</p>
<p>p tag containing <u>underline</u> and <b>bold</b> and a <a href="link.html">link</a></p>
<img alt="image alt text" src="my.gif">
</body>
</html>
---------- Capture Output ----------
> "c:\perl\bin\perl.exe" monk06.pl
title: henka's test page
content: the glories of HTML::TokeParser::Simple
h1: header one
h2: header two
h3: header three
h4: header four
h5: header five
h6: header siz
u: underline
b: bold
a: link
img attr: image alt text
> Terminated with exit code 0.
In reply to Re^3: More efficient use of HTML::TokeParser::Simple
by wfsp
in thread More efficient use of HTML::TokeParser::Simple
by henka
| For: | Use: |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |