#!/usr/bin/perl
#
# Parse test.html with HTML::Parser and build a nested Perl data structure
# that mirrors the element tree.
#
# Node layout:
#   * every element is a hash ref:
#         { tag => NAME, content => [ ...children... ], ATTR => VALUE, ... }
#     Attributes are stored flat next to the structural keys.  The keys
#     'tag' and 'content' always hold the structural values, so an HTML
#     attribute with one of those names is shadowed.
#   * every text run is a plain string inside its parent's 'content' list.
#   * $htmltree is a one-element array holding the synthetic 'document' root.
use strict;
use warnings;

use HTML::Parser ();

my $htmltree = [ { tag => 'document', content => [] } ];

# Content list that currently receives new children.
my $node = $htmltree->[0]->{content};

# Stack of content lists to return to when an element closes.  Slot 0 is a
# sentinel that is never popped; every further slot pairs with one entry of
# @open_tags, so @prevnodes always holds exactly one element more.
my @prevnodes = ($htmltree);

# Names of the elements that are currently open, outermost first.
my @open_tags;

# Elements that never have an end tag.  Descending into them would leave the
# builder stuck inside the element for the rest of the document.
my %is_void = map { $_ => 1 } qw(
    area base br col embed hr img input link meta param source track wbr
);

# Start-tag handler.
#   $tagname - element name as reported by the parser
#   $attr    - hash ref of the element's attributes
# Appends a new element node to the current content list and, unless the
# element is void, makes the new node's content list the current one.
sub start {
    my ($tagname, $attr) = @_;

    # Attributes are copied first so that the structural keys win on a clash.
    my $newnode = {
        %{ $attr || {} },
        tag     => $tagname,
        content => [],
    };
    push @{$node}, $newnode;

    return if $is_void{$tagname};

    push @prevnodes, $node;
    push @open_tags, $tagname;
    $node = $newnode->{content};
    return;
}

# End-tag handler.
#   $tagname - name of the element being closed
# Closes the innermost open element with that name together with everything
# still open inside it.  An end tag that matches no open element is ignored,
# so stray end tags cannot unwind the stack past the document root.
sub end {
    my $tagname = shift;

    my $depth;
    for my $i (reverse 0 .. $#open_tags) {
        if ($open_tags[$i] eq $tagname) {
            $depth = $i;
            last;
        }
    }
    return unless defined $depth;

    my $levels = @open_tags - $depth;
    splice @open_tags, $depth;

    # The first entry removed is the content list that was current when the
    # matched element was opened, i.e. the list we have to return to.
    ($node) = splice @prevnodes, -$levels;
    return;
}

# Text handler.
#   $text - decoded text of one text run
# Strips one trailing newline and appends the text to the current content
# list unless nothing is left.
sub text {
    my $text = shift;

    chomp $text;
    push @{$node}, $text if $text ne '';
    return;
}

my $p = HTML::Parser->new(
    api_version => 3,
    start_h     => [ \&start, "tagname, attr" ],
    end_h       => [ \&end,   "tagname" ],
    text_h      => [ \&text,  "dtext" ],
);

# Report each text run in one piece; otherwise a run may be delivered in
# several parts and chomp would strip a newline from the middle of it.
$p->unbroken_text(1);

# parse_file returns undef when the file cannot be opened.
defined $p->parse_file("test.html")
    or die "Cannot parse test.html: $!\n";