use warnings;
use strict;
use XML::LibXML;

# Convert an HTML "spec sheet" (read from the DATA section) into a small XML
# document: the section header becomes the root element, and every
# spec_row_left / spec_row_right pair becomes <label>value</label> under it.

# snipName(@labels)
# Rewrites each argument IN PLACE (the arguments alias the caller's
# variables): all whitespace is removed and the first character is
# lower-cased, e.g. "Total Power" -> "totalPower", "Type" -> "type".
sub snipName {
    for (@_) {
        s/\s+//g;
        # Lower-case unconditionally; tying this to the success of the
        # substitution would leave labels without whitespace capitalised.
        $_ = lcfirst($_);
    }
    return;
}

# snipValue(@values)
# Trims leading and trailing whitespace from each argument IN PLACE.
sub snipValue {
    for (@_) {
        s/\A\s+|\s+\z//g;
    }
    return;
}

my $parser = XML::LibXML->new();
$parser->keep_blanks(0);
# $parser->recover_silently(1);    # Might need, might not.

my $doc = $parser->parse_html_fh(\*DATA);
my $xml = XML::LibXML::Document->new();

# The first header div inside the spec section names the root element.
my ($header) = $doc->findnodes(
    '//div[@class="spec_section"]//div[contains(@class,"spec_row_header")]'
);
die 'No spec_row_header div found inside a spec_section div'
    unless defined $header;

my $root_name = $header->textContent;
snipName($root_name);

my $root = $xml->createElement($root_name);
$xml->setDocumentElement($root);

for my $col ( $doc->findnodes('//div[@class="spec_row_left"]') ) {
    my $name = $col->textContent;
    snipName($name);

    # An empty name cannot be used as an element name; skip such rows.
    unless ( length $name ) {
        warn "Skipping spec row with an empty label\n";
        next;
    }

    # The value lives in the sibling spec_row_right div of the same parent.
    my ($right) = $col->parentNode->findnodes('div[@class="spec_row_right"]');
    unless ( defined $right ) {
        warn "Skipping '$name': no spec_row_right div next to it\n";
        next;
    }

    my $value = $right->textContent;
    snipValue($value);

    my $tag = $xml->createElement($name);
    $tag->appendChild( $xml->createTextNode($value) );
    $root->appendChild($tag);
}

# Format level 1 asks the serializer for indented output.
print $xml->serialize(1);

__DATA__
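<div class="spec_section">
  <div class="spec_row_header">Engine</div>
  <div>
    <div class="spec_row_left">Type</div>
    <div class="spec_row_right">Diesel</div>
  </div>
  <div>
    <div class="spec_row_left">Builder</div>
    <div class="spec_row_right">MTU</div>
  </div>
  <div>
    <div class="spec_row_left">Model</div>
    <div class="spec_row_right">2x 16V396TB94</div>
  </div>
  <div>
    <div class="spec_row_left">Power</div>
    <div class="spec_row_right">2561kw / 3480hp</div>
  </div>
  <div>
    <div class="spec_row_left">Total Power</div>
    <div class="spec_row_right">5121kw / 6960hp</div>
  </div>
  <div>
    <div class="spec_row_left">Engine Propulsion</div>
    <div class="spec_row_right">Twin Screws</div>
  </div>
</div>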
#### Diesel MTU 2x 16V396TB94 2561kw / 3480hp 5121kw / 6960hp Twin Screws