in reply to Weather warnings from www.meteoalarm.eu
I was unable to find a page on the website that corresponded to your regexes so I have taken a guess at what it might look like. If you could post a link to an actual page you're dealing with we might have more to go on. For instance, this uses HTML::TokeParser::Simple to do a single pass examining every token and extracting data as appropriate (it covers similar ground to the regex in your _extract_details method).
If the page is well structured it may be more appropriate to consider something like HTML::TreeBuilder which is more powerful and could simplify proceedings greatly.
#! /usr/bin/perl
use strict;
use warnings;
use Data::Dumper;

# Meteoalarm::Parser
#
# Walks an HTML document once, token by token, and extracts one warning
# record per call to parse().  A record is expected to look like:
#
#   <img ...>
#   <div class="info"> valid from ... Until ... </div>
#   <div class="info"> <type> Awareness Level: <level> </div>
#
# The layout is a guess at the real page (see the sample under __DATA__),
# so the patterns below will need adjusting against actual markup.
{
    package Meteoalarm::Parser;
    use HTML::TokeParser::Simple;

    # new($content)
    # Builds a parser over the HTML held in the string $content.
    # Returns the blessed parser object.
    sub new {
        my $class   = shift;
        my $content = shift;

        my $p = HTML::TokeParser::Simple->new(string => $content);
        my $self = { parser => $p };

        bless($self, $class);
        return $self;
    }

    # parse()
    # Advances to the next <img> tag and reads the two "info" divs that
    # follow it.  On success the record (keys: from, until, type, level)
    # is stored for get_data() and 1 is returned.  Returns nothing when
    # there is no further <img> or when an expected "info" div is
    # missing, so callers can use it directly as a loop condition.
    sub parse {
        my $self = shift;
        my (%data, $txt);

        # every record starts at an <img>; no more images, no more records
        $self->find_img() or return;

        $txt = $self->get_div_txt(q{info});
        # a truncated record would otherwise bind undef to the regex
        defined $txt or return;
        ($data{from}, $data{until}) = $txt =~ /^valid from (.*)Until(.*)$/;

        $txt = $self->get_div_txt(q{info});
        defined $txt or return;
        ($data{type}, $data{level}) = $txt =~ /^(.*)Awareness Level: (.*)$/;

        $self->{data} = \%data;
        return 1;
    }

    # find_img()
    # Consumes tokens until an <img> start tag is seen and returns that
    # token.  Returns nothing once the input is exhausted.
    sub find_img {
        my $self = shift;
        my $p    = $self->{parser};

        while (my $t = $p->get_token) {
            return $t if $t->is_start_tag(q{img});
        }
        return;
    }

    # get_div_txt($div_class)
    # Consumes tokens until a <div> whose class attribute equals
    # $div_class is found, then returns the text that follows it as
    # produced by the parser's get_phrase().  Returns nothing if no such
    # div remains.
    sub get_div_txt {
        my $self      = shift;
        my $div_class = shift;
        my $p         = $self->{parser};

        while (my $t = $p->get_token) {
            next unless $t->is_start_tag(q{div});

            my $class = $t->get_attr(q{class});
            next unless $class and $class eq $div_class;

            # Discard the token directly after the <div> tag; in the
            # sample data this is the whitespace before the first <b>.
            $p->get_token;
            return $p->get_phrase;
        }
        return;
    }

    # get_data()
    # Returns the hash reference stored by the last successful parse().
    sub get_data {
        my $self = shift;
        return $self->{data};
    }
}

# script
my $content = do { local $/; <DATA> };
my $mp = Meteoalarm::Parser->new($content);

# one iteration per warning record found in the document
while ($mp->parse) {
    my $data = $mp->get_data;
    print Dumper $data;
}

__DATA__
<img src="my.jpeg">
<!-- possible stuff -->
<div class="info">
<b>valid from</b> from date 1 <b>Until</b> until date 1
</div>
<div class="info">
<b>type 1</b> Awareness Level: <b>awareness level 1</b>
</div>
<div class="text">
text
</div>
<!-- possible stuff -->
<img src="my_other.jpeg">
<!-- possible stuff -->
<div class="info">
<b>valid from</b> from date 2 <b>Until</b> until date 2
</div>
<div class="info">
<b>type 2</b> Awareness Level: <b>awareness level 2</b>
</div>
<div class="text">
text
</div>
<!-- and so on -->
update: added output

$VAR1 = {
          'level' => 'awareness level 1',
          'until' => ' until date 1',
          'from' => 'from date 1 ',
          'type' => 'type 1 '
        };
$VAR1 = {
          'level' => 'awareness level 2',
          'until' => ' until date 2',
          'from' => 'from date 2 ',
          'type' => 'type 2 '
        };
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re^2: Weather warnings from www.meteoalarm.eu
by walto (Pilgrim) on May 27, 2010 at 20:23 UTC | |
by wfsp (Abbot) on May 28, 2010 at 08:50 UTC | |
by walto (Pilgrim) on May 30, 2010 at 12:28 UTC | |
|
Re^2: Weather warnings from www.meteoalarm.eu
by StommePoes (Scribe) on Jun 03, 2010 at 07:58 UTC |