Peamasii has asked for the wisdom of the Perl Monks concerning the following question:
Hello all I'm using perl 5.8.0. and HTML::Parser 3.26. I'm parsing a file with many links and just extracting the links. The problem that occurs consistently is that some of the link titles get truncated right in the Parser subroutine. Only about 10% of the links get truncated like this. I examined the input file with a hex editor and was unable to detect any reason for this to happen. If you run the code, most of the lines printed out with EditLine contain incomplete link title output in index.xml. Here is the input file and the code (sliparser.pl):
run command:> perl sliparser.pl prodnr.html number
written output with incomplete links:> index.xml
#!/usr/bin/perl use strict; # define the subclass package ProcExternal; use base "HTML::Parser"; #specifying filename to open if (($ARGV[0] eq "?") || ($#ARGV != 1)) {die "usage $0 file_name orde +ring (name/number)\n";}; my $file_name=$ARGV[0]; my $order = $ARGV[1]; my $skip = 1; my ($product_url, $product_bigurl, $product_id, $product_name, $dir_na +me); my (@tarray, @sarray); my ($orig_text, $orig_self, $product_line); @sarray = split /\\/,$file_name; pop @sarray; $dir_name = join ('\\', @sarray); if (!open(OFILE,">index.xml")) { die "Can't open $product_name: $!"; } +; print OFILE "<Products>\n"; &proc_html($file_name); print OFILE "</Products>\n"; close OFILE; sub text { my ($self, $text) = @_; $orig_self = $self; $orig_text = $text; if (!$skip) { @tarray = split(/ /,$text); if ($order eq "name") { $product_id = pop @tarray; } elsif ($order eq "number") { $product_id = shift @tarray; } elsif ($order eq "none") { $product_id = "999"; } else { die "invalid ordering parameter\n"; } if ($product_id !~/[0-9]/) { print "number format error: $text\n"; } foreach (@tarray) { $_ =~ s/\s+//g; } $product_name = join(' ',@tarray); $product_name =~ s/^\s//g; if (($product_id) && ($product_name)) { print OFILE "<Product>\n"; if ($#tarray < 2) { print OFILE "\t<EditLine>$orig_text</EditLine>\n"; print "EditLine #: $product_id $product_name\n"; }; print OFILE "\t<Name>$product_name</Name>\n"; print OFILE "\t<PDF>$product_url</PDF>\n"; print OFILE "\t<Number>$product_id</Number>\n"; print OFILE "</Product>\n\n"; # } } $skip = 1; } } sub comment { my ($self, $comment) = @_; } sub start { my ($self, $tag, $attr, $attrseq, $origtext) = @_; if ($tag eq "a") { $skip = 0; $product_line = $origtext; $product_bigurl = $attr->{href}; @sarray = split('/',$product_bigurl); $product_url = uc(pop @sarray); } } sub end { my ($self, $tag, $origtext) = @_; if ($tag eq "a") { # print $origtext; } } sub proc_html { my $htmlcontent = shift (@_); my $p = new ProcExternal; $p->parse_file($htmlcontent); $p->eof; return; }
edit (broquaint): added <readmore>
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: HTML::Parser problem
by matija (Priest) on Mar 29, 2004 at 08:40 UTC | |
by Peamasii (Sexton) on Mar 29, 2004 at 09:02 UTC | |
|
Re: HTML::Parser problem
by Peamasii (Sexton) on Mar 29, 2004 at 08:35 UTC | |
by b10m (Vicar) on Mar 29, 2004 at 09:00 UTC | |
by Peamasii (Sexton) on Mar 29, 2004 at 09:05 UTC |