spickles has asked for the wisdom of the Perl Monks concerning the following question:
Anyone know how to skip the entire content between two HTML tags? I'm using HTML::Parser, and I have figured out how to use the start and end subclasses to match on certain tags and manipulate them. If there is text within those tags I want to keep or modify, I print them to an output file. So I thought that if I matched on a style tag and simply returned (rather than printing the text to my output file) that it wouldn't be there. But now I understand that the text within the tags is processed by the separate subclass for text. What I'd like to do is match on the <style></style> tags and NOT print the text between them.
#!c:/perl/bin/perl #This version allows the use of a filename specified on the command li +ne use strict; use warnings; package HTMLStrip; use base "HTML::Parser"; #system("cls"); my $output = "c:/perl/bin/parseOutput.txt"; if (-e $output) { unlink $output; } open PARSETEXT,'>',$output or die $!; my $p = new HTMLStrip; # parse line-by-line, rather than the whole file at once while (<>) { $p->parse($_); } # flush and parse remaining unparsed HTML $p->eof; close PARSETEXT; sub text { my ($self, $text) = @_; chomp($text); $text =~ s/#.*//; # comments $text =~ s/^\s+//; # leading whitespace $text =~ s/\s+$//; # trailing whitespace #Once the beginning comment if found, remove style if ($text =~ /^<\!--$/) { next unless ($text =~ /^-->$/); } #Print non-blank lines if (length($text) > 0) { print PARSETEXT $text . "\n"; } } #Process OPENING/STARTING HTML tags sub start { my ($self, $tag, $attr, $attrseq, $origtext) = @_; #We're only interested in dealing with table tags if ($tag =~ /^table$/) { print PARSETEXT "\n************* BEGIN TABLE ****************\ +n"; } if ($tag =~ /^tr$/) { print PARSETEXT "\n"; } if ($tag =~ /^td$/) { print PARSETEXT "\t"; if (defined $attr->{'class'}) { if ($attr->{'class'} =~ /alarmClear/) { print PARSETEXT "OK"; } if ($attr->{'class'} =~ /alarmSet/) { print PARSETEXT "ALARM"; } } } } #Process CLOSING/ENDING HTML tags sub end { my($self, $tag, $origtext) = @_; if ($tag =~ /^table$/) { print PARSETEXT "\n************* END TABLE ****************\n" +; } }
############################################################ ############################################################ HTML FILE CONTENTS <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html> <head> <title>System Status</title> <style type="text/css"> <!-- /* colors to use: "CSI Blue" 004683 "CSI Red" 7e2d41 "CSI Grey" e5e5e5 "Liz Green" 00CC33 */ body { background-color: #FFFFFF; color: #004683; } dt { font-weight: bold; } #main { float: left; margin: 1em; } #menu { float: left; margin: 1em; font-family: sans-serif; background-color: #7e2d41; } #menu ul { list-style: none; margin: 0; padding: 0; } #menu li { margin: 1em; } #menu li a { color: #fff; text-decoration: none; } #menu li a { width: auto; } div.messages { /*width: 70%;*/ padding: 1em; margin-top: 1em; margin-bottom: 1em; font-size: x-large; border: thin solid black; } #messages { /*width: 70%;*/ padding: 1em; margin-top: 1em; margin-bottom: 1em; font-size: x-large; border: thin solid black; } /* TODO trash all these? */ .tallrow { padding-bottom: 6em; /*width: 70%;*/ } .input { /*width: 45%;*/ float: right; } .info { float: right; } /* TODO end of area to trash? */ table.bigdata { text-align: center; } div.boxer { padding: 1em; background-color: #e5e5e5; border: thin solid black; } td.label { text-align: right; padding-right: 1em; } td.labelInvalid { text-align: right; padding-right: 1em; color: red; } td.lineitem { text-align: left; } th { padding-right: 1em; } tr.spacer { height: 1em; } .section { text-align: right; } td.alarmSet { background-color: red; color:white; } td.alarmSet:after { content: "ALARM"; } td.alarmClear { background-color: #00CC33; } td.alarmClear:after { content: "OK"; } --> </style> </head> <body onLoad="setTimeout('reload()',60000)" > <div id="menu"><ul><li><a href="index.cgi">System Status</a></li><li>< +a href="netconf.cgi">Local Network</a></li><li><a href="rf.cgi">RF Co +nfiguration</a></li><li><a href="programfilter.cgi">Program a Filter< +/a></li><li><a href="remotenetconf.cgi">Remote Network</a></li><li><a + href="snmpconf.cgi">SNMP Configuration</a></li><li><a href="status.c +gi">System Health</a></li><li><a href="install.cgi">Install Software< +/a></li><li><a href="reboot.cgi">Reboot</a></li></ul></div><div id="m +ain" onLoad=setTimeout('reload()',60000)><h1>System Status</h1> <script type="text/javascript"> function reload() { window.location.reload(); } </script> <table class="bigdata"> <tr> <td class="label"><b>Timestamp</b></td> <td colspan="4" class="lineitem"> 2010:03:09 - 18:26:45 </ +td> </tr> <tr> <td class="label"><b>System Uptime</b></td> <!-- I'd like to use colspan="0" but IE sucks. --> <td colspan="4" class="lineitem"> 16 days 3 hours 5 minutes </td> </tr> <tr> <td class="label"><b>Software Version</b></td> <td colspan="4" class="lineitem"> 2.2.4 REL </td> </tr> <tr> <td class="label"><b>Serial Number</b></td> <td colspan="4" class="lineitem"> CFB90357-000000 </td> </tr> <tr> <td class="label"><b>Model Number</b></td> <td colspan="4" class="lineitem"> DSP85-C/P </td> </tr> <tr> <td class="label"><b>Item Number</b></td> <td colspan="4" class="lineitem"> CS10-377-403 </td> </tr> <tr> <td class="label"><b>Location</b></td> <td colspan="4" class="lineitem"> Unknown </td> </tr> <tr class="spacer" /> <tr> <th></th> <th colspan="2"> Band 1 (CELL) </th> <th colspan="2"> Band 2 (PCS) </th> </tr> <tr> <th align="right">Active Filter</th> <td colspan="2"><tt +>cgA0-0</tt></td><td colspan="2"><tt>pgA0B4B5F0C5-0</tt></td> </tr> <tr class="spacer" /> <tr> <th style="text-align:right">Power</th> <th> Down Link </th> <th> Up Link</th> <th> Down Link </th> <th> + Up Link</th> </tr> <tr> <td class="label"> In-band Input<sup><a target="_blank" href="help.html#1 +">?</a></sup> (dBm) </td> <td>-35.6</td><td>≤ -66.0</td><td>-43.4</td><td>≤ -66.0</td> </tr> <tr> <td class="label"> Measured Output<sup><a target="_blank" href="help.html +#2">?</a></sup> (dBm) </td> <td>18.6</td><td>≤ 4.0</td><td>22.8</td><td>≤ 4.0</td> </tr> <tr> <td class="label"> Composite Input<sup><a target="_blank" href="help. +html#4">?</a></sup> (dBm) </td> <td>-33.9</td><td>-43.4</td><td>-23.7</td><td>≤ -53.0</td> </tr> <tr class="spacer" /> <tr class="section"> <th>Gain Control</th> </tr> <tr> <td class="label">AGC Mode</td> <td colspan="2">On</td><td colspan="2">On</td> </tr> <tr> <td class="label">AGC Attenuation (dB)</td> <td>0.0</td><td>0.0</td><td>0.0</td><td>0.0</td> </tr> <tr> <td class="label">System Gain</td> <td>53.5</td><td>70.0</td><td>66.0</td><td>70.0</td> </tr> <tr class="spacer" /> <tr class="section"> <th>RF Alarms</th> </tr> <tr> <td class="label">Over Range</td> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> </tr> <tr> <td class="label">Oscillation</td> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> </tr> <tr> <td class="label">VSWR</td> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> </tr> <tr> <td class="label">Out of Band Overdrive</td> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> </tr> <tr> <td class="label">Low Signal</td> <td class="alarmClear" /> <td /> <td class="alarmClear" /> <td /> </tr> <tr> <td class="label">No Signal</td> <td class="alarmClear" /> <td /> <td class="alarmClear" /> <td /> </tr> <tr class="spacer" /> <tr class="section"> <th>System Alarms</th> </tr> <tr> <td class="label">Synthesizer Lock</td> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> <td class="alarmClear" /> </tr> <tr> <td class="label">Voltage</td> <td colspan="2" class="alarmClear" /> <td colspan="2" class="alarmClear" /> </tr> <tr> <td class="label">Temperature</td> <td colspan="2" class="alarmClear" /> <td colspan="2" class="alarmClear" /> </tr> <tr> <td class="label">Software</td> <td colspan="2" class="alarmClear" /> <td colspan="2" class="alarmClear" /> </tr> <tr> <td class="label">Hardware</td> <td colspan="2" class="alarmClear" /> <td colspan="2" class="alarmClear" /> </tr> </table> </div></body> </html>
Regards,
Scott
|
|---|
| Replies are listed 'Best First'. | |
|---|---|
|
Re: HTML::Parser Skip Style Content
by runrig (Abbot) on Mar 09, 2010 at 20:38 UTC | |
by almut (Canon) on Mar 09, 2010 at 21:07 UTC | |
by runrig (Abbot) on Mar 09, 2010 at 23:22 UTC | |
by spickles (Scribe) on Mar 18, 2010 at 15:01 UTC |