#!/usr/bin/perl
# remove img & anchor tags.plx
# Program will read in an html file, remove the img tag and print out entire doc.
# 1. No need for file variable yet: open (INFILE, "<".$htmlFile) or die("Can't read source file!\n");
# 2. Alternative: m/<A[^>]+>(.*?)<\/A>/ - Will not remove closing tag though - why?
# 3. Why is the interpreter flipping out over an 'undefined variable', when the
# original regexp, m/<A[^>]+>(.*?)<\/A>/, is known to work? What am I missing?
use warnings;
use diagnostics;
use strict;
use HTML::Parser; # Include this module for future reference - may need to abandon
# regexps in favour of parse-trees.
# Declare and initialise variables.
# NOTE(review): the original pattern literals had lost their tag text
# ($pattern1 was a bare newline, $pattern2 was ']+>' and $pattern3 was empty).
# They are reconstructed here from the script's stated purpose (strip image
# and anchor tags) -- confirm they match the tags you actually want removed.
my $pattern1 = '<img\b[^>]*>';    # an image tag
my $pattern2 = '<a\b[^>]*>';      # an opening anchor tag
my $pattern3 = '</a\s*>';         # a closing anchor tag
my @htmlLines;

# Location of the HTML test file.
my $htmlFile = "E:\\Documents and Settings\\Richard Lamb\\My Documents\\HTMLworkspace\\HTML practice\\My First Page!\\firsttest.html";

# Open HTML test file and read into array (one element per line).
# Three-argument open with a lexical handle keeps the file name from being
# interpreted as a mode, and $! reports why the open failed.
open my $inFile, '<', $htmlFile or die "Sod! Can't open this file: $!\n";
@htmlLines = <$inFile>;
close($inFile);

# Test for presence of patterns in HTML file.
# grep in boolean context yields the number of matching lines, so each branch
# runs only when the file really contains something to strip.
if (grep { /$pattern1/i } @htmlLines)
{
    scrapImageTag(); # calls to remove image tags
}
else
{
    print "No tags matching this pattern within the HTML document.\n";
}

# An opening or a closing anchor tag on its own is still worth removing,
# so either pattern matching is enough to trigger the clean-up.
if (grep { /$pattern2/i || /$pattern3/i } @htmlLines)
{
    scrapAnchorTag(); # calls to remove anchor tags
}
else
{
    print "No tags matching this pattern within the HTML document.\n";
}
# Removes image tag elements in array.
# Takes no arguments. Deletes every case-insensitive match of the file-level
# $pattern1 from each element of the file-level @htmlLines. The loop variable
# aliases the array elements, so the array is edited in place.
sub scrapImageTag
{
    foreach my $line (@htmlLines)
    {
        # Replace each match of the image-tag pattern with nothing.
        $line =~ s/$pattern1//ig; # case insensitivity and global search for pattern
    }
    return;
}
# Strips anchor tag elements from the array.
# Takes no arguments. For every element of the file-level @htmlLines it
# deletes, in place, all case-insensitive matches of $pattern2 and then of
# $pattern3 (both declared at file scope).
sub scrapAnchorTag
{
    for (@htmlLines)
    {
        # $_ aliases the current array element, so both substitutions
        # modify @htmlLines directly.
        s{$pattern2}{}gi;    # first anchor pattern: all matches, any case
        s{$pattern3}{}gi;    # second anchor pattern: all matches, any case
    }
}
printHTML();
# Writes the reformatted HTML document to the currently selected output
# handle: every element of the file-level @htmlLines, in order, unchanged.
sub printHTML
{
    print $_ for @htmlLines;
}
# Finish up: leave a blank gap after the document, pause for two seconds,
# then print the closing status line.
print("\n\n");
sleep(2);
print("Success?!\n");