comment on

#! /usr/bin/perl 

########
# tgrep - A grep for XML/HTML tags. See POD at bottom for docs
#
# Copyright (C) 2000-2002 Adrian Howard
# 
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA
########

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Fatal qw(open close rename);
use File::Find;


my $OPTIONS = {};
GetOptions( $OPTIONS,
    'attr=s@',
    'noattr=s@',
    'set=s@',
    'help|?',
    'nobackup',
    'first',
) or pod2usage(2);
pod2usage(2) if $OPTIONS->{help};
my $TAG = shift @ARGV || pod2usage({-verbose => 1, -message => 'need a
+ tag to search for'});
pod2usage({-verbose => 1, -message => "search for tag $TAG in what?"})
+ unless @ARGV;


sub has_attribute {
    my ($tag, $attr) = @_;
    my ($name, $value) = split(/=/, $attr);
    my $has_attribute = 0;
    if (defined($value)) {
        if ($value eq '') {
            $has_attribute = 1 if $tag =~ m/\s+$name\s*=\s*(['"])$valu
+e\1/si;
        } else {
            $has_attribute = 1 if $tag =~ m/\s+$name\s*=\s*(['"]?)$val
+ue\1/si;
        };
    } else {
        $has_attribute = 1 if $tag =~ m/\s+$name[^\w]/si;
    };
    return($has_attribute);
};


sub tag_matches {
    my $tag = shift;
    foreach my $attr (@{$OPTIONS->{attr}}) {
        return(0) unless has_attribute($tag, $attr);
    };
    foreach my $attr (@{$OPTIONS->{noattr}}) {
        return(0) if has_attribute($tag, $attr);
    };
    return(1);
};


sub move_original {
    my $file = shift;
    my $backup = $file . "~";
    my $i='';
    while (-e "$backup$i") { $i++ };
    rename $file, "$backup$i";
    return("$backup$i");
};


sub read_file {
    my $file = shift;
    local $/ = undef;
    open(FILE, $file);
    my $content = <FILE>;
    close(FILE);
    return($content);
};


sub clear_attribute_from_tag {
    my ($tag, $name) = @_;
    $tag =~ s/\s+$name\s*=\s*\w+?\b//gsi;
    $tag =~ s/\s+$name\b(\s*=\s*(['"]).*?\2)?//gsi;
    return($tag);
};


sub add_attribute_to_tag {
    my ($tag, $name, $value) = @_;
    my $attr = "$name";
    if (defined($value)) {
        $attr .= qq!="$value"!;
    };
    $tag =~ s/(<\w+)/$1 $attr/;
    return($tag);
};


sub set_tag_attributes {
    my $tag = shift;
    return($tag) unless tag_matches($tag);
    foreach my $attr (@{$OPTIONS->{set}}) {
        my ($name, $value) = split(/=/, $attr);
        $tag = clear_attribute_from_tag($tag, $name);
        $tag = add_attribute_to_tag($tag, $name, $value);
    };
    return($tag);
};


sub replace_matches_in_file {
    my $file = shift;
    my $content = read_file($file);
    $content =~ s/(<$TAG[^>]*>?)/set_tag_attributes($1)/sige;
    my $original_file = move_original($file);
    die "file $file still exists - this should not happen!\n" if -e $f
+ile;
    open(FILE, ">$file");
    print FILE $content;
    close(FILE);
    if ($OPTIONS->{nobackup}) {
        unlink $original_file;
    } else {
        print "$file -> $original_file\n";
    };
};


sub find_tag {
    my $file = $_;
    return unless -f $file;
    return if $file =~ m/~\d*$/;
    my $found;
    open(FILE, $file);
    LINE: while (my $line = <FILE>) {
        next unless $line =~ m/<$TAG/si;
        MATCH: while ($line =~ m/<$TAG/si) {
            my $line_number = $.;
            until ($line =~ m/<$TAG.*?>/) {
                my $next_line = <FILE>;
                last unless $next_line;
                $line .= $next_line;
            };
            my $tag;
            ($tag, $line) = ($line =~ m/(<$TAG[^>]*>?)(.*)/si);
            next MATCH unless tag_matches($tag);
            chomp($tag);
            $tag =~ s!([^\x20-\x7E])!sprintf("\\x%02X", ord($1))!gse;
            print "$File::Find::name : $line_number : $tag\n";
            $found=1;
            last LINE if $OPTIONS->{first};
        };
    };
    close(FILE);
    replace_matches_in_file($file) if $found && exists $OPTIONS->{set}
+;
};


find(\&find_tag, @ARGV);


__END__
=head1 NAME

tgrep - A grep for XML/HTML tags.

=head1 SYNOPSIS

tgrep [options] tag file ...

List all occurances of the tag in the specified files. Directories are
recursed into.

    Options:
        -attr ATTR      Return tags only if they contain attribute 
        -noattr ATTR    Return tags only if they do not contain attrib
+ute
        -set ATTR        Replace the attribute in the found tags
        -nobackup        Don't create backup copies of the original fi
+le
        -help           Brief help message
        -first            Only return first match in a file
        -?              Same as -help
        
ATTR can be "ATTR" or "ATTR=VALUE". VALUE in -attr or -noattr is treat
+ed as a regular expression. 

By default backup files are created of the form FILENAME~, FILENAME~1,
+ FILENAME~2, etc. when you
alter attributes using -set. Files named in this style will never be a
+ffected by tgrep.

Using -first will only show the first match in each file. -set will st
+ill alter all matching tags.
Using -first will increase speed for updates slightly.

Examples:

    % tgrep -noattr alt img htdocs
    (find all IMG tags without ALT text)

    % tgrep -attr width=60 -attr height=40 img htdocs
    (find IMG tags whose WIDTH is 60 and HEIGHT is 40)
    
    % tgrep -noattr 'class=.*sidebar' p htdocs
    (find paragraph tags whose class doesn't end with "sidebar")
    
    % tgrep -noattr alt -set alt="" -attr src=".*/1x1.gif" img htdocs
    (set the ALT text of all "1x1.gif" IMGs without existing ALT text 
+to "")      
        
=cut
[download]

In reply to tgrep - A grep for HTML tags by adrianh

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&`
	<		`<`
	>		`>`
	[		`[`
	]		`]`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.