#!/usr/local/bin/perl use warnings; use strict; use Cwd; use File::Copy; use Getopt::Long; use HTML::TokeParser::Simple 1.4; use Pod::Usage; my $VERBOSE = 1; GetOptions( 'help|?' => sub { pod2usage(-verbose => 2);exit } , 'config=s' => \my $CONFIG, 'backup=s' => \my $BACKUP, 'verbose!' => \$VERBOSE, 'quiet' => sub { $VERBOSE = 0 }, 'debug' => \my $DEBUGGING, 'ordered' => \my $ORDERED_ATTRIBUTES, 'ignore' => \my $IGNORE_ATTRIBUTES, 'text' => \my $MATCH_TEXT, 'fuzzy' => \my $FUZZY_MATCH ); backup_dir( $BACKUP ); $BACKUP .= '/' unless substr( $BACKUP, -1 ) =~ /\//; my %CONFIG = read_config( $CONFIG ); if ( $DEBUGGING ) { no warnings 'once'; require Data::Dumper; print Data::Dumper->Dump([\%CONFIG], ['*CONFIG']); } my %REPLACEMENTS; @ARGV = map { glob $_ } @ARGV; foreach my $file (@ARGV) { print "Processing ($file)...\n" if $VERBOSE; backup_file( $file, $BACKUP ); $REPLACEMENTS{$file} = 0; open HTML, '+<', $file or die "Can't open ($file) for updating: $!"; my $html = parse_document( *HTML,$file ); print "\t$REPLACEMENTS{$file} replacement(s) made to ($file)\n" if $VERBOSE; if ( $REPLACEMENTS{$file} ) { seek HTML, 0, 0 or die "Can't seek to start of ($file): $!"; print HTML $html; truncate HTML, tell(HTML) or die "Can't truncate ($file): $!"; } if ( $DEBUGGING ) { print $html; } close HTML; } sub parse_document { my ($fh,$file) = @_; my $parser = HTML::TokeParser::Simple->new( $fh ); my $html = ''; while ( my $token = $parser->get_token ) { if( stacks_match( $parser, $CONFIG{stack} ) ) { $html .= $CONFIG{new}; $REPLACEMENTS{$file}++; } else { $html .= $token->as_is; } } return $html; } sub stacks_match { # if the stack matches the current token stream, return true and leave the # parser at the end of the stream match. If it doesn't match, set the # parser to its original state and return false. my ($parser,$stack) = @_; my $stacks_match = 1; my @current_stack; for my $i ( 0 .. $#$stack ) { my $token = $parser->get_token; unless ($token) { # we've reached the end of the document and thus cannot match $parser->unget_token(@current_stack); return; } push @current_stack => $token; $stacks_match = tokens_match($token,$stack->[$i]); unless ($stacks_match) { # stacks didn't match. Restore state and return if ( $DEBUGGING ) { print "\n*** Current stack match failed:\n\n"; print Data::Dumper::Dumper(\@current_stack), "\n"; } $parser->unget_token(@current_stack); return; } } if ( $DEBUGGING ) { print "\n*** Matched this stack against config stack:\n\n"; print Data::Dumper::Dumper(\@current_stack), "\n"; } return 1; } sub munge_text { my $text = shift; $text =~ s/\W//g; return lc $text; } sub tokens_match { my ($token,$stack_token) = @_; if ( $token->[0] ne $stack_token->[0] ) { # token types did not match return; } elsif ($token->is_tag) { return token_as_string($token) eq token_as_string($stack_token); } elsif ($MATCH_TEXT) { my $curr_text = $token->return_text; my $stack_text = $stack_token->return_text; if ($FUZZY_MATCH) { $curr_text = munge_text( $curr_text ); $stack_text = munge_text( $stack_text ); } if ( $curr_text =~ /\S/ or $stack_text =~ /\S/ ) { return $curr_text eq $stack_text; } } else { # we're ignoring whatever it is, so it's an automatic match return 1; } } sub read_config { my $file = shift; my %allowed = map {$_=>1} qw(old new); open CONFIG, "<", $file or die "Cannot read ($file): $!"; my %config; local $_; my ($section,$old_section) = ('',''); while () { next unless /\S/; if ( /^\s*\[([^\]]+)\]\s*$/ ) { # [$section] $section = $1; die "Unknown section ($section) in config file ($file)" unless exists $allowed{$section}; $config{$section} = ''; next; } $config{$section} .= $_; } chomp foreach values %config; close CONFIG; return add_stack( %config ); } sub add_stack { my %config = @_; my $html = $config{old}; $config{stack} = []; my $parser = HTML::TokeParser::Simple->new( \$html ); while ( my $token = $parser->get_token ) { push @{$config{stack}} => $token; } delete $config{old}; return %config; } sub token_as_string { # so far, this is fairly simple. It merely "stringifies" the tag type # and attributes. This may change in the future. my $token = shift; my $sequence = $token->return_attrseq; return canonical_tag($token) unless $sequence && ! $IGNORE_ATTRIBUTES; @$sequence = sort @$sequence unless $ORDERED_ATTRIBUTES; my $attributes = $token->return_attr; my $results = ''; foreach my $attr (@$sequence) { $results .= $attr . $attributes->{$attr}; } return canonical_tag($token).$results; } sub backup_dir { my $dir = shift || die pod2usage(); unless ( -d $dir ) { mkdir $dir or die "Could not makedir ($dir): $!"; } } sub canonical_tag { # prepends a backslash my $token = shift; my $tag = $token->return_tag; $tag = "/$tag" if $token->is_end_tag and '/' ne substr $tag, 0, 1; return $tag; } sub backup_file { my ($file,$backup) = @_; if ( -e $file ) { copy( $file, "$backup$file" ) or die "Could not copy ($file) to ($backup): $!";\ return 1; } else { warn "\tWARNING: File ($file) does not exist in (",cwd,")\n"; return; } } __END__ =head1 NAME htmlreplace -- A simple HTML replacement tool =head1 SYNOPSIS B for more information htmlreplace [options] [filenames] Options: --help Display POD --? Same as --help --verbose List files while processing them (default) Will also list number of substitutions made. --noverbose Turn off --verbose --quiet Same as --noverbose --config *file* Location of config file --backup *dir* The directory to back up the files to --text Match text (default is off -- only check structure) --fuzzy Same as --text, but matching is more robust (see below) --ignore Use this to ignore attributes --ordered If specified, attributes must appear in the same order in both the config html and the target html. Default is unordered. --debug This will dump the config token stack to STDOUT. =head1 OVERVIEW This program allows a the user to create a simple configuration file that will define HTML snippets and the replacement text for them. Then, a list of file names will be iterated over, checking the HTML and if any corresponding HTML is found, will replace the HTML as specified in the config file =head1 DESCRIPTION =head2 Configuration File The config file takes two tokes, which should be on lines by themselves, C<[old]> and C<[new]>. After the C<[new]> token, add the text that you wish to replace the HTML with. After the C<[old]> token, add the HTML that the program must search for and replace. An example configuration file named I: [old] [new]

This program parses the HTML into tokens, so whitespace is not important with tags. To use the above config file with a backup directory named I: htmlreplace --config copyright.cfg --backup old *.html Command line options may also be shortened to the smallest number of letters necessary to distinguish them from other options. Thus, the above can be written as follows: htmlreplace -c copyright.cfg -b old *.html =head2 Attribute handling Attributes are the name/value pairs associated with HTML start tags. For example, the following tag has an attribute name of I with a value of I.

Many tags will have multiple attributes. By default, attributes of start tags do not need to appear in the same order in the sample HTML and the HTML being examined. The following two C tags are equivalent: If attribute order is important, you may use the I<--ordered> switch (I<-o>). htmlreplace --ordered --config copyright.cgi --backup old *.html If you wish to ignore attributes and simply ensure that the order of the tags is correct, use the I<--ignore> (I<-i>) switch. htmlreplace --ordered --ignore --config copyright.cgi --backup old *.html =head2 Matching Text By default, C only matches the structure of the document, not the text. To match text, use the I<--text> (I<-t>) switch. This will ensure an exact match of the text (but skips any text that is pure whitespace). If the text might be a little off, such as unusual capitalization, extra white space, etc., you can use the I<--fuzzy> (I<-f>) switch instead. This will match text if the source and target texts match after all "non-word" (C) characters are removed and all letters have been lower-cased. =head1 COPYRIGHT Copyright (c) 2001 Curtis "Ovid" Poe. All rights reserved. This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself.