comment on

#!/usr/bin/perl -w
use strict;

use XML::Twig;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw{tempfile};

# Values used when the corresponding option is absent from the command line.
my $DEFAULT_SC  = 'aspell -c';   # spell checker command, options included
my $DEFAULT_PP  = 'indented';    # pretty print style handed to XML::Twig
my $DEFAULT_EXT = '.bak';        # suffix appended to the backup copy

my $VERSION = "0.01";

# One lexical per command line option; each stays undef unless the user
# supplies the option.
my $spellchecker;
my $ext;
my $attributes;
my $exclude_elements;
my $include_elements;
my $pretty_print;
my $version;
my $help;
my $man;

# Getopt::Long specification => destination pairs.
my @option_spec=
  ( 'spellchecker=s'     => \$spellchecker,
    'backup-extension=s' => \$ext,
    'attributes'         => \$attributes,
    'exclude_elements=s' => \$exclude_elements,
    'include_elements=s' => \$include_elements,
    'pretty_print:s'     => \$pretty_print,
    'version'            => \$version,
    'help'               => \$help,
    'man'                => \$man,
  );

# A command line that cannot be parsed prints the usage and exits.
unless( GetOptions( @option_spec))
  { pod2usage( -verbose => 1, -exitval => -1); }

# Informational options: print the requested text and stop there.
if( $help)
  { pod2usage( -verbose => 1, -exitval => 0); }
if( $man)
  { pod2usage( -verbose => 2, -exitval => 0); }
if( $version)
  { print "$0 version $VERSION\n";
    exit;
  }
            
# option processing
# Anything not given on the command line falls back to the built-in default.
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

# NOTE: --attributes is accepted by GetOptions but nothing reads $attributes
# yet; attribute values are never spell checked.

# The two element filters cannot be combined.
if( $exclude_elements && $include_elements)
  { die "cannot use both --exclude_elements and --include_elements\n"; }

# --pretty_print takes an optional value: when it is given without one the
# variable is defined but false, and the default style is used.
if( defined $pretty_print and !$pretty_print)
  { $pretty_print= $DEFAULT_PP; }

# Options passed to XML::Twig->new for every file.
my %twig_options;

# The element lists are whitespace separated. split ' ' (unlike
# split /\s+/) drops leading whitespace, so no empty tag name can end up
# registered as a handler trigger.
if( $exclude_elements)
  { my @exclude_elts = split ' ', $exclude_elements;
    my %start_tag_handlers= map { $_ => \&exclude_elt } @exclude_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }
if( $include_elements)
  { my @include_elts = split ' ', $include_elements;
    my %start_tag_handlers= map { $_ => \&include_elt } @include_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

$twig_options{pretty_print}= $pretty_print if( $pretty_print);

# Spell check, in place, every file named on the command line:
#   1. parse the file and write each selected text node to a temp file,
#      one "id:text" line per node (newlines in the text are escaped);
#   2. run the spell checker on the temp file;
#   3. read the lines back and store the text in the matching nodes;
#   4. rename the original to "$file$ext" and write the updated document
#      under the original name.
# Dies on any I/O failure or when the spell checker exits with an error.
foreach my $file (@ARGV)
  { 
    my $id=0;
    my $id2elt={};           # id => '#SC' wrapper around the text node

    my( $tmp_fh, $tmp_file) = tempfile( "xml_spellcheck_XXXX", 
                                        SUFFIX => '.txt'
                                      );
    my $t= XML::Twig->new( keep_encoding =>1, %twig_options,);
    $t->parsefile( $file);

    # With no filter every text node is checked; otherwise the '#include'
    # and '#exclude' marks set by the start tag handlers decide.
    foreach my $elt ($t->descendants( '#TEXT'))
      {
        if(    (!$include_elements and !$exclude_elements)
            or ($include_elements and  $elt->inherit_att( '#include'))
            or ($exclude_elements and !$elt->inherit_att( '#exclude'))
          )
          { $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh)
          }
      }
    # buffered write errors only show up when the handle is closed
    close $tmp_fh or die "cannot write temp file $tmp_file: $!";

    # $spellchecker may carry its own options, hence the single string
    # form of system
    system( "$spellchecker $tmp_file") ==0
      or die "$spellchecker $tmp_file failed: $?";

    open( my $checked_fh, '<', $tmp_file)
      or die "cannot open temp file $tmp_file: $!";
    while( my $line= <$checked_fh>)
      { chomp $line;
        my( $line_id, $text)= split /:/, $line, 2;
        my $wrap= defined $line_id ? $id2elt->{$line_id} : undef;

        # the spell checker may have damaged the "id:" prefix, or added,
        # emptied or duplicated a line: leave such lines alone
        unless( $wrap and defined $text)
          { warn "skipping unrecognized line $. of $tmp_file\n";
            next;
          }

        $text=~ s{<\\n>}{\n}g;     # undo the newline escaping
        my $text_elt= $wrap->first_child or die "internal error 100\n";
        if( $text_elt->gi eq '#PCDATA')
          { $text_elt->set_pcdata( $text); }
        elsif( $text_elt->gi eq '#CDATA')
          { $text_elt->set_cdata( $text); }
        else 
          { die "internal error 101\n"; }
        $wrap->erase;
        delete $id2elt->{$line_id};    # each wrapper is handled only once
      }
    close $checked_fh;

    # wrappers whose line never came back: remove the wrapper, keep the
    # original text
    $_->erase foreach (values %$id2elt);

    unlink $tmp_file or warn "cannot remove temp file $tmp_file: $!\n";

    rename( $file, "$file$ext")
      or die "cannot save backup file $file$ext: $!";
    open( my $out_fh, '>', $file)
      or die "cannot save spell checked file $file: $!";
    $t->print( $out_fh);
    close $out_fh
      or die "cannot save spell checked file $file: $!";
  }     


# Start tag handler registered for each tag given to --include_elements.
# Sets the '#include' attribute on the element found in $_; the main loop
# looks that attribute up with inherit_att when selecting text nodes.
# Returns whatever set_att returns.
sub include_elt
  { my $elt= $_;
    return $elt->set_att( '#include', 1);
  }

# Start tag handler registered for each tag given to --exclude_elements.
# Sets the '#exclude' attribute on the element found in $_; the main loop
# looks that attribute up with inherit_att when selecting text nodes.
# Returns whatever set_att returns.
sub exclude_elt
  { my $elt= $_;
    return $elt->set_att( '#exclude', 1);
  }

# Prepare one text node for spell checking.
#   $t       the twig (not used here)
#   $elt     the text node
#   $id      number identifying the node in the temp file
#   $id2elt  hash ref, receives $id => wrapper element
#   $tmp_fh  handle the "id:text" line is written to
# The node is wrapped in a '#SC' element, which is what gets recorded in
# $id2elt. Newlines in the text are written as the 4 characters <\n> so
# that each node takes exactly one line. Returns the result of print.
sub process_text
  { my( $t, $elt, $id, $id2elt, $tmp_fh)= @_;

    my $wrapper= $elt->wrap_in( '#SC');
    $id2elt->{$id}= $wrapper;

    ( my $one_line= $elt->text)=~ s{\n}{<\\n>}g;
    print {$tmp_fh} join( ':', $id, $one_line) . "\n";
  }    

__END__

=head1 NAME

xml_spellcheck

=head1 SYNOPSIS

  xml_spellcheck [options] <files>

=head1 DESCRIPTION

xml_spellcheck lets you spell check the content of an XML file.
It extracts the text (the content of elements and optionally of
attributes), calls a spell checker on it and then recreates the
XML document.

=head1 OPTIONS

Note that all options can be abbreviated to the first letter

=over 4

=item --conf <configuration_file>

Gets the options from a configuration file. NOT IMPLEMENTED YET.

=item --spellchecker <spellchecker>

The command to use for spell checking, including any option

By default C<aspell -c> is used

=item --backup-extension <extension>

By default the original file is saved with a C<.bak> extension. This
option changes the extension

=item --attributes 

Spell check attribute content. By default attribute values are NOT
spell checked. NOT YET IMPLEMENTED

=item --exclude_elements <list_of_excluded_elements>

A whitespace-separated list of elements that should not be spell checked

=item --include_elements <list_of_included_elements>

A list of elements that should be spell checked (by default all elements
are spell checked). 

C<--exclude_elements> and C<--include_elements> are mutually exclusive

=item --pretty_print <optional_pretty_print_style>

A pretty print style for the document, as defined in XML::Twig. If
the option is provided without a value then the C<indented> style is
used

=item --version

Display the tool version and exit

=item --help

Display help message and exit

=item --man

Display longer help message and exit

=back

=head1 EXAMPLES

=head1 BUGS

=head1 TODO

=over 4

=item --conf option

=item --attributes option

=back

=head1 PRE-REQUISITE

XML::Twig, Getopt::Long, Pod::Usage, File::Temp
XML::Twig requires XML::Parser.

=head1 SEE ALSO

XML::Twig

=head1 COPYRIGHT AND DISCLAIMER

This program is Copyright 2003 by Michel Rodriguez

This program is free software; you can redistribute it and/or modify
it under the terms of the Perl Artistic License or the GNU General 
Public License as published by the Free Software Foundation either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MER-
CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
Public License for more details.

If you do not have a copy of the GNU General Public License write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.

=head1 AUTHOR 

Michel Rodriguez <mirod@xmltwig.com>

xml_spellcheck is available at http://www.xmltwig.com/xmltwig/
[download]

In reply to xml_spellcheck by mirod

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		`&amp;`
	<		`&lt;`
	>		`&gt;`
	[		`&#91;`
	]		`&#93;`

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.