#!/usr/bin/perl -w

# xml_spellcheck - spell check the text content of XML files.
#
# For each file given on the command line:
#   1. the document is parsed with XML::Twig,
#   2. every selected text node is wrapped in a temporary '#SC' element and
#      written to a temp file as one "id:text" line (newlines in the text are
#      encoded as the literal token <\n>),
#   3. the spell checker command is run on the temp file,
#   4. the (possibly corrected) lines are read back and stored in the
#      matching text nodes, and the temporary wrappers are erased,
#   5. the original file is renamed with the backup extension and the
#      updated document is written under the original name.

use strict;

use XML::Twig;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw{tempfile};

my $DEFAULT_SC  = 'aspell -c';
my $DEFAULT_PP  = 'indented';
my $DEFAULT_EXT = '.bak';

my $VERSION = "0.01";

my ( $spellchecker, $ext, $attributes, $exclude_elements, $include_elements,
     $pretty_print, $version, $help, $man );

GetOptions(
    'spellchecker=s'     => \$spellchecker,
    'backup-extension=s' => \$ext,
    'attributes'         => \$attributes,
    'exclude_elements=s' => \$exclude_elements,
    'include_elements=s' => \$include_elements,
    'pretty_print:s'     => \$pretty_print,
    'version'            => \$version,
    'help'               => \$help,
    'man'                => \$man,
) or pod2usage( -verbose => 1, -exitval => -1 );

pod2usage( -verbose => 1, -exitval => 0 ) if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;

if ($version) {
    print "$0 version $VERSION\n";
    exit;
}

# option processing
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

if ( $exclude_elements && $include_elements ) {
    die "cannot use both --exclude_elements and --include_elements\n";
}

# --pretty_print takes an optional value: when the option is given without
# one, $pretty_print is defined but false, and the default style is used
if ( defined $pretty_print and !$pretty_print ) {
    $pretty_print = $DEFAULT_PP;
}

my %twig_options;

# the element lists are whitespace separated; each listed element gets a
# start tag handler that marks it with a private '#exclude' or '#include'
# attribute, which the main loop looks up with inherit_att
if ($exclude_elements) {
    my @exclude_elts = split /\s+/, $exclude_elements;
    my %start_tag_handlers = map { $_ => \&exclude_elt } @exclude_elts;
    $twig_options{start_tag_handlers} = \%start_tag_handlers;
}

if ($include_elements) {
    my @include_elts = split /\s+/, $include_elements;
    my %start_tag_handlers = map { $_ => \&include_elt } @include_elts;
    $twig_options{start_tag_handlers} = \%start_tag_handlers;
}

$twig_options{pretty_print} = $pretty_print if ($pretty_print);

foreach my $file (@ARGV) {
    my $id     = 0;
    my $id2elt = {};    # id => '#SC' wrapper element still to be processed

    # UNLINK => 1 asks File::Temp to remove the temp file when the program
    # exits, including when it dies part way through
    my ( $tmp_fh, $tmp_file ) =
        tempfile( "xml_spellcheck_XXXX", SUFFIX => '.txt', UNLINK => 1 );

    my $t = XML::Twig->new( keep_encoding => 1, %twig_options );
    $t->parsefile($file);

    foreach my $elt ( $t->descendants('#TEXT') ) {
        if (   ( !$include_elements and !$exclude_elements )
            or ( $include_elements and $elt->inherit_att('#include') )
            or ( $exclude_elements and !$elt->inherit_att('#exclude') ) )
        {
            $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh );
        }
    }

    close $tmp_fh or die "cannot write temp file $tmp_file: $!";

    # $spellchecker is a user supplied command line (command plus options),
    # so it is deliberately passed to the shell as a single string
    system("$spellchecker $tmp_file") == 0
        or die "$spellchecker $tmp_file failed: $?";

    open( my $checked_fh, '<', $tmp_file )
        or die "cannot open temp file $tmp_file: $!";

    while ( my $line = <$checked_fh> ) {
        chomp $line;
        my ( $line_id, $text ) = split /:/, $line, 2;

        # a line that is not "id:text" with a pending id (blank line,
        # duplicated or mangled line) is reported and skipped, so the rest
        # of the corrections are still applied
        my $wrap = ( defined $line_id and defined $text )
                 ? $id2elt->{$line_id}
                 : undef;
        unless ($wrap) {
            warn "skipping unrecognized line $. in $tmp_file\n";
            next;
        }
        delete $id2elt->{$line_id};

        $text =~ s{<\\n>}{\n}g;    # decode the newlines encoded by process_text

        my $text_elt = $wrap->first_child or die "internal error 100\n";
        if ( $text_elt->gi eq '#PCDATA' ) {
            $text_elt->set_pcdata($text);
        }
        elsif ( $text_elt->gi eq '#CDATA' ) {
            $text_elt->set_cdata($text);
        }
        else {
            die "internal error 101\n";
        }

        $wrap->erase;
    }
    close $checked_fh;

    # wrappers whose line disappeared from the temp file keep their original
    # text; erase them too so no temporary element is left in the document
    $_->erase foreach values %$id2elt;

    rename( $file, "$file$ext" )
        or die "cannot save backup file $file$ext: $!";

    open( my $out_fh, '>', $file )
        or die "cannot save spell checked file $file: $!";
    $t->print($out_fh);
    close $out_fh or die "cannot save spell checked file $file: $!";
}

# start tag handler: marks the current element ($_) as included
sub include_elt {
    $_->set_att( '#include' => 1 );
}

# start tag handler: marks the current element ($_) as excluded
sub exclude_elt {
    $_->set_att( '#exclude' => 1 );
}

# Wraps the text element $elt in a temporary '#SC' element, records the
# wrapper in $id2elt under $id and prints the "id:text" line for that text
# on $tmp_fh. Newlines in the text are encoded as <\n> so that each text
# occupies exactly one line of the temp file.
# $t (the twig) is accepted but not used.
sub process_text {
    my ( $t, $elt, $id, $id2elt, $tmp_fh ) = @_;

    my $wrap = $elt->wrap_in('#SC');
    $id2elt->{$id} = $wrap;

    my $text = $elt->text;
    $text =~ s{\n}{<\\n>}g;
    print {$tmp_fh} "$id:$text\n";
}

__END__

=head1 NAME

xml_spellcheck

=head1 SYNOPSIS

  xml_spellcheck [options] <files>

=head1 DESCRIPTION

xml_spellcheck lets you spell check the content of an XML file.
It extracts the text (the content of elements and optionally of
attributes), calls a spell checker on it and then recreates the
XML document.

=head1 OPTIONS

Note that all options can be abbreviated to the first letter

=over 4

=item --conf <configuration_file>

Gets the options from a configuration file. NOT IMPLEMENTED YET.

=item --spellchecker <spellchecker>

The command to use for spell checking, including any option

By default C<aspell -c> is used

=item --backup-extension <extension>

By default the original file is saved with a C<.bak> extension. This
option changes the extension

=item --attributes

Spell check attribute content. By default attribute values are NOT
spell checked. NOT YET IMPLEMENTED

=item --exclude_elements <list_of_excluded_elements>

A list of elements that should not be spell checked

=item --include_elements <list_of_included_elements>

A list of elements that should be spell checked (by default all elements
are spell checked).

C<--exclude_elements> and C<--include_elements> are mutually exclusive

=item --pretty_print <optional_pretty_print_style>

A pretty print style for the document, as defined in XML::Twig. If the
option is provided without a value then the C<indented> style is used

=item --version

Display the tool version and exit

=item --help

Display help message and exit

=item --man

Display longer help message and exit

=back

=head1 EXAMPLES

=head1 BUGS

=head1 TODO

=over 4

=item --conf option

=item --attribute option

=back

=head1 PRE-REQUISITE

XML::Twig, Getopt::Long, Pod::Usage, File::Temp

XML::Twig requires XML::Parser.

=head1 SEE ALSO

XML::Twig

=head1 COPYRIGHT AND DISCLAIMER

This program is Copyright 2003 by Michel Rodriguez

This program is free software; you can redistribute it and/or modify
it under the terms of the Perl Artistic License or the GNU General
Public License as published by the Free Software Foundation either
version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

If you do not have a copy of the GNU General Public License write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.

=head1 AUTHOR

Michel Rodriguez <mirod@xmltwig.com>

xml_spellcheck is available at http://www.xmltwig.com/xmltwig/

In reply to xml_spellcheck by mirod

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.