#!/bin/perl -w

# Strips an XML file of all of its tags and outputs the remaining text.
# Each piece of tag content is printed on a separate line.
# The output is in Latin-1; remove the "Latin => 1" option to get UTF-8 output.
# Optionally a tag can be passed with -t: only the text found within that tag
# (including text of elements nested inside it) will be output.
# The script also works with input from STDIN when no file is given.
#
# usage: strip_markup [-t <tag>] [<file>]

use strict;
use XML::TiePYX;
use Getopt::Std;

my $usage = "usage: $0 [-t <tag>] [<file>]\n";

# Reject unknown switches instead of silently carrying on.
my %opts;
getopts('t:', \%opts) or die $usage;
die $usage unless @ARGV <= 1;

# When no (non-empty) tag is requested, every text event is printed.
my $tag        = $opts{t};
my $filter_tag = defined $tag && length $tag;

# PYX puts the element name alone on a "(name" / ")name" line, so the patterns
# are anchored at both ends: "-t p" must not match "(para".  \Q...\E keeps
# characters such as "." in the tag name from acting as regex metacharacters.
my ($open_re, $close_re);
if ($filter_tag) {
    $open_re  = qr/^\(\Q$tag\E$/;
    $close_re = qr/^\)\Q$tag\E$/;
}

# Read from the named file, or from STDIN when none is given.  The test is on
# definedness/length rather than truth so that a file named "0" is honoured.
my $source = ( defined $ARGV[0] && length $ARGV[0] ) ? $ARGV[0] : \*STDIN;

tie *XML, 'XML::TiePYX', $source, Condense => 0, Latin => 1;

# Nesting depth inside the requested tag; a counter rather than a flag so that
# an inner element with the same name does not end the region early.
my $depth = 0;

while (<XML>) {
    if ($filter_tag) {
        if    (m/$open_re/)  { $depth++; }                  # opening $tag
        elsif (m/$close_re/) { $depth-- if $depth > 0; }    # closing $tag
        next unless $depth;                                 # outside $tag
    }

    next unless s/^-//;     # keep only text events, dropping the PYX "-" marker
    next if m/^\\n$/;       # skip text that is just an escaped line return
    next if m/^\s*$/;       # skip empty / whitespace-only text
    print;                  # output the rest (text in $tag)
}