#!/usr/bin/perl
use strict;
use warnings;
my $tag;
my $output;
my $fh;
my $deletestrings;
while (<DATA>) {
s/\s*<\s*(\/?)\s*(\w+)\s*>\s*/$1?"\n":"\n\{$2\}\n"/ge;
chomp;
s/[\cA-\cZ]//g; # To remove control characters
s/^[\\|<]$//g; # To delete the character like \ and < at the e
+nd of the line
s/[\\|<]$//; # To delete the character like \ and < at the beg
+ining of the line
s/^\s+//g; # To remove multiple spaces at the begining of the
+ line
s/\s+$//g; # To remove spaces at the end of the line
if(/^{(.*)}$/) { # match {METATAG} line
$fh = output($output, $tag, $fh);
$output = "";
$tag = $1;
} else { # not a {TAG} line
next unless($tag);
next if(/^\s*$/);
s/\\//g;
$output .= ($output) ? " $_" : "<$tag>$_";
}
}
$fh = output($output, $tag, $fh);
if($fh) {
print $fh "</ROOT>\n";
close($fh);
}
exit(0);
# Subroutine to open the file with the filename as {SOURCETAG}
# Subroutine to open the file with the filename as {SOURCETAG}
sub output {
my ($output, $tag, $fh) = @_;
if($output) {
if($output =~ m/<SOURCE>(.*)/) {
if($fh) {
print $fh "</ROOT>\n";
close($fh);
}
open($fh, '>', "$1.xml") or die "$1.xml: $!";
print $fh "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<RO
+OT>\n";
}
print $fh "$output</$tag>\n";
}
return($fh);
}
__DATA__
{SOURCE}
0904230634
{DATE}
090424
{EDITION}
1
{HEADLINE}
heredero del famoso deportista mexicano, lucha por enaltecer la vida y
+ obra
del autor de sus dM-mas
{SOURCE1}
Por Gisela Orozco 312.527.8461/ Chicago\
<byline> Por Gisela Orozco< ttl>312.527.8461/ Chicago</ttl>
When executed, it creates a file called 0904230634.xml in the current directory.
The output is
<?xml version="1.0" encoding="UTF-8"?>
<ROOT>
<SOURCE>0904230634</SOURCE>
<DATE>090424</DATE>
<EDITION>1</EDITION>
<HEADLINE>heredero del famoso deportista mexicano, lucha por enaltecer
+ la vida y obra del autor de sus dM-mas</HEADLINE>
<SOURCE1>Por Gisela Orozco 312.527.8461/ Chicago {byline}Por Gisela Or
+ozco{ttl}312.527.8461/ Chicago</SOURCE1>
</ROOT>
So
Instead of
<?xml version="1.0" encoding="UTF-8"?>
<ROOT>
<SOURCE>0904230634</SOURCE>
<DATE>090424</DATE>
<EDITION>1</EDITION>
<HEADLINE>heredero del famoso deportista mexicano, lucha por enaltecer
+ la vida y obra del autor de sus dM-mas</HEADLINE>
<SOURCE1>Por Gisela Orozco 312.527.8461/ Chicago</SOURCE1>
<byline>Por Gisela Orozco</byline>
<ttl>312.527.8461/ Chicago</ttl>
</ROOT>
So please tell me why the string is replaced only with {} and further it is not converted to < as for other tags |