comment on


#!/usr/bin/perl
#--------------
#txtattack.pl
#
#this is a script which takes text input by default and outputs DocBook XML by default, having guessed at the semantic structure of the text.
#At the moment it's arranged in such a way that allows for expansion, including the development of a module based on this as a template
use strict;
use warnings;
# Package-wide state shared by the format closures and liberate().
# These are package variables (not lexicals) because the closures and the
# named subs further down the file all refer to them by name.

# Code references selected according to $informat / $outformat.
our ($articlename, $headertest, $nextline);
our ($writestart, $writetitle, $writeelement, $writeheader, $writeend);
our $lnapply;

# Configuration: input format, output format, and the blank-line count at
# which an accumulated paragraph is emitted.
our ($informat, $outformat, $lineallowance);

# Parser working state.
our ($val, $marker, $line, $isheader, $string, $paranumber);

$informat      = "text";
$outformat     = "DocBook";
$lineallowance = 0;

#here should go the code for overriding the defaults
#
#but in the meantime i'll happily setup blind defaults and do the modularity bit later, as i've separated it all cool like
#good for testing ;-)

# Input handlers for plain-text sources. All of them read from the package
# filehandle SOURCE, which liberate() opens before calling them.
if ($informat eq "text") {

    # Returns the first line of SOURCE, without its trailing newline, to be
    # used as the article title. Returns '' when SOURCE is already at EOF
    # (e.g. an empty input file) instead of chomping undef.
    $articlename = sub {
        my $title = <SOURCE>;
        return '' unless defined $title;
        chomp($title);
        return $title;
    };

    # Returns the next raw line of SOURCE (newline included), or undef at EOF.
    $nextline = sub {
        return <SOURCE>;
    };

    # Sets $isheader when the line under test is blank and more blank lines
    # than $lineallowance have already been counted in $marker.
    # The line may be passed as the first argument; when it is not, the
    # global $string is inspected, as before.
    $headertest = sub {
        my $candidate = defined $_[0] ? $_[0] : $string;
        if (defined $candidate
            and $candidate eq "\n"
            and $marker > $lineallowance) {
            $isheader = 1;
        }
    };
}

# Output handlers that print DocBook XML to the currently selected handle.
if ($outformat eq "DocBook") {

    # Replaces the characters that are markup in XML character data
    # (&, <, >) with their predefined entities. '&' must be handled first so
    # the entities produced by the other two substitutions are not re-escaped.
    # An undefined value is treated as the empty string.
    my $xmlescape = sub {
        my ($text) = @_;
        return '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
    };

    # Prints the XML declaration and the DocBook DOCTYPE. The DOCTYPE is a
    # single literal: a line break inside the public or system identifier
    # would change the identifier itself.
    $writestart = sub {
        print '<?xml version="1.0" encoding="UTF-8"?>';
        print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">';
        print "\n\n";
    };

    # Opens the <article> element and prints its <title>, obtained from the
    # input handler $articlename.
    $writetitle = sub {
        print "<article>\n <title>";
        print $xmlescape->($articlename->());
        print "</title>";
    };

    # Prints the accumulated global $line as a <para> element.
    $writeelement = sub {
        my $text = $xmlescape->($line);
        print "\n<para>\n$text\n</para>\n";
    };

    # Prints the accumulated global $line as a <title> element.
    # NOTE(review): the chapter wrapper below was already disabled, so this
    # title is written directly inside <article> - confirm intended structure.
    $writeheader = sub {
#         print "</chapter>";
#         print "\n<chapter id=\"$element\">\n";
        my $text = $xmlescape->($line);
        print "<title>$text</title>\n";
    };

    # Closes the <article> element.
    $writeend = sub {
        print "\n</article>";
    };
}
# Hands the accumulated $line to the writer that matches $isheader:
# 0 selects $writeelement, 1 selects $writeheader. Any other value writes
# nothing. A trace message is issued before the writer runs.
$lnapply = sub {
    my ($note, $writer) =
          $isheader == 0 ? ("break isn't header", $writeelement)
        : $isheader == 1 ? ("break is header",    $writeheader)
        :                  ();
    return unless $writer;

    track($note);
    return $writer->($line);
};

#---------------------
#sort out all function aliases before here
#--------------------
#and here we have the actual algorithm

# liberate($path)
#
# Converts the text file at $path into the configured output format, using
# the handler closures set up earlier in the file, and prints the result.
#
# Lines are accumulated into $line until a blank line is seen; the
# accumulated text is then passed to $lnapply, which writes it as a
# paragraph or a header depending on $isheader. Text still pending when the
# input ends is written as well, so a file without a trailing blank line
# does not lose its last paragraph.
#
# Returns the OS error string if $path cannot be opened; returns nothing
# otherwise (including when no path was given and only usage was printed).
sub liberate {
    my ($path) = @_;

    unless (defined $path) {
        # STDOUT is normally redirected into the output document, so the
        # usage text goes to STDERR.
        print STDERR "usage: semget [file] > [outfile]\n";
        return;
    }

    # SOURCE remains a package filehandle because the input closures
    # ($articlename, $nextline) read from it by name.
    unless (open(SOURCE, '<', $path)) {
        my $error = "$!";
        warn "cannot open $path: $error\n";
        return $error;
    }

    $marker     = 0;
    $isheader   = 0;
    $paranumber = 0;
    $line       = undef;    # discard text left over from an earlier call

    $writestart->();
    $writetitle->();

    while (defined($string = $nextline->())) {
        $headertest->($string);
        if ($string eq "\n") {
            track("found break");
            if ($marker == $lineallowance) {
                track("hit line allowance");
                # Only emit when text has actually been collected; a blank
                # line before any text would otherwise produce an empty
                # element.
                if (defined $line) {
                    $paranumber++;
                    $lnapply->();
                }
                $isheader = 0;
                $line     = undef;
            }
            else {
                track("redundant break");
            }
            $marker++;
        }
        else {
            chomp($string);
            # Consecutive text lines are joined with a single space.
            $line = defined $line ? "${line} $string" : $string;
            $marker = 0;
        }
    }

    # Flush text that was not followed by a blank line before EOF.
    if (defined $line) {
        $paranumber++;
        $lnapply->();
        $isheader = 0;
        $line     = undef;
    }

    $writeend->();
    close(SOURCE);
    return;
}

# Entry point: convert the file named by the first command-line argument.
# The value returned by liberate() is not inspected here.
liberate($ARGV[0]);

# Debug trace: warns with the given message and the current $paranumber.
# The message has no trailing newline, so warn appends the script location.
sub track {
    my ($message) = @_;
    warn "\ntrack:$message at $paranumber";
}
[download]

In reply to txt2docbook 2 by Maze

Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!

Titles consisting of a single word are discouraged, and in most cases are disallowed outright.

Read Where should I post X? if you're not absolutely sure you're posting in the right place.

Please read these before you post! —

Posts may use any of the Perl Monks Approved HTML tags:

a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr

You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)

	For:		Use:
	&		&amp;
	<		&lt;
	>		&gt;
	[		&#91;
	]		&#93;

Link using PerlMonks shortcuts! What shortcuts can I use for linking?

See Writeup Formatting Tips and other pages linked from there for more info.