#!/usr/bin/perl
#--------------
# txtattack.pl
#
# This is a script which takes text input by default and outputs DocBook XML
# by default, having guessed at the semantic structure of the text.
# It is arranged in a way that allows for expansion, including the
# development of a module based on this as a template: every format-specific
# step lives in a code reference that can be swapped out.
#
# The structure guess, as implemented below:
#   * the first line of the input is the article title;
#   * consecutive non-blank lines are joined with single spaces into a block;
#   * a block ends at the ($lineallowance + 1)th consecutive blank line;
#   * a block that was preceded by at least ($lineallowance + 2) consecutive
#     blank lines is written as a header, any other block as a paragraph.
#
# Usage: txtattack.pl FILE > OUTFILE

use strict;
use warnings;

# Package globals shared between the pluggable closures and liberate().
# ($val is kept only for compatibility; nothing below reads the global.)
our (
    $articlename, $headertest,  $nextline,
    $lnapply,     $writestart,  $writetitle,
    $writeelement, $writeheader, $writeend,
    $lineallowance, $informat,  $outformat,
    $val, $marker, $line, $isheader, $string, $paranumber,
);

# Input handle, opened by liberate() and read by the "text" closures.
my $source_fh;

$informat      = "text";
$outformat     = "DocBook";
$lineallowance = 0;

# Here should go the code for overriding the defaults.
#
# In the meantime blind defaults are set up; the modularity bit can be done
# later, as everything is already separated. Good for testing ;-)

if ($informat eq "text") {

    # Reads the first line of the input and returns it (without its newline)
    # as the article name; returns '' when the input is empty.
    $articlename = sub {
        my $val = <$source_fh>;
        return '' unless defined $val;
        chomp($val);
        return $val;
    };

    # Returns the next raw input line, or undef at end of input.
    $nextline = sub {
        return scalar <$source_fh>;
    };

    # Flags the block that follows as a header when the current line is
    # blank and more than $lineallowance blank lines were already counted.
    $headertest = sub {
        if ($string eq "\n" and $marker > $lineallowance) {
            $isheader = 1;
        }
    };
}

if ($outformat eq "DocBook") {

    # XML declaration and DocBook doctype.
    $writestart = sub {
        print '<?xml version="1.0" encoding="UTF-8"?>';
        print '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">';
        print "\n\n";
    };

    # Opens the article and writes its title (taken from the input).
    $writetitle = sub {
        print "<article>\n <title>";
        print escape_xml($articlename->());
        print "</title>";
    };

    # Writes the accumulated block in $line as a paragraph.
    $writeelement = sub {
        my $text = escape_xml($line);
        print "\n<para>\n$text\n</para>\n";
    };

    # Writes the accumulated block in $line as a title.
    $writeheader = sub {
        # print "</chapter>";
        # print "\n<chapter id=\"$element\">\n";
        my $text = escape_xml($line);
        print "<title>$text</title>\n";
    };

    # Closes the article.
    $writeend = sub {
        print "\n</article>";
    };
}

# Dispatches the accumulated block to the paragraph or header writer,
# depending on $isheader.
$lnapply = sub {
    if ($isheader == 0) {
        track("break isn't header");
        $writeelement->($line);
    }
    elsif ($isheader == 1) {
        track("break is header");
        $writeheader->($line);
    }
};

#---------------------
# sort out all function aliases before here
#--------------------
# and here we have the actual algorithm

# liberate($file)
#
# Converts the text file named by $file and prints the result on STDOUT.
#
# Returns 1 after a successful conversion, the OS error text if the file
# could not be opened (a warning is also issued), and nothing when no file
# name was given (a usage line is printed on STDERR).
sub liberate {
    my ($file) = @_;

    unless (defined $file) {
        # STDERR, because STDOUT is normally redirected into the output file.
        print STDERR "usage: semget [file] > [outfile]\n";
        return;
    }

    # Three-argument open on the file that was actually passed in.
    unless (open $source_fh, '<', $file) {
        my $error = "$!";
        warn "cannot open '$file': $error\n";
        return $error;
    }

    $marker     = 0;
    $isheader   = 0;
    $paranumber = 0;
    $line       = undef;    # no text left over from an earlier call

    $writestart->();
    $writetitle->();

    while (defined($string = $nextline->())) {
        $headertest->($string);
        if ($string eq "\n") {
            track("found break");
            if ($marker == $lineallowance) {
                track("hit line allowance");
                flush_block();
                $isheader = 0;
                $line     = undef;
            }
            track("redundant break");
            $marker++;
        }
        else {
            chomp($string);
            #track("found text");
            $line = defined $line ? "${line} $string" : $string;
            $marker = 0;
        }
        #print "$string";
    }

    # Input that does not end with a blank line still has a pending block.
    flush_block();
    $line = undef;

    $writeend->();

    close $source_fh or warn "cannot close '$file': $!\n";
    $source_fh = undef;

    return 1;
}

# Writes the pending block, if there is one, through $lnapply. Doing nothing
# for an empty block avoids an empty <para> when a blank line directly
# follows the title.
sub flush_block {
    return unless defined $line;
    $paranumber++;
    $lnapply->();
    return;
}

# Returns $text with the XML markup characters &, < and > replaced by their
# predefined entities ('' for undef), so the output stays well-formed.
sub escape_xml {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

liberate($ARGV[0]);

# Debug trace on STDERR: the message plus the current block counter.
sub track {
    warn "\ntrack:$_[0] at $paranumber";
}

In reply to txt2docbook 2 by Maze

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; this is "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.