Category: Text Processor
Author/Contact Info: Chris Monahan aka Maze ForeverWatcher@googlemail.com
Description: This guesses the semantic structure of a text document, stripping the line endings and guessing where the paragraph breaks and headers should be. Good for processing Gutenberg 'plain vanilla ASCII' texts. This is version 1 of txt2docbook, the original and perhaps the simplest to understand.
#!/usr/bin/perl
#--------------
#convert a text file to docbook XML, the quickhack version
use strict;
use warnings;
# txt2dcb: convert a plain-text file to a DocBook <article> on STDOUT.
#
# Heuristic used to guess the structure:
#   * consecutive non-blank lines are joined (with a single space) into
#     one block of text;
#   * one blank line ends the current block;
#   * the very first block, and any block preceded by two or more blank
#     lines, is emitted as a <title>; every other block is a <para>.

# Escape the characters that are markup-significant in XML character data.
sub escape_xml {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first so later entities are not re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Render one accumulated block as a <title> or <para> line.
# Returns the empty string when there is no text to emit.
sub format_block {
    my ($text, $is_header) = @_;
    return '' unless defined $text && length $text;
    my $tag = $is_header ? 'title' : 'para';
    return "\t<$tag>" . escape_xml($text) . "</$tag>\n";
}

# Read text from the filehandle $in and write the DocBook article to the
# filehandle $out.
sub convert_text {
    my ($in, $out) = @_;

    # The prolog must be emitted as one unbroken line: a line break inside
    # the quoted public identifier or the system URL would corrupt them.
    print {$out} '<?xml version="1.0" encoding="UTF-8"?>'
        . '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" '
        . '"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">';
    print {$out} "\n<article>\n";

    my $block;              # text collected for the block being read
    my $is_header = 1;      # the first block of the document is its title
    my $blank_run = 0;      # 1 once at least one blank line has just been seen

    while (defined(my $raw = <$in>)) {
        # Drops "\n", "\r\n" and trailing blanks, so CRLF files and
        # whitespace-only lines are recognised as blank lines too.
        $raw =~ s/\s+\z//;

        if (length $raw) {
            # Join wrapped lines with a space so the last word of one line
            # does not fuse with the first word of the next.
            $block = defined $block ? "$block $raw" : $raw;
            $blank_run = 0;
            next;
        }

        if ($blank_run == 0) {
            # First blank line after text: the block is complete.
            # (Blank lines at the very start of the file have no block to
            # flush and must not use up the "first block is a title" state.)
            if (defined $block) {
                print {$out} format_block($block, $is_header);
                $block     = undef;
                $is_header = 0;
            }
            $blank_run = 1;
        }
        else {
            # Two or more blank lines in a row: the next block is a header.
            $is_header = 1;
        }
    }

    # Flush the last block when the file does not end with a blank line.
    print {$out} format_block($block, $is_header);
    print {$out} "\n</article>";
    return;
}

# Command-line entry point: expects the path of the text file as the
# first argument and writes the converted document to STDOUT.
sub main {
    my @args = @_;
    die "usage: txt2dcb [text file]\n" unless defined $args[0];

    my $path = $args[0];
    # Three-argument open: the file name can never be parsed as a mode or pipe.
    open my $source, '<', $path or die "cannot open '$path': $!\n";
    convert_text($source, \*STDOUT);
    close $source or die "cannot close '$path': $!\n";
    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. by a test) without triggering a conversion.
main(@ARGV) unless caller;

1;