XML2PDF

   1: #!/usr/bin/perl -w
   2: ##############################################################################
   3: #
   4: # XML2PDF
   5: #
   6: # Author : HolyGrail
   7: # Version: 0.7
   8: #
   9: # Usage: xml2pdf {<xmlfile>}
  10: #
  11: # This conversion program translates XML into PDF. You can specify more than
  12: # one XML file on the command line and they will all be processed. Each PDF
  13: # file name replaces ".xml" with ".pdf", or appends ".pdf" if the source file
  14: # does not end with ".xml". It does no XML validation at all. The XML file
  15: # should conform to the following DTD:
  16: #
  17: # <!ELEMENT document (section1)+>
  18: # <!ATTLIST document
  19: #     title CDATA #REQUIRED
  20: # >
  21: # 
  22: # <!ELEMENT section1 (section2 | p | img)+>
  23: # <!ATTLIST section1
  24: #     title CDATA #REQUIRED
  25: # >
  26: #
  27: # <!ELEMENT section2 (p | img)+>
  28: # <!ATTLIST section2
  29: #     title CDATA #REQUIRED
  30: # >
  31: #
  32: # <!ELEMENT p (#PCDATA | img)*>
  33: #
  34: # <!ELEMENT img EMPTY>
  35: # <!ATTLIST img
  36: #     src CDATA #REQUIRED
  37: # >
  38: # 
  39: # Known issues:
  40: # * 'img'-elements are processed after a complete 'p' element is processed, 
  41: #   so the images that are meant to come 'in' the text, come 'after' the 
  42: #   text. The workaround for this should be to split the text up in multiple 
  43: #   p-elements and put the img-element between two p-elements.
  44: # * Images that are wider than the page fall off the page on the right side
  45: #
  46: # Bugs:
  47: # * Lots of them, no doubt
  48: #
  49: # Changes:
  50: #
  51: # 0.7: The 'src' of an 'img' tag can now also be a URL, not just a local file
  52: # 0.6: It now also accepts images (JPG, GIF and PNG), changed the DTD 
  53: #      accordingly.
  54: # 0.5: Changed the program call, so you can process multiple files in one call
  55: # 0.4: It now can handle sections that don't fit on one page. It starts new 
  56: #      pages if needed.
  57: # 0.3: Adapted it to conform to OeufMayo's DTD
  58: # 0.2: Minor change (0.1 re-declared the global vars as 'my', thanks to davorg)
  59: # 0.1: First release
  60: #
  61: ##############################################################################
  62: use strict;
  63: use PDFLib;
  64: use XML::XPath;
  65: use LWP::Simple;
  66: use vars qw ( $TOPMARGIN 
  67:               $BOTTOMMARGIN 
  68:               $LEFTMARGIN 
  69:               $RIGHTMARGIN 
  70:               $PAGEHEIGHTPOINTS 
  71:               $PAGEWIDTHPOINTS
  72: 	      $MAXYPOS
  73: 	      %FONTDEFINITION
  74: 	      %IMAGES
  75:              );
  76: # Define the fonts for each element
  77: %FONTDEFINITION = ( section1  => [face => "Helvetica-Bold", size => "16.0"],
  78:                     section2  => [face => "Helvetica-Bold", size => "14.0"],
  79:                     p         => [face => "Helvetica", size => "12.0"]
  80:                     );
  81: 
  82: # Pagesetup
  83: my $pagename      = "a4";
  84: $PAGEHEIGHTPOINTS = 842; # See PDFLib's documentation for 
  85: $PAGEWIDTHPOINTS  = 595; # the size of your page
  86: ($TOPMARGIN, $BOTTOMMARGIN, $LEFTMARGIN, $RIGHTMARGIN) = (10, 10, 15, 15);
  87: 
  88: foreach (@ARGV)
  89: {
  90:   # Documentsetup
  91:   my $xmlfile = $_;
  92:   my $xml = XML::XPath->new(filename => $xmlfile);
  93:   s/\.xml$/\.pdf/i if /\.xml$/i;
  94:   my $pdffile =  /\.pdf$/i ? $_ : "$_.pdf";
  95:   my $pdf = PDFLib->new( filename => $pdffile,
  96:                          papersize=> $pagename,
  97:                          creator  => "XML2PDF",
  98:                          title    => (($xml->findnodes('document'))[0]->getAttribute('name'))
  99:                         );
 100:   print "Converting $xmlfile to $pdffile \n";
 101: 
 102:   # pre-process the images:
 103:   foreach my $img ($xml->findnodes('//img'))
 104:   {
 105:     my $filetype = 'jpeg' if $img->getAttribute('src')=~ /[jpg|jpeg]$/i;
 106:     $filetype = 'gif' if $img->getAttribute('src')=~ /gif$/i;
 107:     $filetype = 'png' if $img->getAttribute('src')=~ /png$/i;
 108: 
 109:     my $filename = $img->getAttribute('src');
 110: 
 111:     if ($filename =~ /^http:\/\//i)
 112:     {
 113:       $filename =~ /.*\/(.*\..*)/;
 114:       print $1."\n";
 115:       getstore($img->getAttribute('src'), $1);
 116:       $filename=$1;
 117:     }  
 118: 
 119:     $IMAGES{$img->getAttribute('src')} =  
 120:        $pdf->load_image(filetype => $filetype || 'gif',
 121:                         filename => $filename );
 122:   }
 123: 
 124:   # Process the document
 125:   foreach my $section ($xml->findnodes('document/section1'))
 126:   {
 127:     #start every section on a new page
 128:     $pdf->start_page();
 129:     
 130:     # Calculate the starting Y-axis value
 131:     $MAXYPOS = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;
 132:     
 133:     # Print the header
 134:     printpdf($pdf, $FONTDEFINITION{section1}, 
 135:              "\n".$section->getAttribute('title')."\n\n", $MAXYPOS);
 136:     
 137:     processsection($pdf, $section);
 138:   }
 139: }
 140: 
 141: sub printpdf
 142: {
 143:     my ($pdf, $font, $what, $ypos) = @_;
 144:     $pdf->set_font( @{$font} );
 145:     my $charsnotprinted = length($what);
 146:     
 147:     while ($charsnotprinted)
 148:     {
 149:       $charsnotprinted = $pdf->print_boxed($what,
 150:                           mode => "left",
 151:                           x    => $LEFTMARGIN,
 152:                           y    => $BOTTOMMARGIN,
 153:                           w    => $PAGEWIDTHPOINTS - $RIGHTMARGIN - 
 154:                                    $LEFTMARGIN,
 155:                           h    => $ypos || $pdf->get_value("texty")
 156:                        );
 157:        $what = substr($what, (length($what) - $charsnotprinted), length($what));
 158:        $pdf->start_page() if $charsnotprinted;
 159:        $ypos = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;
 160:      }           
 161: }
 162: 
 163: sub processsection
 164: {
 165:   my ($pdf, $section) = @_;
 166:   foreach my $node ($section->findnodes('*'))
 167:   {
 168:     my $text;
 169:     if ($node->getName() eq "section2")
 170:     {
 171:       printpdf($pdf, $FONTDEFINITION{$node->getName()}, 
 172:                "\n".$node->getAttribute('title')."\n");
 173:       processsection($pdf, $node); #recurse!
 174:     }
 175:     if ($node->getName() eq "p")
 176:     {
 177:       printpdf($pdf, $FONTDEFINITION{$node->getName()}, 
 178:                "\n".$node->string_value()."\n");
 179:       processsection($pdf, $node); #recurse!
 180:     }
 181:     if ($node->getName() eq "img")
 182:     {
 183:       if ($pdf->get_value('texty')-$IMAGES{$node->getAttribute('src')}->height() < 0)
 184:       {
 185:         $pdf->start_page();
 186:         $pdf->set_text_pos($pdf->get_value('textx'), $PAGEHEIGHTPOINTS - $TOPMARGIN);
 187:       }
 188:       $pdf->add_image(img => $IMAGES{$node->getAttribute('src')},
 189:                       x   => $LEFTMARGIN,
 190: 		      y   => $pdf->get_value('texty')-$IMAGES{$node->getAttribute('src')}->height());
 191:       $pdf->set_text_pos($pdf->get_value('textx'), 
 192:                          $pdf->get_value('texty')-$IMAGES{$node->getAttribute('src')}->height());
 193:     }
 194:     
 195:   }
 196: 
 197: }
Back to Craft