Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl-Sensitive Sunglasses
 
PerlMonks  

XML2PDF

by holygrail (Scribe)
on Jul 02, 2001 at 17:12 UTC ( [id://93171]=perlcraft: print w/replies, xml ) Need Help??

   1: #!/usr/bin/perl -w
   2: ##############################################################################
   3: #
   4: # XML2PDF
   5: #
   6: # Author : HolyGrail
   7: # Version: 0.7
   8: #
   9: # Usage: xml2pdf {<xmlfile>}
  10: #
  11: # This conversion program translates XML into PDF. You can specify more than
  12: # one XML file on the command line and they will all be processed. Each PDF
  13: # file name replaces ".xml" with ".pdf", or adds ".pdf" if the source file
  14: # does not end with ".xml". It does no XML validation at all. The XML file
  15: # should conform to the following DTD:
  16: #
  17: # <!ELEMENT document (section1)+>
  18: # <!ATTLIST document
  19: #     title CDATA #REQUIRED
  20: # >
  21: # 
  22: # <!ELEMENT section1 (section2 | p | img)+>
  23: # <!ATTLIST section1
  24: #     title CDATA #REQUIRED
  25: # >
  26: #
  27: # <!ELEMENT section2 (p | img)+>
  28: # <!ATTLIST section2
  29: #     title CDATA #REQUIRED
  30: # >
  31: #
  32: # <!ELEMENT p (#PCDATA | img)*>
  33: #
  34: # <!ELEMENT img EMPTY>
  35: # <!ATTLIST img
  36: #     src CDATA #REQUIRED
  37: # >
  38: # 
  39: # Known issues:
  40: # * 'img'-elements are processed after a complete 'p' element is processed, 
  41: #   so the images that are meant to come 'in' the text, come 'after' the 
  42: #   text. The workaround for this should be to split the text up in multiple 
  43: #   p-elements and put the img-element between two p-elements.
  44: # * Images that are wider than the page fall off the page on the right side
  45: #
  46: # Bugs:
  47: # * Lots of them, no doubt
  48: #
  49: # Changes:
  50: #
  51: # 0.7: 'img' tags can now also be URLs and no longer just local files
  52: # 0.6: It now also accepts images (JPG, GIF and PNG), changed the DTD 
  53: #      accordingly.
  54: # 0.5: Changed the program call, so you can process multiple files in one call
  55: # 0.4: It now can handle sections that don't fit on one page. It starts new 
  56: #      pages if needed.
  57: # 0.3: Adapted it to conform to OeufMayo's DTD
  58: # 0.2: Minor change (0.1 re-declared the global vars as 'my', thanks to davorg)
  59: # 0.1: First release
  60: #
  61: ##############################################################################
  62: use strict;
  63: use PDFLib;
  64: use XML::XPath;
  65: use LWP::Simple;
  66: # Package globals shared by the main loop and the subs below
  67: use vars qw( $TOPMARGIN $BOTTOMMARGIN $LEFTMARGIN $RIGHTMARGIN
  68:              $PAGEHEIGHTPOINTS $PAGEWIDTHPOINTS $MAXYPOS
  69:              %FONTDEFINITION %IMAGES );
  70: 
  71: # Define the fonts for each element: each value is the argument list
  72: # that printpdf passes to set_font
  73: %FONTDEFINITION = (
  74:   section1 => [ face => "Helvetica-Bold", size => "16.0" ],
  75:   section2 => [ face => "Helvetica-Bold", size => "14.0" ],
  76:   p        => [ face => "Helvetica",      size => "12.0" ],
  77: );
  78: 
  79: # Pagesetup
  80: # See PDFLib's documentation for the size of your page; the two
  81: # values presumably have to describe the paper named in $pagename
  82: my $pagename = "a4";
  83: ($PAGEHEIGHTPOINTS, $PAGEWIDTHPOINTS) = (842, 595);
  84: 
  85: # Margins: top, bottom, left, right
  86: ($TOPMARGIN, $BOTTOMMARGIN, $LEFTMARGIN, $RIGHTMARGIN) = (10, 10, 15, 15);
  87: 
  88: foreach my $xmlfile (@ARGV)
  89: {
  90:   # Documentsetup: ".xml" is replaced by ".pdf", otherwise ".pdf" is added
  91:   (my $pdffile = $xmlfile) =~ s/\.xml$//i;
  92:   $pdffile .= ".pdf";
  93:   my $xml = XML::XPath->new(filename => $xmlfile);
  94:   my ($document) = $xml->findnodes('document');
  95:   die "$xmlfile: no 'document' element found\n" unless defined $document;
  96:   # The DTD names the attribute 'title'. Assigning to a scalar first keeps
  97:   # the named-argument list below well-formed whatever getAttribute returns.
  98:   my $title = $document->getAttribute('title');
  99:   my $pdf = PDFLib->new( filename => $pdffile, papersize => $pagename,
 100:                          creator  => "XML2PDF", title     => $title );
 101:   print "Converting $xmlfile to $pdffile \n";
 102: 
 103:   # pre-process the images: load every one up front, keyed by its 'src' value
 104:   foreach my $img ($xml->findnodes('//img'))
 105:   {
 106:     my $src      = $img->getAttribute('src');
 107:     my $filetype = $src =~ /jpe?g$/i ? 'jpeg'
 108:                  : $src =~ /png$/i   ? 'png'
 109:                  :                     'gif';   # also the fallback type
 110:     my $filename = $src;
 111:     if ($src =~ /^http:\/\//i)
 112:     {
 113:       # Remote image: store it in the current directory under its own name
 114:       ($filename) = $src =~ /.*\/(.*\..*)/
 115:         or die "Cannot derive a local filename from $src\n";
 116:       print $filename."\n";
 117:       is_success(getstore($src, $filename))
 118:         or die "Could not download $src\n";
 119:     }
 120:     $IMAGES{$src} = $pdf->load_image(filetype => $filetype,
 121:                                      filename => $filename );
 122:   }
 123: 
 124:   # Process the document
 125:   foreach my $section ($xml->findnodes('document/section1'))
 126:   {
 127:     #start every section on a new page
 128:     $pdf->start_page();
 129: 
 130:     # Calculate the starting Y-axis value
 131:     $MAXYPOS = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;
 132: 
 133:     # Print the header
 134:     printpdf($pdf, $FONTDEFINITION{section1},
 135:              "\n".$section->getAttribute('title')."\n\n", $MAXYPOS);
 136: 
 137:     processsection($pdf, $section);
 138:   }
 139: }
 140: 
 141: sub printpdf
 142: {
 143:     # Print $what in the given font, opening new pages for whatever the
 144:     # current box cannot take. $ypos is the height of the first box; when
 145:     # it is false the current 'texty' value is used instead.
 146:     my ($pdf, $font, $what, $ypos) = @_;
 147:     my $fullheight = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;
 148:     my $boxwidth   = $PAGEWIDTHPOINTS - $RIGHTMARGIN - $LEFTMARGIN;
 149:     $pdf->set_font( @$font );
 150:     my $left = length $what;   # print_boxed's result is read as chars left
 151:     while ($left)
 152:     {
 153:       my $height = $ypos || $pdf->get_value("texty");
 154:       $left = $pdf->print_boxed($what, mode => "left", x => $LEFTMARGIN,
 155:                                 y => $BOTTOMMARGIN, w => $boxwidth,
 156:                                 h => $height);
 157:       $what = substr($what, length($what) - $left);  # keep unprinted tail
 158:       $pdf->start_page() if $left;
 159:       $ypos = $fullheight;     # later boxes use the full text height
 160:     }
 161: }
 162: 
 163: sub processsection
 164: {
 165:   # Render each child element of $section: 'section2' and 'p' print their
 166:   # text and then recurse, 'img' places an image pre-loaded into %IMAGES.
 167:   my ($pdf, $section) = @_;
 168:   foreach my $node ($section->findnodes('*'))
 169:   {
 170:     my $name = $node->getName();
 171:     if ($name eq "section2")
 172:     {
 173:       printpdf($pdf, $FONTDEFINITION{$name}, "\n".$node->getAttribute('title')."\n");
 174:       processsection($pdf, $node); #recurse!
 175:     }
 176:     elsif ($name eq "p")
 177:     {
 178:       printpdf($pdf, $FONTDEFINITION{$name}, "\n".$node->string_value()."\n");
 179:       processsection($pdf, $node); #recurse! (handles 'img' inside 'p')
 180:     }
 181:     elsif ($name eq "img")
 182:     {
 183:       my $image  = $IMAGES{$node->getAttribute('src')};
 184:       my $height = $image->height();
 185:       # Like the text, the image must stay clear of the bottom margin
 186:       if ($pdf->get_value('texty') - $height < $BOTTOMMARGIN)
 187:       {
 188:         $pdf->start_page();
 189:         $pdf->set_text_pos($pdf->get_value('textx'), $PAGEHEIGHTPOINTS - $TOPMARGIN);
 190:       }
 191:       $pdf->add_image(img => $image, x => $LEFTMARGIN,
 192:                       y   => $pdf->get_value('texty') - $height);
 193:       # Move the text position down so following text starts under the image
 194:       $pdf->set_text_pos($pdf->get_value('textx'),
 195:                          $pdf->get_value('texty') - $height);
 196:     }
 197:   }
 198: }

Replies are listed 'Best First'.
Re: XML2PDF
by davorg (Chancellor) on Jul 02, 2001 at 18:05 UTC

    That looks very interesting. I'd never heard of PDFLib.pm (or, indeed, pdflib). I'll be taking a closer look at them.

    One question tho'. You declare a number of package variables with this code:

    use vars qw ( $TOPMARGIN $BOTTOMMARGIN $LEFTMARGIN $RIGHTMARGIN $PAGEHEIGHTPOINTS $PAGEWIDTHPOINTS );

    And then later you declare a number of lexical variables with the same names with this code:

    my $PAGEHEIGHTPOINTS = 842; my $PAGEWIDTHPOINTS = 595; # the size of your page my ($TOPMARGIN, $BOTTOMMARGIN, $LEFTMARGIN, $RIGHTMARGIN) = (10, 10, 15, 15);

    You then go on to use the lexical variables, but never the package variables. Is this left over from an earlier version of the code?

    --
    <http://www.dave.org.uk>

    Perl Training in the UK <http://www.iterative-software.com>

      Absolutely! And thank you, I've updated it now!

      --HolyGrail
Re: XML2PDF
by the_0ne (Pilgrim) on Jul 03, 2001 at 08:42 UTC
    I finally find somebody that is using pdflib also. We've used this module for over 2 1/2 years on a pretty large shipping company's site. Back then there was no real pdf perl module, so pdflib was what we used. It was pretty easy and very stable. In fact, so stable, that we're still using version 2.0 and they're now up to 4.x. No sense changing what isn't broken.

    Thanks for the node.

    Update

    Forgot to mention the use of XML is great. Starting to make me rethink our site design now. :)
      I've been using it as well and while it has really saved us tons in development time it doesn't have or do everything we need.

      Anyone interested in PDF document dispatch might also wish to check out HTMLDOC. It can be called in a Perl script using system. I use a homegrown templating routine to generate an RG authorization complete with tabular layout, store it temporarily as HTML from a MySQL table, and then pass the temp file name to HTMLDOC and poof! instant PDF file ready for faxing. It's even easier than PDFLib. Here's a fragment from an actual app:

      our $template; &template_fun; $template =~ s/\#([^\#])\#/${$1}/ge; # see update note my $tmstamp = localtime; $tmstamp =~s/ |:|-/_/g; my $tmpfile = "D:\\pdfs\\" . $tmstamp . ".html"; open (HTMLFILE, ">$tmpfile") || die "cannot do it: $!"; print HTMLFILE "$template"; close(HTMLFILE); ## here's where we make the actual system call chdir "D:\\HTMLDOC\\"; system("htmldoc -t pdf --webpage -f D:\\pdfs\\" . $RGA_num . ".pdf $tm +pfile"); unlink $tmpfile; sub template_fun(){ $template = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"> <HTML> <HEAD> <TITLE></TITLE> </HEAD> <BODY> <center>Returned Goods Authorization</center> <TABLE border=0 width=595> <TR> <TD width=85 valign=top> <P> <IMG SRC="D:\inetpub\wwwroot\images\D2.jpg" ALIGN=left><BR cle +ar=all> </TD> <TD width=450 valign=top><H2>#companyName#<BR> <font size=-2>#ourphone# #ourfax#</font></H2> </TD> </TR> <TR> <TD width=85>&nbsp;</TD> <TD width=450> <TABLE BORDER=0 width=350> <TR> <TD valign=top>To: #attn#<BR> Company: #cust_name#<BR> Phone: #phone#<BR> #dmethod#: #fax# </TD> <TD valign=top>From: #tech#<BR> Fax back: #our_fax#<BR> Reason for return: #code#<BR> Date: #date#<BR> Order number: #ord_num# </TD> </TR> </TABLE><P> <font size=+1>RGA number: #RGA_num#</font> <font size=2>(Write thi +s number on the outside of your package)</font> #raddress# </TD> </TR> </TABLE> </BODY> </HTML>'; }
      # Update: just read Death to Dot Star!

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others lurking in the Monastery: (4)
As of 2024-03-28 20:46 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found