1: #!/usr/bin/perl -w
2: ##############################################################################
3: #
4: # XML2PDF
5: #
6: # Author : HolyGrail
7: # Version: 0.7
8: #
9: # Usage: xml2pdf {<xmlfile>}
10: #
# This conversion program translates XML into PDF. You can specify more than
# one XML file on the command line and they will all be processed. The name of
# each PDF file is the source file name with ".xml" replaced by ".pdf", or with
# ".pdf" appended if the source file name does not end with ".xml". No XML
# validation is done at all. The XML file should conform to the following DTD:
16: #
17: # <!ELEMENT document (section1)+>
18: # <!ATTLIST document
19: # title CDATA #REQUIRED
20: # >
21: #
22: # <!ELEMENT section1 (section2 | p | img)+>
23: # <!ATTLIST section1
24: # title CDATA #REQUIRED
25: # >
26: #
27: # <!ELEMENT section2 (p | img)+>
28: # <!ATTLIST section2
29: # title CDATA #REQUIRED
30: # >
31: #
# <!ELEMENT p (#PCDATA | img)*>
33: #
# <!ELEMENT img EMPTY>
35: # <!ATTLIST img
36: # src CDATA #REQUIRED
37: # >
38: #
39: # Known issues:
40: # * 'img'-elements are processed after a complete 'p' element is processed,
41: # so the images that are meant to come 'in' the text, come 'after' the
42: # text. The workaround for this should be to split the text up in multiple
43: # p-elements and put the img-element between two p-elements.
44: # * Images that are wider than the page fall off the page on the right side
45: #
46: # Bugs:
47: # * Lots of them, no doubt
48: #
49: # Changes:
50: #
# 0.7: The 'src' of an 'img' tag can now also be a URL, not just a local file
52: # 0.6: It now also accepts images (JPG, GIF and PNG), changed the DTD
53: # accordingly.
54: # 0.5: Changed the program call, so you can process multiple files in one call
55: # 0.4: It now can handle sections that don't fit on one page. It starts new
56: # pages if needed.
57: # 0.3: Adapted it to conform to OeufMayo's DTD
# 0.2: Minor change (0.1 re-declared the global vars as 'my', thanks to davorg)
59: # 0.1: First release
60: #
61: ##############################################################################
62: use strict;
63: use PDFLib;
64: use XML::XPath;
65: use LWP::Simple;
66: use vars qw ( $TOPMARGIN
67: $BOTTOMMARGIN
68: $LEFTMARGIN
69: $RIGHTMARGIN
70: $PAGEHEIGHTPOINTS
71: $PAGEWIDTHPOINTS
72: $MAXYPOS
73: %FONTDEFINITION
74: %IMAGES
75: );
76: # Define the fonts for each element
77: %FONTDEFINITION = ( section1 => [face => "Helvetica-Bold", size => "16.0"],
78: section2 => [face => "Helvetica-Bold", size => "14.0"],
79: p => [face => "Helvetica", size => "12.0"]
80: );
81:
82: # Pagesetup
83: my $pagename = "a4";
84: $PAGEHEIGHTPOINTS = 842; # See PDFLib's documentation for
85: $PAGEWIDTHPOINTS = 595; # the size of your page
86: ($TOPMARGIN, $BOTTOMMARGIN, $LEFTMARGIN, $RIGHTMARGIN) = (10, 10, 15, 15);
87:
# Main program: convert every XML file named on the command line into a PDF
# file. A named loop variable is used so that @ARGV itself is left untouched.
foreach my $xmlfile (@ARGV)
{
    # Documentsetup
    my $xml     = XML::XPath->new(filename => $xmlfile);
    my $pdffile = _pdf_filename($xmlfile);

    # Without a 'document' element there is no title and no section to
    # print, so skip the file instead of calling a method on undef.
    my ($document) = $xml->findnodes('document');
    unless ($document)
    {
        warn "Skipping $xmlfile: no 'document' element found\n";
        next;
    }

    # The DTD calls the attribute 'title'. 'name' is what this program used
    # to read, so it is still accepted as a fallback.
    my $title = $document->getAttribute('title');
    $title = $document->getAttribute('name') unless defined $title;

    my $pdf = PDFLib->new( filename => $pdffile,
                           papersize=> $pagename,
                           creator  => "XML2PDF",
                           title    => $title
                         );
    print "Converting $xmlfile to $pdffile \n";

    # pre-process the images: load each distinct image once and remember it
    # in %IMAGES under its 'src' value, which is how processsection() looks
    # it up again.
    my %seen;
    foreach my $img ($xml->findnodes('//img'))
    {
        my $src = $img->getAttribute('src');
        unless (defined $src && length $src)
        {
            warn "Ignoring an 'img' element without 'src' in $xmlfile\n";
            next;
        }
        next if $seen{$src}++;      # already loaded for this document

        my $filename = $src;
        if ($src =~ m{^https?://}i)
        {
            # Remote image: fetch it into the current directory first.
            my $local = _local_image_name($src);
            unless (defined $local)
            {
                warn "Cannot derive a file name from '$src'; image skipped\n";
                next;
            }
            print $local."\n";

            # Report a failed download, but still hand the name to
            # load_image() below, as this program always did.
            my $status = getstore($src, $local);
            warn "Download of '$src' failed (status $status)\n"
                unless is_success($status);
            $filename = $local;
        }

        $IMAGES{$src} =
            $pdf->load_image(filetype => _image_filetype($filename),
                             filename => $filename );
    }

    # Process the document
    foreach my $section ($xml->findnodes('document/section1'))
    {
        #start every section on a new page
        $pdf->start_page();

        # Calculate the starting Y-axis value
        $MAXYPOS = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;

        # Print the header
        printpdf($pdf, $FONTDEFINITION{section1},
                 "\n".$section->getAttribute('title')."\n\n", $MAXYPOS);

        processsection($pdf, $section);
    }
}

# Return the name of the PDF file for the given source file: a trailing
# ".xml" (any case) is replaced by ".pdf", otherwise ".pdf" is appended, so
# the result never equals the source name.
sub _pdf_filename
{
    my ($xmlfile) = @_;
    my $pdffile = $xmlfile;
    $pdffile .= ".pdf" unless $pdffile =~ s/\.xml$/.pdf/i;
    return $pdffile;
}

# Return the image type ('jpeg', 'gif' or 'png') for a file name, judged by
# how the name ends. Anything unrecognised is reported as 'gif', which has
# always been the fallback type.
sub _image_filetype
{
    my ($filename) = @_;
    return 'jpeg' if $filename =~ /jpe?g$/i;
    return 'png'  if $filename =~ /png$/i;
    return 'gif';
}

# Return the local file name under which the image at the given http(s) URL
# is stored: the last path segment, without query string or fragment.
# Returns nothing when the URL has no such segment.
sub _local_image_name
{
    my ($url) = @_;
    return unless defined $url;
    return $1
        if $url =~ m{^https?://[^/?#]+/(?:[^?#]*/)?([^/?#]+)(?:[?#].*)?$}i;
    return;
}
140:
# printpdf($pdf, $font, $what, $ypos)
#
# Print the text $what in the area between the left and right margins,
# starting a new page whenever the text does not fit on the current one.
#
#   $pdf  - object that receives the set_font, print_boxed, get_value and
#           start_page calls
#   $font - array reference whose elements are passed to set_font()
#   $what - the text to print
#   $ypos - optional height of the text box for the first attempt; when it is
#           false the current "texty" value of $pdf is used instead
#
# If nothing at all can be placed even on a freshly started page, the rest of
# the text is dropped with a warning instead of adding pages forever.
sub printpdf
{
    my ($pdf, $font, $what, $ypos) = @_;

    $pdf->set_font( @{$font} );

    my $boxwidth   = $PAGEWIDTHPOINTS - $RIGHTMARGIN - $LEFTMARGIN;
    my $fullheight = $PAGEHEIGHTPOINTS - $TOPMARGIN - $BOTTOMMARGIN;
    my $freshpage  = 0;    # true once this call has started a page itself

    while (length($what))
    {
        my $pending = length($what);

        # print_boxed reports how many characters it could not place.
        # NOTE(review): the box starts at $BOTTOMMARGIN, so a height of
        # "texty" puts its top above the current text position - confirm
        # against PDFLib's print_boxed that this is intended.
        my $charsnotprinted = $pdf->print_boxed($what,
                                    mode => "left",
                                    x    => $LEFTMARGIN,
                                    y    => $BOTTOMMARGIN,
                                    w    => $boxwidth,
                                    h    => $ypos || $pdf->get_value("texty")
                                  );
        last unless $charsnotprinted;

        if ($charsnotprinted >= $pending)
        {
            # No progress. That is normal when the current page is simply
            # full, but on a page started just for this text it means the
            # text can never be placed.
            if ($freshpage)
            {
                warn "printpdf: text does not fit on an empty page, ".
                     "$pending characters dropped\n";
                last;
            }
            $charsnotprinted = $pending;
        }

        # Keep only the unprinted tail and continue on a new, empty page
        $what = substr($what, $pending - $charsnotprinted);
        $pdf->start_page();
        $freshpage = 1;
        $ypos      = $fullheight;
    }
    return;
}
162:
# processsection($pdf, $section)
#
# Print all child elements of $section, in document order:
#   section2 - its title is printed, then its own children are processed
#   p        - its text is printed, then its children (the 'img' elements
#              inside it) are processed, so those images follow the text
#   img      - the image stored in %IMAGES under its 'src' is placed at the
#              left margin, directly below the current text position
#
#   $pdf     - object that receives the drawing calls
#   $section - node whose children are found with findnodes('*')
#
# An image that would reach below the bottom margin is moved to a new page.
# An 'img' with no entry in %IMAGES is skipped with a warning.
sub processsection
{
    my ($pdf, $section) = @_;

    foreach my $node ($section->findnodes('*'))
    {
        my $name = $node->getName();

        if ($name eq "section2")
        {
            printpdf($pdf, $FONTDEFINITION{$name},
                     "\n".$node->getAttribute('title')."\n");
            processsection($pdf, $node); #recurse!
        }
        elsif ($name eq "p")
        {
            printpdf($pdf, $FONTDEFINITION{$name},
                     "\n".$node->string_value()."\n");
            processsection($pdf, $node); #recurse!
        }
        elsif ($name eq "img")
        {
            my $src   = $node->getAttribute('src');
            my $image = defined $src ? $IMAGES{$src} : undef;
            unless ($image)
            {
                warn "No image loaded for '".(defined $src ? $src : '').
                     "'; img element skipped\n";
                next;
            }
            my $height = $image->height();

            # Text is kept above $BOTTOMMARGIN, so images are as well
            if ($pdf->get_value('texty') - $height < $BOTTOMMARGIN)
            {
                $pdf->start_page();
                $pdf->set_text_pos($pdf->get_value('textx'),
                                   $PAGEHEIGHTPOINTS - $TOPMARGIN);
            }

            # The image hangs below the text position; afterwards the text
            # position is moved to the lower edge of the image.
            my $ypos = $pdf->get_value('texty') - $height;
            $pdf->add_image(img => $image,
                            x   => $LEFTMARGIN,
                            y   => $ypos);
            $pdf->set_text_pos($pdf->get_value('textx'), $ypos);
        }
    }
    return;
}