Beefy Boxes and Bandwidth Generously Provided by pair Networks
Come for the quick hacks, stay for the epiphanies.
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/perl # ppt2doc.pl v1.00 use strict; use warnings; if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i ) { system( "perldoc", $0 ) and die "For usage, perldoc $0\n"; exit 0; } use Cwd qw( abs_path ); # Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin- +1. use Win32::OLE qw( in CP_UTF8 ); Win32::OLE->Option( CP => CP_UTF8 ); $Win32::OLE::Warn = 3; # Obtain the mso/wd/pp constants in the cleanest way. Asking for plain + # 'Microsoft Office' doesn't work for some reason: we need details. use Win32::OLE::Const; my $ole_const = { %{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") } +, %{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Libra +ry") }, %{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library") + }, }; # Set up files - must be absolute paths as OLE server doesn't use your + CWD. my ( $pptfile, $docfile ) = @ARGV; $pptfile = abs_path( $pptfile ); ( my $file_stem = $pptfile ) =~ s/\.ppt$//i; if ( defined $docfile ) { $docfile = abs_path( $docfile ); } else { $docfile = $file_stem . '.doc'; } ####################### Extract from PowerPoint ###################### +######## # Create PowerPoint OLE server. Subroutines may legitimately manipulat +e this. #OLE server should 'Quit' if the script dies unexpectedly. my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' ) or die "Can't create PowerPoint OLE: $!\n"; # Can't set this to 0, for reasons I can't/don't understand. $ppt->{Visible} = 1; my $pres = $ppt->Presentations->Open( $pptfile ) or die "Can't open PowerPoint file '$pptfile': $!\n"; my @slides; SLIDE: for my $slide ( in $ppt->ActivePresentation->Slides ) { # To select and extract images, we need to actively 'View' the sli +de in # question, which is an item from a 1-based 'array' of slides. # This also makes it clear where the processing has reached. $ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} ); next SLIDE unless slide_has_title( $slide ); # Slide type will be 'frontispiece', 'chapter', 'list' or 'normal' +. my $slide_type = get_slide_type( $slide ); my $picture_number = 1; my @slide_text; PLACEHOLDER: for my $placeholder ( in $slide->Shapes->Placeholders ) { if ( contains_table( $placeholder ) ) { push @slide_text, extract_table( $placeholder ); + } elsif ( contains_text( $placeholder ) ) { my $text_type = text_or_title( $placeholder ); PARAGRAPH: for my $paragraph ( in $placeholder->TextFrame->TextRange->Paragraphs ) { next PARAGRAPH unless is_worthwhile_text( $paragraph ) +; push @slide_text, extract_text( $paragraph, $text_type + ); } # Deal with icky special cases. if ( $slide_type eq 'frontispiece' ) { # The second placeholder of the 'frontispiece' slide c +ontains # a code number that should be prepended onto the titl +e. my ( $code_number ) = $slide->Shapes->Placeholders(2) ->TextFrame->TextRange->{Text} =~ /^(\w+)\s/i; # Ugly, but easiest way to do this: but this isn't OOP +, so # it's OK :) $slide_text[-1][1][0] = "$code_number $slide_text[-1][ +1][0]"; # Everything else on the slide should be discarded. + last PLACEHOLDER; } elsif ( $slide_type eq 'chapter' ) { # We also discard the non-title text on 'chapter' slid +es. last PLACEHOLDER; } } elsif ( contains_placeheld_picture( $placeholder ) ) { my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_numb +er); push @slide_text, extract_picture ( $placeholder, $picture_filename ); $picture_number++; } else { next PLACEHOLDER; } } FLOATING_PICTURE: for my $shape ( in $slide->Shapes ) { # Harvest floating pictures in addition to those in placeholde +rs. # This will not include autoshapes or embedded OLE objects. next FLOATING_PICTURE unless contains_floating_picture( $shape + ); my $picture_filename = make_picture_filename ( $file_stem, $slide->{SlideNumber}, $picture_number); push @slide_text, extract_picture( $shape, $picture_filename ) +; $picture_number++; } push @slides, [ $slide_type => @slide_text ]; } ########################## Insert into Word ########################## +######## # Create Word OLE server. my $word = Win32::OLE->new( 'Word.Application', 'Quit' ) or die "Can't create Word OLE: $!\n"; $word->{Visible} = 1; my $doc = $word->Documents->Add or die "Can't create new Word document: $!\n"; # All the preprocessing leg-work has been done by the extractor. Print +ing # to Word is trivial with the type_paragraph subroutine. for my $slide ( @slides ) { my ( $slide_type, @paragraphs ) = @{ $slide }; for my $paragraph ( @paragraphs ) { type_paragraph( $paragraph, $slide_type ); } } $word->ActiveDocument->SaveAs( $docfile ); exit 0; ################################ Slides ############################## +######## sub get_slide_type { my ( $slide ) = @_; if ( $slide->{SlideNumber} == 1 ) { # 'frontispiece' is for the presentation's first slide; return 'frontispiece'; } elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} ) { # 'chapter' is for the presentation's section heading slides; return 'chapter'; } elsif ( $slide->Shapes->Title->TextFrame->TextRange->{Text} =~ /^(Objectives|Summary|Test yourself|Answers)/i ) { # 'list' is for slides whose bulleted layouts should be mainta +ined; return 'list'; } else { # 'normal' is for generic text slides. return 'normal'; } } sub slide_has_title { my ( $slide ) = @_; if ( $slide->{Layout} == $ole_const->{ppLayoutBlank} || ! $slide->Shapes->Count ) { warn "Skipping slide $slide->{SlideNumber}: no content\n"; return; } elsif ( ! $slide->Shapes->{HasTitle} ) { warn "Skipping slide $slide->{SlideNumber}: no title\n"; return; } else { return 1; } } ############################# Formatting ############################# +######## sub get_style_for { my ( $slide_type, $text_type ) = @_; my %style_for = ( frontispiece => { title => 'Heading 1' + }, chapter => { title => 'Heading 2' + }, list => { title => 'Heading 2', text => 'Bullets' + }, normal => { title => 'Heading 3', text => 'Normal', picture => 'Figu +re' }, ); return exists $style_for{$slide_type}{$text_type} ? $style_for{$slide_type}{$text_type} : 'Normal' } ############################ Titles and Text ######################### +######## sub contains_text { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTextFrame}; return; } sub is_worthwhile_text { my ( $paragraph ) = @_; return 1 if $paragraph->{Text} !~ /^\s*$/; return; } sub text_or_title { my ( $placeholder ) = @_; for ( $placeholder->PlaceholderFormat->{Type} ) { return 'title' if $_ == $ole_const->{ppPlaceholderTitle}; return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle} +; return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitl +e}; return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle}; } return 'text'; } sub extract_text { my ( $paragraph, $text_type ) = @_; $text_type ||= 'text'; my @paragraph_text; RUN: for my $run ( in $paragraph->Runs ) { # A run is a section of text with the same font properties. my $run_item = _extract_run( $run ); push @paragraph_text, $run_item; } return [ $text_type, @paragraph_text ]; } sub _type_run { my ( $run ) = @_; my $selection = $word->Selection; my $run_text = shift @{ $run }; my %font_properties = @{ $run }; while ( my ( $font_property, $value ) = each %font_properties ) { $selection->Font->{ $font_property } = $value; } $selection->TypeText( $run_text ); } sub _extract_run { my ( $run ) = @_; my %font_properties; my $run_text = $run->{Text}; for my $name ( qw{ Italic Bold Subscript Superscript } ) { # Save these important font properties for each run, # so we don't have to manually reformat Arabidopsis and # CH4. Note that this assumes ppTrue == wdTrue == msoTrue, whi +ch # is probably true. $font_properties{ $name } = $run->Font->$name; } # Remove 'smart' "quotes" -- and em dashes. $run_text =~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/; # Remove trailing newlines from bulleted lists. $run_text =~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x; return [ $run_text => %font_properties ]; } ################################ Paragraph ########################### +######## sub type_paragraph { my ( $paragraph, $slide_type ) = @_; my $selection = $word->Selection; my $paragraph_type = shift @{ $paragraph }; if ( $paragraph_type eq 'table' ) { my $table = shift @{ $paragraph }; _type_table( $table, $slide_type ); } elsif ( $paragraph_type eq 'picture' ) { my ( $filename ) = shift @{ $paragraph }; _type_picture( $filename, $slide_type ); } elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' ) { _type_formatted_text ( $paragraph, get_style_for( $slide_type, $paragraph_type +) ); } else { warn "Unsupported paragraph type $paragraph_type\n"; } } sub _type_formatted_text { my ( $paragraph, $style ) = @_; my $selection = $word->Selection; $selection->{Style} = $style; for my $run ( @{ $paragraph } ) { _type_run( $run ); } $selection->TypeParagraph(); $selection->{Style} = 'Normal'; } ############################### Pictures ############################# +######## sub make_picture_filename { my ( $file_stem, $slide_number, $picture_number ) = @_; return $file_stem . '_' . $slide_number . '_' . $picture_number . +'.png'; } sub contains_floating_picture { my ( $shape ) = @_; return 1 if $shape->{Type} == $ole_const->{msoPicture}; return; } sub contains_placeheld_picture { my ( $placeholder ) = @_; # This is a hack, but if you check a placeholder's Type property # for == msoPicture, it's false, because its placeholderiness # overrides its picturiness. Why isn't there a # HasPicture property? return 1 if defined $placeholder->{PictureFormat}; return; } sub extract_picture { my ( $shape, $picture_filename ) = @_; # Select the shape containing the picture. $shape->Select; # Export the selected shape range as a PNG graphic. We'll store # the filename for later. $ppt->ActiveWindow->Selection->ShapeRange->Export ( $picture_filename, $ole_const->{ppShapeFormatPNG} ); return [ 'picture' => $picture_filename ]; } sub _type_picture { my ( $picture_filename, $slide_type ) = @_; my $selection = $word->Selection; # Insert inlined picture (a simple ->Shapes will float the picture +). $selection->InlineShapes->AddPicture ( $picture_filename, { LinkToFile => $ole_const->{msoFalse}, SaveWithDocument => $ole_const->{msoTrue}, } ); $selection->{Style} = get_style_for( $slide_type, 'picture' ); # Add a newline so we don't end up typing on the same line as the +picture. $selection->TypeParagraph(); } ################################ Tables ############################## +######## sub contains_table { my ( $placeholder ) = @_; return 1 if $placeholder->{HasTable}; return; } sub extract_table { my ( $shape ) = @_; my $table; # Extract data from table cell-by-cell, store in simple AoA. ROW: for my $i ( 1 .. $shape->Table->Rows->Count ) { CELL: for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count ) { my $cell = $shape->Table->Rows($i)->Cells($j)->Shape; my @cell_text; PARAGRAPH: for my $paragraph ( in $cell->TextFrame->TextRange->Paragr +aphs ) { push @cell_text, extract_text( $paragraph ); } # Convert to Perl 0-based arrays. $table->[ $i - 1 ][ $j - 1 ] = \@cell_text; } } return [ 'table' => $table ]; } sub _type_table { my ( $table, $slide_type ) = @_; my $selection = $word->Selection; # Collate table size and placement (range) parameters. my $number_of_rows = scalar @{ $table }; my $number_of_columns = scalar @{ $table->[0] }; my $end_of_doc = $selection->End; my $range = $word->ActiveDocument->Range ( { Start => $end_of_doc, End => $end_of_doc } ); # Insert table of calculated size at given range. my $table_obj = $selection->Tables->Add ( $range, $number_of_rows, $number_of_columns ); # Add data cell-by-cell. for my $i ( 0 .. $#{ $table } ) { for my $j ( 0 .. $#{ $table->[$i] } ) { # Use Perl 0-based arrays. $table_obj->Cell( $i + 1, $j + 1 )->Select; for my $paragraph ( @{ $table->[$i][$j] } ) { type_paragraph( $paragraph, $slide_type ); # It'd be nice to supress the newline at the end # of each cell, but I can't think of a nice way to do +this. } } } # Set selection outside the end of the table (end of document, act +ually). $selection->EndKey( $ole_const->{wdStory} ); } ###################################################################### +######## __END__ =head1 NAME ppt2doc.pl - Perl script to convert PowerPoint presentations to Word d +ocuments =head1 SYNOPSIS perl ppt2doc.pl PPTFILE [WORDFILE] =head1 ABSTRACT Convert PowerPoint presentation to Word document =head1 DESCRIPTION Converts PowerPoint presentation C<PPTFILE> to Word document C<WORDFIL +E>. If C<WORDFILE> is not specified, the data will be output to a file of the same name with the C<.ppt> extension exchanged for <.doc>. Pictures in PowerPoint placeholders will be extracted to PNG files nam +ed C<pptname_slidenumber_picturenumber.png>, where C<pptname> is the name + of the presentation, less its extension, C<slidenumber> is the number of +the slide on which the picture was found, and C<picturenumber> is the numb +er of the picture on that particular slide. The script is obviously tuned for converting my lecture slides to lect +ure notes, but the code shows how you can handle tables, paragraphs, title +s and images using OLE. =head1 SEE ALSO L<perl> L<Win32::OLE> L<Win32::OLE::Const> L<http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vba +pp10/html/pptocObjectModelApplication.asp> - Microsoft Office XP PowerPoint object model. =head1 AUTHOR Steve Cook, E<lt>steve@steve.gb.comE<gt> =head1 COPYRIGHT AND LICENSE Copyright 2005 by Steve Cook This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut

In reply to Convert PowerPoint Presentation to Word Document with Win32::OLE by polypompholyx

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others drinking their drinks and smoking their pipes about the Monastery: (4)
As of 2024-04-20 02:20 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found