comment on

#!/usr/bin/perl 

# ppt2doc.pl v1.00

use strict;
use warnings;

if ( ! $ARGV[0] || $ARGV[0] =~ /^-{1,2}(h|help|\?)$/i )
{
    system( "perldoc", $0 ) and die "For usage, perldoc $0\n";
    exit 0;
}

use Cwd qw( abs_path );

# Use UTF-8 strings, not dumb ANSI transcoding that mangles non-Latin-
+1.
use Win32::OLE qw( in CP_UTF8 );
Win32::OLE->Option( CP => CP_UTF8 );
$Win32::OLE::Warn = 3;

# Obtain the mso/wd/pp constants in the cleanest way. Asking for plain
+ 
# 'Microsoft Office' doesn't work for some reason: we need details.
use Win32::OLE::Const;
my $ole_const =
{ 
    %{ Win32::OLE::Const->Load("Microsoft Word 11.0 Object Library") }
+, 
    %{ Win32::OLE::Const->Load("Microsoft PowerPoint 11.0 Object Libra
+ry") }, 
    %{ Win32::OLE::Const->Load("Microsoft Office 11.0 Object Library")
+ }, 
};

# Set up files - must be absolute paths as OLE server doesn't use your
+ CWD.
my ( $pptfile, $docfile ) = @ARGV;
$pptfile = abs_path( $pptfile );
( my $file_stem = $pptfile ) =~ s/\.ppt$//i;
if ( defined $docfile )
{
    $docfile = abs_path( $docfile );
}
else
{
    $docfile = $file_stem . '.doc';
}

####################### Extract from PowerPoint ######################
+########

# Create PowerPoint OLE server. Subroutines may legitimately manipulat
+e this.
#OLE server should 'Quit' if the script dies unexpectedly.
my $ppt = Win32::OLE->new( 'PowerPoint.Application', 'Quit' )
    or die "Can't create PowerPoint OLE: $!\n";

# Can't set this to 0, for reasons I can't/don't understand.
$ppt->{Visible} = 1;

my $pres = $ppt->Presentations->Open( $pptfile ) 
    or die "Can't open PowerPoint file '$pptfile': $!\n";        

my @slides;

SLIDE:
for my $slide ( in $ppt->ActivePresentation->Slides )
{    
    # To select and extract images, we need to actively 'View' the sli
+de in 
    # question, which is an item from a 1-based 'array' of slides. 
    # This also makes it clear where the processing has reached.
    $ppt->ActiveWindow->View->GotoSlide( $slide->{SlideNumber} );
    
    next SLIDE unless slide_has_title( $slide );
    
    # Slide type will be 'frontispiece', 'chapter', 'list' or 'normal'
+.
    my $slide_type = get_slide_type( $slide );
    
    my $picture_number = 1;
    my @slide_text;
    
    PLACEHOLDER:
    for my $placeholder ( in $slide->Shapes->Placeholders ) 
    {            
        if ( contains_table( $placeholder ) )
        {
            push @slide_text, extract_table( $placeholder );          
+  
        }
        elsif ( contains_text( $placeholder ) )
        {
            my $text_type = text_or_title( $placeholder );
            PARAGRAPH:
            for my $paragraph 
            ( in $placeholder->TextFrame->TextRange->Paragraphs )
            {    
                next PARAGRAPH unless is_worthwhile_text( $paragraph )
+;
                push @slide_text, extract_text( $paragraph, $text_type
+ );
            }        

            # Deal with icky special cases.
            if ( $slide_type eq 'frontispiece' )
            {
                # The second placeholder of the 'frontispiece' slide c
+ontains 
                # a code number that should be prepended onto the titl
+e.
                my ( $code_number ) = 
                    $slide->Shapes->Placeholders(2)
                        ->TextFrame->TextRange->{Text}
                    =~ /^(\w+)\s/i;
                # Ugly, but easiest way to do this: but this isn't OOP
+, so
                # it's OK :)
                $slide_text[-1][1][0] = "$code_number $slide_text[-1][
+1][0]";

                # Everything else on the slide should be discarded.   
+             
                last PLACEHOLDER;
            }
            elsif ( $slide_type eq 'chapter' )
            {
                # We also discard the non-title text on 'chapter' slid
+es.
                last PLACEHOLDER;
            }
        }
        elsif ( contains_placeheld_picture( $placeholder ) )
        {
            my $picture_filename = 
                make_picture_filename
                    ( $file_stem, $slide->{SlideNumber}, $picture_numb
+er);        
            push @slide_text, extract_picture
                ( $placeholder, $picture_filename );
            $picture_number++;
        }
        else
        {
            next PLACEHOLDER;
        }
    }
    
    FLOATING_PICTURE:
    for my $shape ( in $slide->Shapes ) 
    {
        # Harvest floating pictures in addition to those in placeholde
+rs.
        # This will not include autoshapes or embedded OLE objects.
        next FLOATING_PICTURE unless contains_floating_picture( $shape
+ );
        my $picture_filename = 
            make_picture_filename
                ( $file_stem, $slide->{SlideNumber}, $picture_number);
        push @slide_text, extract_picture( $shape, $picture_filename )
+;    
        $picture_number++;
    }
    
    push @slides, [ $slide_type => @slide_text ];
}

########################## Insert into Word ##########################
+########

# Create Word OLE server.
my $word = Win32::OLE->new( 'Word.Application', 'Quit' )
    or die "Can't create Word OLE: $!\n";
$word->{Visible} = 1;
my $doc = $word->Documents->Add
    or die "Can't create new Word document: $!\n";

# All the preprocessing leg-work has been done by the extractor. Print
+ing
# to Word is trivial with the type_paragraph subroutine.
for my $slide ( @slides )
{
    my ( $slide_type, @paragraphs ) = @{ $slide };
    for my $paragraph ( @paragraphs )
    {
        type_paragraph( $paragraph, $slide_type );
    }
}

$word->ActiveDocument->SaveAs( $docfile );

exit 0;

################################ Slides ##############################
+########

sub get_slide_type
{
    my ( $slide ) = @_;
    if ( $slide->{SlideNumber} == 1 )
    {
        # 'frontispiece' is for the presentation's first slide;
        return 'frontispiece';
    }
    elsif ( $slide->{Layout} == $ole_const->{ppLayoutTitle} )
    {
        # 'chapter' is for the presentation's section heading slides;
        return 'chapter';
    }
    elsif
    (
        $slide->Shapes->Title->TextFrame->TextRange->{Text}
            =~ /^(Objectives|Summary|Test yourself|Answers)/i
    )
    {
        # 'list' is for slides whose bulleted layouts should be mainta
+ined;
        return 'list';
    }
    else
    {
        # 'normal' is for generic text slides.
        return 'normal';
    }
}

sub slide_has_title
{
    my ( $slide ) = @_;
    if
    (
        $slide->{Layout} == $ole_const->{ppLayoutBlank} 
            || ! $slide->Shapes->Count
    )
    {
        warn "Skipping slide $slide->{SlideNumber}: no content\n";
        return;
    }
    elsif ( ! $slide->Shapes->{HasTitle} )
    {
        warn "Skipping slide $slide->{SlideNumber}: no title\n";
        return;
    }
    else
    {
        return 1;
    }
}

############################# Formatting #############################
+########

sub get_style_for
{
    my ( $slide_type, $text_type ) = @_;
    my %style_for =
    (
        frontispiece => 
            { title => 'Heading 1'                                    
+    },
        chapter      => 
            { title => 'Heading 2'                                    
+    },
        list         => 
            { title => 'Heading 2', text => 'Bullets'                 
+    },
        normal       => 
            { title => 'Heading 3', text => 'Normal', picture => 'Figu
+re' },    
    );
    return exists $style_for{$slide_type}{$text_type} 
        ? $style_for{$slide_type}{$text_type} 
        : 'Normal'
}

############################ Titles and Text #########################
+########

sub contains_text
{
    my ( $placeholder ) = @_;
    return 1 if $placeholder->{HasTextFrame};
    return;
}

sub is_worthwhile_text
{
    my ( $paragraph ) = @_;
    return 1 if $paragraph->{Text} !~ /^\s*$/;
    return;        
}

sub text_or_title
{
    my ( $placeholder ) = @_;
    for ( $placeholder->PlaceholderFormat->{Type} )
    {
        return 'title' if $_ == $ole_const->{ppPlaceholderTitle};
        return 'title' if $_ == $ole_const->{ppPlaceholderCenterTitle}
+;
        return 'title' if $_ == $ole_const->{ppPlaceholderVerticalTitl
+e};
        return 'title' if $_ == $ole_const->{ppPlaceholderSubtitle};
    }
    return 'text';
}

sub extract_text
{
    my ( $paragraph, $text_type ) = @_;
    $text_type ||= 'text';
    my @paragraph_text;                    
    RUN:
    for my $run ( in $paragraph->Runs )
    {    
        # A run is a section of text with the same font properties.
        my $run_item = _extract_run( $run );
        push @paragraph_text, $run_item;
    }
    return [ $text_type, @paragraph_text ];
}

sub _type_run
{    
    my ( $run ) = @_;
    my $selection = $word->Selection;
    my $run_text  = shift @{ $run };
    my %font_properties = @{ $run }; 
    while ( my ( $font_property, $value ) = each %font_properties )
    {
        $selection->Font->{ $font_property } = $value; 
    }
    $selection->TypeText( $run_text );
}

sub _extract_run
{
    my ( $run ) = @_;
    my %font_properties;
    my $run_text = $run->{Text};
    for my $name ( qw{ Italic Bold Subscript Superscript } )
    {
        # Save these important font properties for each run,
        # so we don't have to manually reformat Arabidopsis and 
        # CH4. Note that this assumes ppTrue == wdTrue == msoTrue, whi
+ch
        # is probably true.
        $font_properties{ $name } = $run->Font->$name;
    }
    
    # Remove 'smart' "quotes" -- and em dashes.
    $run_text 
        =~ tr/\x91\x92\x93\x94\x96\x97/\'\'\"\"--/; 
    
    # Remove trailing newlines from bulleted lists.
    $run_text 
        =~ s/( \x{0a}\x{0d} | \x{0a} | \x{0d} )+ $//x;    
    
    return [ $run_text => %font_properties ];
}

################################ Paragraph ###########################
+########

sub type_paragraph
{
    my ( $paragraph, $slide_type ) = @_;
    my $selection = $word->Selection;
    my $paragraph_type = shift @{ $paragraph };

    if ( $paragraph_type eq 'table' )
    {
        my $table = shift @{ $paragraph };
        _type_table( $table, $slide_type );
    }
    elsif ( $paragraph_type eq 'picture' )
    {
        my ( $filename ) = shift @{ $paragraph };
        _type_picture( $filename, $slide_type );
    }
    elsif ( $paragraph_type eq 'text' || $paragraph_type eq 'title' )
    {
        _type_formatted_text
            ( $paragraph, get_style_for( $slide_type, $paragraph_type 
+) );
    }
    else
    {
        warn "Unsupported paragraph type $paragraph_type\n";
    }
}

sub _type_formatted_text
{
    my ( $paragraph, $style ) = @_;
    my $selection = $word->Selection;
    $selection->{Style} = $style;
    for my $run ( @{ $paragraph } )
    {
        _type_run( $run );
    }
    $selection->TypeParagraph();
    $selection->{Style} = 'Normal';    
}

############################### Pictures #############################
+########

sub make_picture_filename
{
    my ( $file_stem, $slide_number, $picture_number ) = @_;
    return $file_stem . '_' . $slide_number . '_' . $picture_number . 
+'.png';
}

sub contains_floating_picture
{
    my ( $shape ) = @_;
    return 1 if $shape->{Type} == $ole_const->{msoPicture};
    return;
}

sub contains_placeheld_picture
{
    my ( $placeholder ) = @_;
    # This is a hack, but if you check a placeholder's Type property
    # for == msoPicture, it's false, because its placeholderiness
    # overrides its picturiness. Why isn't there a 
    # HasPicture property?
    return 1 if defined $placeholder->{PictureFormat};
    return;    
}

sub extract_picture
{
    my ( $shape, $picture_filename ) = @_;
    
    # Select the shape containing the picture.
    $shape->Select;            
    
    # Export the selected shape range as a PNG graphic. We'll store
    # the filename for later.
    $ppt->ActiveWindow->Selection->ShapeRange->Export
        ( $picture_filename, $ole_const->{ppShapeFormatPNG} );
    
    return [ 'picture' => $picture_filename ];
}

sub _type_picture
{
    my ( $picture_filename, $slide_type ) = @_;
    my $selection = $word->Selection;
    
    # Insert inlined picture (a simple ->Shapes will float the picture
+).
    $selection->InlineShapes->AddPicture
    ( $picture_filename, 
        { 
            LinkToFile       => $ole_const->{msoFalse}, 
            SaveWithDocument => $ole_const->{msoTrue},
        } 
    );
    $selection->{Style} = get_style_for( $slide_type, 'picture' );
    # Add a newline so we don't end up typing on the same line as the 
+picture.
    $selection->TypeParagraph();
}

################################ Tables ##############################
+########

sub contains_table
{
    my ( $placeholder ) = @_;
    return 1 if $placeholder->{HasTable};
    return;
}

sub extract_table
{
    my ( $shape ) = @_;
    my $table;
    
    # Extract data from table cell-by-cell, store in simple AoA.
    ROW:
    for my $i ( 1 .. $shape->Table->Rows->Count )
    {
        CELL:
        for my $j ( 1 .. $shape->Table->Rows($i)->Cells->Count )
        {
            my $cell = $shape->Table->Rows($i)->Cells($j)->Shape;
            
            my @cell_text;
            
            PARAGRAPH:
            for my $paragraph ( in $cell->TextFrame->TextRange->Paragr
+aphs )
            {    
                push @cell_text, extract_text( $paragraph );
            }
            
            # Convert to Perl 0-based arrays.
            $table->[ $i - 1 ][ $j - 1 ] = \@cell_text;    
        }
    }
    return [ 'table' => $table ];
}

sub _type_table
{
    my ( $table, $slide_type ) = @_;
    
    my $selection = $word->Selection;
    
    # Collate table size and placement (range) parameters.
    my $number_of_rows    = scalar @{ $table };
    my $number_of_columns = scalar @{ $table->[0] };
    my $end_of_doc        = $selection->End;
    my $range = $word->ActiveDocument->Range
        ( { Start => $end_of_doc, End => $end_of_doc } );
    
    # Insert table of calculated size at given range.
    my $table_obj = $selection->Tables->Add
        ( $range, $number_of_rows, $number_of_columns );
        
    # Add data cell-by-cell.
    for my $i ( 0 .. $#{ $table } )
    {
        for my $j ( 0 .. $#{ $table->[$i] } )
        {
            # Use Perl 0-based arrays.
            $table_obj->Cell( $i + 1, $j + 1 )->Select;
            for my $paragraph ( @{ $table->[$i][$j] } )
            {
                type_paragraph( $paragraph, $slide_type );
                # It'd be nice to supress the newline at the end
                # of each cell, but I can't think of a nice way to do 
+this.
            }
        }
    }
    # Set selection outside the end of the table (end of document, act
+ually).
    $selection->EndKey( $ole_const->{wdStory} );
}

######################################################################
+########

__END__

=head1 NAME

ppt2doc.pl - Perl script to convert PowerPoint presentations to Word d
+ocuments

=head1 SYNOPSIS

    perl ppt2doc.pl PPTFILE [WORDFILE]

=head1 ABSTRACT

Convert PowerPoint presentation to Word document

=head1 DESCRIPTION

Converts PowerPoint presentation C<PPTFILE> to Word document C<WORDFIL
+E>.
If C<WORDFILE> is not specified, the data will be output to a file
of the same name with the C<.ppt> extension exchanged for <.doc>.

Pictures in PowerPoint placeholders will be extracted to PNG files nam
+ed
C<pptname_slidenumber_picturenumber.png>, where C<pptname> is the name
+ of
the presentation, less its extension, C<slidenumber> is the number of 
+the 
slide on which the picture was found, and C<picturenumber> is the numb
+er of 
the picture on that particular slide.
 
The script is obviously tuned for converting my lecture slides to lect
+ure 
notes, but the code shows how you can handle tables, paragraphs, title
+s and 
images using OLE.

=head1 SEE ALSO

L<perl>

L<Win32::OLE>

L<Win32::OLE::Const>

L<http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vba
+pp10/html/pptocObjectModelApplication.asp> -
Microsoft Office XP PowerPoint object model.

=head1 AUTHOR

Steve Cook, E<lt>steve@steve.gb.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2005 by Steve Cook

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. 

=cut
[download]
In reply to Convert PowerPoint Presentation to Word Document with Win32::OLE by polypompholyx
Are you posting in the right place? Check out Where do I post X? to know for sure.
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
Want more info? How to link or How to display code and escape characters are good places to start.

Come for the quick hacks, stay for the epiphanies.
	PerlMonks