use Archive::Zip qw( :ERROR_CODES ); use Win32::OLE; my $mainfilepath="test.pptx"; my $zip = Archive::Zip->new(); # read the Word document, that is the ZIP file $zip->read( $mainfilepath ) == AZ_OK or die "Unable to open Office file\n"; my $wfh = $zip->extractMember( 'ppt/slides/slide1.xml' ); # Routine for reading file into $content variable. # Source is Perl Monks: http://www.perlmonks.org/?node_id=1952 # { local $/=undef; open (FILE,"<:utf8","ppt/slides/slide1.xml") || (message_error01()) ; binmode FILE; $content = ; close FILE; } my $nl = "\n"; # Alternative is "\r\n". my $lindent = " "; # Indent nested lists by "\t", " " etc. my $lwidth = 80; # Line width, used for short line justification. # ToDo: Better list handling. Currently assumed 8 level nesting. my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--'); # # Text extraction starts. # $content =~ s/(\r)?\n//; $content =~ s{]+?/>|}|$nl|og; $content =~ s||$nl|og; $content =~ s||\t|og; my $hr = '-' x 78 . $nl; $content =~ s|.*?|$hr|og; $content =~ s||$lindent x $1 . "$levchar[$1] "|oge; # # Uncomment either of below two lines and comment above line, if dealing # with more than 8 level nested lists. # # $content =~ s||$lindent x $1 . '* '|oge; # $content =~ s||'*' x ($1+1) . ' '|oge; $content =~ s{.*?(|]+>)(.*?)}/uc $2/oge; $content =~ s{(.*?)}/cjustify($1)/oge; $content =~ s{(.*?)}/rjustify($1)/oge; $content =~ s{(.*?)}/hyperlink($1,$2)/oge; #$content =~ s/<.*?>//g; $content =~ s/<.*?>/ /g; #Substitute tags with white spaces, in order to always have white spaces between words # # Convert non-ASCII characters/character sequences to ASCII characters. # # $content =~ s/\xE2\x82\xAC/\xC8/og; # euro symbol as saved by MSOffice $content =~ s/\xE2\x82\xAC/E/og; # euro symbol expressed as E $content =~ s/\xE2\x80\xA6/.../og; $content =~ s/\xE2\x80\xA2/::/og; # four dot diamond symbol $content =~ s/\xE2\x80\x9C/"/og; $content =~ s/\xE2\x80\x99/'/og; $content =~ s/\xE2\x80\x98/'/og; $content =~ s/\xE2\x80\x93/-/og; $content =~ s/\xC2\xA0//og; $content =~ s/&/&/ogi; $content =~ s/<//ogi; $content =~ s/\s+/ /g;#delete extra white spaces my @row = split(/\n/, $content);