in reply to Re: going through a Win32 MSWORD doc
in thread going through a Win32 MSWORD doc

This is the part that indexes a PDF file:

# Converts a PDF document to plain text in place.
#
# Arguments:
#   $_[0]  reference to a scalar holding the raw document contents
#   $_[1]  URL or filename of the document; only used to detect PDFs
#
# If the name ends in ".pdf" (case-insensitive) and $PDFTOTEXT is set, the
# contents are written to a temporary file below $TMP_DIR, $PDFTOTEXT is run
# on that file and the referenced buffer is overwritten with its output.
# Otherwise the buffer is left unmodified.
#
# Returns 1 after a successful conversion, undef if the conversion failed
# (a warning is issued), and a false value if no conversion was attempted.
sub parse_pdf {
    my ($buffer, $url) = @_;

    # Not a PDF, or no converter configured: nothing to do.
    my $convert = ($url =~ m/\.pdf$/i && $PDFTOTEXT);
    return $convert unless $convert;

    my $tmpfile = "$TMP_DIR/temp.pdf";

    # Saving to a temporary file is necessary for http requested PDFs. To
    # keep things simpler, we also do it for local files from disk.
    # Give up if the file cannot be written: otherwise a stale temp.pdf left
    # by an earlier document would be converted in place of this one.
    my $tmp_fh;
    unless (open($tmp_fh, '>', $tmpfile)) {
        warn "Cannot write '$tmpfile': $!";
        return undef;
    }
    binmode($tmp_fh);
    my $write_ok = print {$tmp_fh} ${$buffer};
    $write_ok = 0 unless close($tmp_fh);    # buffered errors surface here
    unless ($write_ok) {
        warn "Cannot write '$tmpfile': $!";
        unlink $tmpfile;
        return undef;
    }

    # filename security check is done in to_be_ignored():
    my $text       = `$PDFTOTEXT "$tmpfile" -`;
    my $exec_error = $!;    # save it before unlink can overwrite $!

    # Remove the temporary file whether or not the conversion worked.
    unlink $tmpfile or warn "Cannot remove '$tmpfile': $!";

    ${$buffer} = $text;
    unless ($text) {
        warn "Cannot execute '$PDFTOTEXT $tmpfile -': $exec_error";
        # Explicit undef (not a bare return) keeps the historical interface.
        return undef;
    }
    return 1;
}

# Save a term's ID to the database, if it does not yet exist.
#
# Arguments:
#   $_[0]  the term to look up or register
#
# Looks the term up in %terms_db; an unknown term gets the next value of the
# counter $TN. Returns the ID (existing or newly assigned).
sub record_term {
    my $term = $_[0];

    # "0" is a legitimate term, so test for presence rather than truth.
    print STDERR "Warning: record_term($term): No term was supplied\n"
        unless defined $term && length $term;

    return $terms_db{$term} if $terms_db{$term};

    ++$TN;
    $terms_db{$term} = $TN;
    return $TN;
}

# Is the file listed in @no_index or is it a PDF file with illegal characters
# in the filename?
# Supported ways to list a file in conf/no_index:
#   /home/www/test/index.html         (absolute path)
#   /test/index.html                  (path relative to webroot, but with slash)
#   test/index.html                   (path relative to webroot, no slash)
#   http://localhost/test/index.html  (absolute URL)
#
# Arguments:
#   $_[0]  the file (path or URL) to check
#
# Returns a string giving the reason if the file must be skipped, undef
# otherwise. Each @no_index entry is used as a regular expression that must
# match either the whole name or the whole name with the document root
# removed by cut_document_root().
sub to_be_ignored {
    my $file = shift;

    # Check @no_index:
    my $file_relative = cut_document_root($file);
    foreach my $regexp (@no_index) {
        if ($file_relative =~ m/^\/?$regexp$/ || $file =~ m/^$regexp$/) {
            return "listed in no_index.txt";
        }
    }

    # For PDF files check filename for security reasons (it later gets
    # handed to a shell!). The whitelist is anchored with \A and \z because
    # "$" would also match before a trailing newline and let one through.
    if ($file =~ m/\.pdf$/i && $PDFTOTEXT) {
        if ($file !~ m/\A[\/\\a-zA-Z0-9_.:+-]*\z/ || $file =~ m/\.\./) {
            return "Ignoring '$file': illegal characters in filename";
        }
    }

    # Explicit undef (not a bare return) keeps the historical interface.
    return undef;
}

I am looking for something like this, but for .doc, .ppt and .xls files...

jcwren - 2001/12/20 22:18:00 UTC - added code tags