# Converts a PDF document to plain text in place.
#
# Arguments:
#   $_[0]  reference to a scalar holding the raw document contents
#   $_[1]  URL or filename of the document; it is only used to decide
#          whether the document is a PDF (name ends in ".pdf", any case)
#
# If the name looks like a PDF and $PDFTOTEXT is set, the contents are
# written to "$TMP_DIR/temp.pdf", the command in $PDFTOTEXT is run on that
# file, and the referenced buffer is overwritten with the command's output.
# For any other document the buffer is left unmodified.
#
# Returns 1 after a successful conversion and undef if the conversion
# failed; on failure the buffer is emptied so that raw PDF data is never
# mistaken for text. For documents that are not converted the return
# value is false and carries no meaning -- callers should not rely on it.
sub parse_pdf {
    my ($buffer, $url) = @_;
    if ($url =~ m/\.pdf$/i && $PDFTOTEXT) {
        # Saving to a temporary file is necessary for PDFs requested via
        # http. To keep things simple, the same is done for local files.
        my $tmpfile = "$TMP_DIR/temp.pdf";
        my $tmp_fh;
        unless (open($tmp_fh, '>', $tmpfile)) {
            # Do not carry on: the converter would otherwise run on a
            # missing or stale temporary file.
            warn "Cannot write '$tmpfile': $!";
            ${$buffer} = '';
            return undef;
        }
        binmode($tmp_fh);
        my $written = print {$tmp_fh} ${$buffer};
        # Buffered write errors may only surface when the handle is closed,
        # so both results are checked.
        my $closed = close($tmp_fh);
        unless ($written && $closed) {
            warn "Cannot write '$tmpfile': $!";
            unlink $tmpfile;
            ${$buffer} = '';
            return undef;
        }

        # Only the fixed temporary filename is passed to the shell here,
        # never the original document name.
        my $text   = `$PDFTOTEXT "$tmpfile" -`;
        my $status = $?;

        # Remove the temporary file whether or not the conversion worked.
        unlink $tmpfile or warn "Cannot remove '$tmpfile': $!";

        # Empty output alone is not an error (a PDF may contain no
        # extractable text); it is one only if the command could not be
        # started or also reported a failure status.
        if (!defined $text || ($status != 0 && !length $text)) {
            warn "Cannot execute '$PDFTOTEXT $tmpfile -' (wait status $status)";
            ${$buffer} = '';
            return undef;
        }

        ${$buffer} = $text;
        return 1;
    }
}
# Looks up the numeric ID of a term, assigning the next free ID if the term
# has not been recorded yet.
#
# Arguments:
#   $_[0]  the term to look up
#
# Reads and updates %terms_db (term => ID) and the counter $TN. A warning
# is printed to STDERR if the term is undefined or empty; the term is
# recorded regardless.
#
# Returns the term's ID.
sub record_term {
    my ($term) = @_;

    # Test for definedness and length rather than truth, so that the
    # perfectly valid term "0" does not trigger the warning.
    unless (defined $term && length $term) {
        $term = '' unless defined $term;
        print STDERR "Warning: record_term($term): No term was supplied\n";
    }

    # NOTE(review): a stored ID is detected by its truth value, which
    # presumes $TN starts at zero or above -- confirm where $TN is set up.
    return $terms_db{$term} if $terms_db{$term};

    # New term: hand out the next ID.
    $terms_db{$term} = ++$TN;
    return $TN;
}
# Decides whether a file must be skipped.
#
# A file is skipped if it matches an entry of @no_index, or if it is a PDF
# file (and $PDFTOTEXT is set) whose name contains characters outside a
# conservative whitelist or contains "..".
#
# Supported ways to list a file in conf/no_index:
#   /home/www/test/index.html         (absolute path)
#   /test/index.html                  (path relative to webroot, with slash)
#   test/index.html                   (path relative to webroot, no slash)
#   http://localhost/test/index.html  (absolute URL)
#
# Arguments:
#   $_[0]  the filename or URL to check
#
# Returns a string describing why the file is ignored, or undef if the file
# is not to be ignored.
sub to_be_ignored {
    my ($file) = @_;

    # Check @no_index. Each entry is used as a regular expression and has
    # to match either the complete name or the result of
    # cut_document_root() (presumably the path relative to the webroot),
    # with or without a leading slash.
    my $file_relative = cut_document_root($file);
    foreach my $regexp (@no_index) {
        if ($file_relative =~ m/^\/?$regexp$/ || $file =~ m/^$regexp$/) {
            return "listed in no_index.txt";
        }
    }

    # For PDF files, check the filename for security reasons (it may later
    # get handed to a shell). The PDF test deliberately keeps the lenient
    # "$" anchor so that a name ending in ".pdf\n" still reaches the
    # whitelist, while the whitelist itself is anchored with \A and \z:
    # "$" also matches before a trailing newline and would let such a name
    # slip through.
    if ($file =~ m/\.pdf$/i && $PDFTOTEXT) {
        if ($file !~ m/\A[\/\\a-zA-Z0-9_.:+-]*\z/ || $file =~ m/\.\./) {
            return "Ignoring '$file': illegal characters in filename";
        }
    }
    return undef;
}
I am looking for something like this, but for .doc, .ppt and .xls files...
jcwren - 2001/12/20 22:18:00 UTC - added code tags |