#!/usr/bin/perl
use strict;
use warnings;

use File::Find qw(find);
use PDF::API2;
use Time::Progress;
use List::MoreUtils qw( natatime );
use Time::Piece;
use File::Basename;
use DBI;

BEGIN {
    # Redirect warn()/die() output to a log file.
    use CGI::Carp qw(carpout);
    open( my $log_fh, '>>', 'log.txt' )
        or die "Unable to append to log: $!";
    carpout($log_fh);
}

$| = 1;    # Enable autoflush so the progress bar updates immediately

# Where to search: doca is about 1.6 TB, docb is about 3.1 TB.
my @search_dirs = ( "/doca", "/docb" );

my $stime        = Time::Piece->new;
my $started_time = $stime->hms;

my @pdf_files;    # PDFs found on disk whose names also appear in pdfs.txt
my @filenames;    # file names loaded from pdfs.txt (read once)

# Walk the search directories and collect matching PDF paths.
my $data = list_dirs( \@search_dirs );
unless (@$data) {
    print "\n No matching PDF files found.\n\n";
    exit;
}
my $count_data = scalar @$data;

# Set up the progress bar over the number of files to process.
my $p = Time::Progress->new;
$p->attr( min => 0, max => $count_data );

print "\n Starting Counting Process: \n\n";
my $total_pages = process_data($data);
print "\n\n Total Number of pages: $total_pages\n";
warn " Total Number of pages: $total_pages\n";

my $etime = Time::Piece->new;
my $end_t = $etime->hms;

# Elapsed wall-clock time in seconds (works across midnight, unlike
# re-parsing the hh:mm:ss strings).
my $done_time      = $etime->epoch - $stime->epoch;
my $converted_time = convert_time($done_time);

print " \n\n Started at: $started_time \n";
print " \n Ended at: $end_t \n";
print "\n Processing time: $converted_time \n\n\n";
warn " Started at: $started_time | Ended at: $end_t | Processing time: $converted_time\n";
exit;

# Open every collected PDF and add up its page count.
sub process_data {
    my $dirs       = shift;
    my $c          = 0;
    my $totalpages = 0;
    foreach my $doc ( @{$dirs} ) {
        $c++;
        print $p->report( " %45b %p\r", $c );
        my ( $filename, $path ) = fileparse($doc);
        eval {
            my $pdf   = PDF::API2->open($doc);
            my $pages = $pdf->pages;    # number of pages in the document
            $totalpages += $pages;
            # Log the per-file result.
            warn " $doc | $filename: Pages: $pages\n";
        };
        warn "$doc | Error captured : $@\n" if $@;
    }
    print $p->report( "\n Done %p elapsed: %L (%l sec)", $c );
    return $totalpages;
}

# Recurse through the search directories; process() decides which
# files end up in @pdf_files.
sub list_dirs {
    my ($dirs_ref) = @_;
    my @dirs = @{$dirs_ref};
    print "\n Searching in: @dirs \n\n";

    find( { wanted => \&process, follow => 0, no_chdir => 1 }, @dirs );

    print "\n\n Found PDF docs in:\n\n";
    foreach my $found_pdf_doc (@pdf_files) {
        print " $found_pdf_doc\n";
    }
    return \@pdf_files;
}

# File::Find callback: keep a file only if it is a PDF whose name
# appears in the list loaded from pdfs.txt.
sub process {
    return unless $File::Find::name =~ m/\.pdf$/i;

    # Load the list of wanted file names once; re-reading pdfs.txt for
    # every file found would be far too slow on ~4.7 TB of data.
    unless (@filenames) {
        my $dbh = DBI->connect( "dbi:CSV:", undef, undef, {
            f_ext => ".txt/r",
            f_enc => "utf-8",
        } );
        $dbh->{csv_tables}{prod_pdf_files} = {
            file      => "pdfs.txt",    # list of file names to search for
            col_names => [qw( doc_name file_name acc_nbr )],
        };
        my $sth = $dbh->prepare("SELECT DISTINCT file_name FROM prod_pdf_files");
        $sth->execute;
        while ( my $row = $sth->fetchrow_hashref ) {
            next unless $row->{file_name} =~ m/\.pdf$/i;
            push @filenames, $row->{file_name};
        }
    }

    # Compare the found path against the wanted names in batches of 50.
    my $addfile   = 0;
    my $pdfs_iter = natatime( 50, @filenames );
    while ( my @batch = $pdfs_iter->() ) {
        for my $test_file (@batch) {
            if ( index( $File::Find::name, $test_file ) > -1 ) {
                ++$addfile;
                last;
            }
        }
        last if $addfile;
    }
    push @pdf_files, $File::Find::name if $addfile;
}    # End process sub

# Format a duration in seconds as e.g. "1d 2h 3m 4s".
sub convert_time {
    my $time = shift;
    my $days = int( $time / 86400 );
    $time -= $days * 86400;
    my $hours = int( $time / 3600 );
    $time -= $hours * 3600;
    my $minutes = int( $time / 60 );
    my $seconds = $time % 60;

    $days    = $days    < 1 ? '' : $days . 'd ';
    $hours   = $hours   < 1 ? '' : $hours . 'h ';
    $minutes = $minutes < 1 ? '' : $minutes . 'm ';
    return $days . $hours . $minutes . $seconds . 's';
}