Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:

Hi Monks. I'm working on a script that uses Mojo::UserAgent to grab some webpages and parse them for keywords. I'm using proxies to do the search. The script reads two text files to get the search terms and another file to get the proxy addresses. The problem, however, is that I have an error in my code: rather than pulling one proxy address, I pull a much larger number. The script still runs and I get the results I was expecting, but the misuse of the array and pulling 10-20 proxy values instead of 1 is making my computer grind to a halt because I'm using too much memory. I will continue to work on this myself, but I can't seem to nail down why so many proxy values are being pulled. Any help or guidance is greatly appreciated. Here's my code:

#!/usr/bin/perl
$ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0;
use strict;
use warnings;
use Mozilla::CA;
use Tcl::Tk;
use strict;
use Cwd 'chdir';
use File::Basename;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;
use HTML::Tidy;
use LWP::Simple;
use LWP::UserAgent;
use LWP::Protocol::https;
use feature 'say';
use File::Slurp 'slurp';    # makes it easy to read files.
use Mojo;
use Mojo::UserAgent;
use URI;
use Mojo::Collection;
use autodie;
use File::Path qw/ make_path remove_tree /;
use File::Basename qw/ fileparse /;
use LWP::Simple;
use File::Spec;
use HTML::FormatText;
use 5.010;

my $calls_dir2 = 'Bing/1Parsed/Html';
my $parsed_dir = 'Bing/1Parsed/Html2';

open(my $fh2, '<', 'parse1.txt') or die $!;
chomp(my @parse_terms1 = <$fh2>);
close($fh2);

open($fh2, '<', 'parse2.txt') or die $!;
for my $parse1 (@parse_terms1) {
    seek($fh2, 0, 0);
    while (my $parse2 = <$fh2>) {
        chomp($parse2);
        print "$parse1 $parse2\n";
        my $wanted = $parse1 . $parse2;

        my @files = glob "$calls_dir2/*.txt";
        printf "Got %d files\n", scalar @files;

        for my $file (@files) {
            open my $in_fh, '<', $file;
            my $basename = fileparse($file);
            my ($prefix) = $basename =~ /^(.{9})/;
            my $rnumber  = rand(1999);
            print $prefix, "\n";

            my @matches;
            while (<$in_fh>) {
                # save to look back at what I changed
                #push @matches, $_ if / \Q$keywords\E .* \Q$prefix\E /x;
                #push @matches, $_ if / \Q^*(.*)$keywords\s*(.*)\E .* \Q^*(.*)$prefix\s*(.*)\E /x;
                push @matches, $_ if / \Q$wanted\E /x;
            }

            #my $basename = fileparse($file);
            make_path($parsed_dir);
            open my $out_fh, '>', "$parsed_dir/${basename}.$wanted.$rnumber.txt";
            print $out_fh $_ for @matches;
            close $out_fh;
        }
    }
}

my $calls_dir3 = "Bing/1Parsed/Html2/";
opendir( my $search_dir2, $calls_dir3 ) or die "$!\n";
my @files = grep /\.txt$/i, readdir $search_dir2;
closedir $search_dir2;
print "Got ", scalar @files, " files\n";

# proxies
open my $fh9, '<', 'proxies.txt' or die $!;
chomp(my @proxies = <$fh9>);
close $fh9;

foreach my $file (@files) {
    my %seen = ();
    my $current_file = $calls_dir3 . $file;
    my $proxy = shift @proxies;
    print "Current proxy:$proxy\n";

    open my $FILE, '<', $current_file or die "$file: $!\n";
    make_path('Bing/1Parsed/Html3/');

    while (my $row = <$FILE>) {
        open my $fh1, ">", "Bing/1Parsed/Html3/$file.html" or die("Could not open file. $!");
        chomp $row;
        print "$row\n";
        my $xml1 = $row;
        $fh1->print($row);

        # create useragent
        my $ua = LWP::UserAgent->new;
        $ua->agent('Mozilla/8.0');

        # Use this UA/Proxy to fetch
        $ua->proxy(['http'], 'http://' . $proxy);
        my $xml2 = get $xml1;
        #$fh1->print ($xml1);
        $fh1->print("\n");
        #$fh1->print ($fh1.$xml2);
        #$fh1->print ($fh1);
        print $xml2;
        $fh1->print($xml2);
        #close $fh1, ">", "Ask/Parsed/Html/$file.html";
        close $fh1;
        $xml2 = 1;
    }
}
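In case it helps to see it in isolation, here's a stripped-down sketch of just the proxy handling from the second half of the script (same proxies.txt and Html2 directory as above; I've swapped the opendir/readdir for a glob only to keep it short):

#!/usr/bin/perl
use strict;
use warnings;

# Read the whole proxy list into @proxies, one address per line.
open my $fh9, '<', 'proxies.txt' or die $!;
chomp(my @proxies = <$fh9>);
close $fh9;

# One pass through the loop per file found in the Html2 directory.
my @files = glob 'Bing/1Parsed/Html2/*.txt';

foreach my $file (@files) {
    # shift takes one proxy off the front of @proxies on every
    # iteration, i.e. one proxy per file in @files.
    my $proxy = shift @proxies;
    print "File: $file  Current proxy: $proxy\n";
}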

Re: Array issue - pulling too many values
by Eily (Monsignor) on Feb 25, 2016 at 17:02 UTC

    How do I post a question effectively? Your code is really hard to read (or rather, discouraging), with useless variable names ($fh1, $fh2, $fh9: what's the difference?), useless variables (%seen), useless includes (and strictures are actually applied three times ...), and indentation that looks half random at best. Please clean up your code before you expect us to understand what it does.
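    For instance, a trimmed-down preamble that pulls in only what the posted code actually seems to use might look something like this (just a sketch, adjust it to whatever you end up keeping):

    #!/usr/bin/perl
    use strict;                          # once is enough
    use warnings;
    use autodie;                         # failed open/close die on their own
    use File::Basename qw( fileparse );  # $basename / $prefix
    use File::Path     qw( make_path );  # creating the output directories
    use LWP::Simple    qw( get );        # fetching each URL
    use LWP::UserAgent;                  # the proxy-aware agent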