Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:

Hi Monks. I'm working on a script that uses Mojo::UserAgent to grab some webpages and parse them for keywords. I'm using proxies to do the search. The script reads two text files to get the search terms and another file to get the proxy addresses. The problem, however, is that I have an error in my code: rather than pulling one proxy address, I pull a much larger number. The script still runs and I get the results I was expecting, but the misuse of the array and pulling 10-20 proxy values instead of 1 is making my computer grind to a halt because I'm using too much memory. I will continue to work on this myself, but I can't seem to nail down why so many proxy values are being pulled. Any help or guidance is greatly appreciated. Here's my code:

#!/usr/bin/perl
$ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0;
use strict;
use warnings;
use Mozilla::CA;
use Tcl::Tk;
use strict;
use Cwd 'chdir';
use File::Basename;
use File::Find;
use HTTP::Headers;
use HTML::HeadParser;
use Text::CSV;
use HTML::Tidy;
use LWP::Simple;
use LWP::UserAgent;
use LWP::Protocol::https;
use feature 'say';
use File::Slurp 'slurp';    # makes it easy to read files.
use Mojo;
use Mojo::UserAgent;
use URI;
use Mojo::Collection;
use autodie;
use File::Path qw/ make_path remove_tree /;
use File::Basename qw/ fileparse /;
use LWP::Simple;
use File::Spec;
use HTML::FormatText;
use 5.010;

my $calls_dir2 = 'Bing/1Parsed/Html';
my $parsed_dir = 'Bing/1Parsed/Html2';

open(my $fh2, '<', 'parse1.txt') or die $!;
chomp(my @parse_terms1 = <$fh2>);
close($fh2);

open($fh2, '<', 'parse2.txt') or die $!;
for my $parse1 (@parse_terms1) {
    seek($fh2, 0, 0);
    while (my $parse2 = <$fh2>) {
        chomp($parse2);
        print "$parse1 $parse2\n";
        my $wanted = $parse1 . $parse2;

        my @files = glob "$calls_dir2/*.txt";
        printf "Got %d files\n", scalar @files;

        for my $file (@files) {
            open my $in_fh, '<', $file;
            my $basename = fileparse($file);
            my ($prefix) = $basename =~ /^(.{9})/;
            my $rnumber  = rand(1999);
            print $prefix, "\n";

            my @matches;
            while (<$in_fh>) {
                # save to look back at what I changed
                #push @matches, $_ if / \Q$keywords\E .* \Q$prefix\E /x;
                #push @matches, $_ if / \Q^*(.*)$keywords\s*(.*)\E .* \Q^*(.*)$prefix\s*(.*)\E /x;
                push @matches, $_ if / \Q$wanted\E /x;
            }

            #my $basename = fileparse($file);
            make_path($parsed_dir);
            open my $out_fh, '>', "$parsed_dir/${basename}.$wanted.$rnumber.txt";
            print $out_fh $_ for @matches;
            close $out_fh;
        }
    }
}

my $calls_dir3 = "Bing/1Parsed/Html2/";
opendir( my $search_dir2, $calls_dir3 ) or die "$!\n";
my @files = grep /\.txt$/i, readdir $search_dir2;
closedir $search_dir2;
print "Got ", scalar @files, " files\n";

# proxies
open my $fh9, '<', 'proxies.txt' or die $!;
chomp(my @proxies = <$fh9>);
close $fh9;

foreach my $file (@files) {
    my %seen = ();
    my $current_file = $calls_dir3 . $file;
    my $proxy = shift @proxies;
    print "Current proxy:$proxy\n";

    open my $FILE, '<', $current_file or die "$file: $!\n";
    make_path('Bing/1Parsed/Html3/');

    while (my $row = <$FILE>) {
        open my $fh1, ">", "Bing/1Parsed/Html3/$file.html" or die("Could not open file. $!");
        chomp $row;
        print "$row\n";
        my $xml1 = $row;
        $fh1->print($row);

        # create useragent
        my $ua = LWP::UserAgent->new;
        $ua->agent('Mozilla/8.0');

        # Use this UA/Proxy to fetch
        $ua->proxy(['http'], 'http://' . $proxy);
        my $xml2 = get $xml1;
        #$fh1->print ($xml1);
        $fh1->print("\n");
        #$fh1->print ($fh1.$xml2);
        #$fh1->print ($fh1);
        print $xml2;
        $fh1->print($xml2);
        #close $fh1, ">", "Ask/Parsed/Html/$file.html";
        close $fh1;
        $xml2 = 1;
    }
}
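In case it helps to see it in isolation, here's a stripped-down sketch of just the proxy handling from the second half of the script (same proxies.txt and Html2 directory as above; I've swapped the opendir/readdir for a glob only to keep it short):

#!/usr/bin/perl
use strict;
use warnings;

# Read the whole proxy list into @proxies, one address per line.
open my $fh9, '<', 'proxies.txt' or die $!;
chomp(my @proxies = <$fh9>);
close $fh9;

# One pass through the loop per file found in the Html2 directory.
my @files = glob 'Bing/1Parsed/Html2/*.txt';

foreach my $file (@files) {
    # shift takes one proxy off the front of @proxies on every
    # iteration, i.e. one proxy per file in @files.
    my $proxy = shift @proxies;
    print "File: $file  Current proxy: $proxy\n";
}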

Re: Array issue - pulling too many values
by Eily (Monsignor) on Feb 25, 2016 at 17:02 UTC

    How do I post a question effectively? Your code is really hard to read (or rather, discouraging), with useless variable names ($fh1, $fh2, $fh9: what's the difference?), useless variables (%seen), useless includes (and strictures are actually applied three times ...), and indentation that looks half random at best. Please clean up your code before you expect us to understand what it does.
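    For instance, a trimmed-down preamble that pulls in only what the posted code actually seems to use might look something like this (just a sketch, adjust it to whatever you end up keeping):

    #!/usr/bin/perl
    use strict;                          # once is enough
    use warnings;
    use autodie;                         # failed open/close die on their own
    use File::Basename qw( fileparse );  # $basename / $prefix
    use File::Path     qw( make_path );  # creating the output directories
    use LWP::Simple    qw( get );        # fetching each URL
    use LWP::UserAgent;                  # the proxy-aware agent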