#!/usr/bin/perl
#
# Two-stage scraper helper.
#
# Stage 1: for every combination of a term from parse1.txt and a term from
#          parse2.txt, scan each *.txt file in Bing/1Parsed/Html and copy the
#          lines containing the concatenated term into a new file under
#          Bing/1Parsed/Html2.
# Stage 2: treat every line of every *.txt file in Bing/1Parsed/Html2 as a
#          URL, fetch it (through a proxy taken from proxies.txt) and store
#          the URL followed by the response body in Bing/1Parsed/Html3.

use 5.010;
use strict;
use warnings;
use autodie;
use feature 'say';

use Cwd 'chdir';
use File::Basename;
use File::Find;
use File::Path qw/ make_path remove_tree /;
use File::Slurp 'slurp';    # makes it easy to read files.
use File::Spec;
use HTML::FormatText;
use HTML::HeadParser;
use HTML::Tidy;
use HTTP::Headers;
use LWP::Protocol::https;
use LWP::Simple;
use LWP::UserAgent;
use Mojo;
use Mojo::Collection;
use Mojo::UserAgent;
use Mozilla::CA;
use Tcl::Tk;
use Text::CSV;
use URI;

# NOTE(review): this disables TLS hostname verification for every LWP
# request made by this script. Kept because the original relied on it, but
# it makes HTTPS fetches vulnerable to man-in-the-middle attacks.
$ENV{'PERL_LWP_SSL_VERIFY_HOSTNAME'} = 0;

# ---------------------------------------------------------------------------
# Stage 1: filter the source files by every parse1/parse2 term combination.
# ---------------------------------------------------------------------------

my $calls_dir2 = 'Bing/1Parsed/Html';
my $parsed_dir = 'Bing/1Parsed/Html2';

my @parse_terms1 = read_lines('parse1.txt');
my @parse_terms2 = read_lines('parse2.txt');

# The input directory is never written to by this stage, so the file list
# is the same for every term combination; compute it once.
my @source_files = glob "$calls_dir2/*.txt";

# Create the output directory up front so that stage 2 can always open it,
# even when stage 1 produces no files.
make_path($parsed_dir);

for my $parse1 (@parse_terms1) {
    for my $parse2 (@parse_terms2) {
        print "$parse1 $parse2\n";
        my $wanted = $parse1 . $parse2;

        printf "Got %d files\n", scalar @source_files;

        for my $file (@source_files) {
            open my $in_fh, '<', $file;

            my $basename = fileparse($file);

            # First nine characters of the file name (diagnostic output
            # only); empty when the name is shorter than that.
            my ($prefix) = $basename =~ /^(.{9})/;
            $prefix //= '';

            # Random suffix keeps output names from colliding.
            my $rnumber = rand(1999);
            print $prefix, "\n";

            my @matches;
            while ( my $line = <$in_fh> ) {

                #save to look back at what I changed
                #push @matches, $_ if / \Q$keywords\E .* \Q$prefix\E /x;
                #push @matches, $_ if / \Q^*(.*)$keywords\s*(.*)\E .* \Q^*(.*)$prefix\s*(.*)\E /x;
                push @matches, $line if $line =~ / \Q$wanted\E /x;
            }
            close $in_fh;

            open my $out_fh, '>',
                "$parsed_dir/${basename}.$wanted.$rnumber.txt";
            print {$out_fh} $_ for @matches;
            close $out_fh;
        }
    }
}

# ---------------------------------------------------------------------------
# Stage 2: fetch every URL listed in the filtered files.
# ---------------------------------------------------------------------------

my $calls_dir3 = "Bing/1Parsed/Html2/";
my $html_dir   = 'Bing/1Parsed/Html3/';

opendir( my $search_dir2, $calls_dir3 ) or die "$!\n";
my @files = grep /\.txt$/i, readdir $search_dir2;
closedir $search_dir2;
print "Got ", scalar @files, " files\n";

# proxies (blank lines are ignored)
my @proxies = grep { length } read_lines('proxies.txt');
warn "proxies.txt contains no proxies; fetching directly\n"
    unless @proxies;

make_path($html_dir);

foreach my $file (@files) {
    my $current_file = $calls_dir3 . $file;

    # Rotate through the proxy list so it never runs dry when there are
    # more files than proxies.
    my $proxy;
    if (@proxies) {
        $proxy = shift @proxies;
        push @proxies, $proxy;
    }
    print "Current proxy:", ( defined $proxy ? $proxy : '' ), "\n";

    # create useragent
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/8.0');

    # Use this UA/Proxy to fetch. Only the http scheme is proxied, exactly
    # as the original configuration requested.
    $ua->proxy( ['http'], 'http://' . $proxy ) if defined $proxy;

    my @rows = read_lines($current_file);
    next unless @rows;    # nothing to fetch, do not create an empty file

    # Open the output once per input file: reopening with '>' for each row
    # would truncate it and keep only the last row's result.
    open my $fh1, '>', "$html_dir$file.html";

    my $wrote_entry = 0;
    for my $row (@rows) {
        print "$row\n";

        # Separate consecutive entries; a single-row file keeps the
        # original "URL\nbody" layout.
        print {$fh1} "\n" if $wrote_entry;
        print {$fh1} $row, "\n";
        $wrote_entry = 1;

        # Fetch through the configured user agent so the agent string and
        # proxy are actually applied (LWP::Simple::get ignores $ua).
        my $response = $ua->get($row);
        if ( $response->is_success ) {
            my $body = $response->decoded_content;
            $body //= '';
            print $body;
            print {$fh1} $body;
        }
        else {
            warn "Failed to fetch $row: ", $response->status_line, "\n";
        }
    }

    close $fh1;
}

# Read a text file and return its lines with trailing newlines removed.
# Dies (via autodie) when the file cannot be opened.
sub read_lines {
    my ($path) = @_;

    open my $fh, '<', $path;
    chomp( my @lines = <$fh> );
    close $fh;

    return @lines;
}