in reply to Re^4: Why does this code continue to loop?
in thread Why does this code continue to loop?

When I run this:

#!/usr/bin/perl
# Scan a saved copy of a web page line by line and print one CSV row for
# every line that mentions a ".txt", ".pdf" or ".html" reference.
#
# Row layout:  <filename>,<txt line>,<pdf line>,<html line>
# Exactly one of the last three columns is filled per row; the other two
# are empty strings.  Note that the *whole input line* is stored, not just
# the URL inside it, and that no CSV quoting is applied.
use strict;
use warnings;

#use WWW::Mechanize;
#my $mech = WWW::Mechanize->new();

#my $file_content = "D:/Perl/Perl output/garbage_content.csv";
my $file_content = "1189436input.txt";
# my $get_file = "http://securities.stanford.edu/filings-case.html?id=101092";
my $filename = "file_output_101092";
# my $response = $mech->get($get_file,
#     ':content_file' => $file_content,);

my @output;

# $! carries the OS error for a failed open; $? is the status of the last
# child process and says nothing about why the open failed.
open my $fh, '<', $file_content
    or die "Cannot open '$file_content': $!";

while (my $line = <$fh>) {
    chomp $line;

    # Declared inside the loop so every line starts with empty columns;
    # a value from a previous line can never leak into the next row.
    my ($txt_url, $pdf_url, $html_url) = ('', '', '');

    # First match wins, in the order .txt, .pdf, .html.
    if (index($line, ".txt") >= 0) {
        $txt_url = $line;
    }
    elsif (index($line, ".pdf") >= 0) {
        $pdf_url = $line;
    }
    elsif (index($line, ".html") >= 0) {
        $html_url = $line;
    }

    # Logical "or": emit a row only when one of the columns was filled.
    if ($txt_url || $pdf_url || $html_url) {
        push @output, join ",", $filename, $txt_url, $pdf_url, $html_url;
    }
}

close $fh;

print "$_\n" for @output;
I get this:
file_output_101092,,, <a href="index.html">Home</a> file_output_101092,,, <a href="filings.html">Filings Da +tabase</a> file_output_101092,,, <a href="resources.html">Resource +s</a> file_output_101092,,, <a href="litigation-activity-indi +ces.html">Litigation Activity Indices</a> file_output_101092,,, <a href="clearinghouse-research.h +tml">Clearinghouse Research</a> file_output_101092,,, <a href="about-the-scac.html">Abo +ut</a> file_output_101092,,, <!-- <div style="float: left;positi +on: relative;top: 5px;"><a href="filings.html">Browse Filings Databas +e</a></div> --> file_output_101092,,, <!-- <div style="float: left;positi +on: relative;top: 5px;"><a href="filings.html">Browse Filings Databas +e</a></div> --> file_output_101092,,, + <tr class +="table-link" onclick="window.location='filings-documents/1010/ALTSE9 +8/001.html'" target="_blank"> file_output_101092, + <tr class="table-link" onclick="window.location='filings-document +s/1010/ALTSE98/000.txt'" target="_blank">,, file_output_101092,, + <tr class +="table-link" onclick="window.location='filings-documents/1010/ALTSE9 +8/19981029_r04c_9800528.pdf'" target="_blank">, file_output_101092,, + <tr class="table-link" onclick="window.location='filings-document +s/1010/ALTSE98/1999730_r07o_98CV00528.pdf'" target="_blank">, file_output_101092,, + <tr class="table-link" onclick="window.location='filings-document +s/1010/ALTSE98/1999830_r04c_98CV00528.pdf'" target="_blank">, file_output_101092,, + <tr class="table-link" onclick="window.location='filings-document +s/1010/ALTSE98/2002726_r03k_9800528.pdf'" target="_blank">, file_output_101092,,, <div class="span12"><p><a href="filings-case-p +ages-url-map.html" style="color: #ffffff"><strong>Cases</strong></a>< +/p></div> file_output_101092,,, <legend style="margin +-left: 0px;margin-bottom: 0px"><a href="about-the-scac.html" style="c +olor: #565656">About</a></legend> file_output_101092,,, <dd><a + href="about-the-scac.html#about" style="color: #151616">About 
Us</a> +</dd> file_output_101092,,, <dd><a + href="about-the-scac.html#methodology" style="color: #151616">Method +ology</a></dd> file_output_101092,,, <dd><a + href="about-the-scac.html#faq" style="color: #151616">FAQ</a></dd> file_output_101092,,, <dd st +yle="color: #565656"><a href="about-the-scac.html#sponsors" style="co +lor: #151616">Sponsors &amp; Partners</a></dd> file_output_101092,,, <dd><a + href="about-the-scac.html#register" style="color: #151616">Register< +/a></dd> file_output_101092,,, <dd><a + href="about-the-scac.html#contacts" style="color: #151616">Contact U +s</a></dd> file_output_101092,,, <dd><a + href="about-the-scac.html#legal" style="color: #151616">Legal Notice +s</a></dd> file_output_101092,,, <!-- < +dd><a href="about-the-scac.html#sitemap" style="color: #151616">Site +Map</a></dd> -->
I assume that the few "duplicates" in that output come from multiple references to the same page, or from "#" (fragment) references.

Replies are listed 'Best First'.
Re^6: Why does this code continue to loop?
by rachard11 (Acolyte) on May 03, 2017 at 21:03 UTC
    Yes, I see that the problem was that the $txt_url, $html_url, and $pdf_url variables were never reset to null when the new input did not have a txt, html, or pdf reference. Thus, those variables were re-inserted into the array every time a new line was read in. Your code solves the problem. I appreciate the help!