#!/usr/bin/perl
use strict;
use warnings;
# Scan a saved HTML page line by line and emit one CSV row for every line
# that mentions a .txt, .pdf or .html resource. Row layout:
#
#     <label>,<txt column>,<pdf column>,<html column>
#
# Exactly one of the three trailing columns is filled per row; the other two
# are left empty. Rows are collected first and printed to STDOUT at the end.
#
# NOTE(review): the *entire* matching input line is stored in the column, not
# just the URL inside it. If only the URL is wanted, it has to be extracted
# from the href/onclick attribute before the row is built.

# Earlier revision downloaded the page with WWW::Mechanize; kept for reference.
#use WWW::Mechanize ;
#my $mech = WWW::Mechanize->new();
#my $file_content = "D:/Perl/Perl output/garbage_content.csv" ;
# my $get_file = "http://securities.stanford.edu/filings-case.html?id=101092";
# my $response = $mech->get($get_file,
# ':content_file'=> $file_content,);

my $file_content = "1189436input.txt";      # saved page to scan
my $filename     = "file_output_101092";    # label written as first CSV column

# $! holds the OS error for a failed open ($? is the child-process status).
open my $fh, '<', $file_content
    or die "Cannot open '$file_content' for reading: $!";

my @output;
while (my $line = <$fh>) {
    chomp $line;

    # Fresh, empty columns for every input line.
    my ($txt_url, $pdf_url, $html_url) = ('', '', '');

    # First matching extension wins: .txt, then .pdf, then .html.
    if (index($line, ".txt") >= 0) {
        $txt_url = $line;
    }
    elsif (index($line, ".pdf") >= 0) {
        $pdf_url = $line;
    }
    elsif (index($line, ".html") >= 0) {
        $html_url = $line;
    }

    # Logical (not bitwise) test: skip lines that matched nothing.
    next unless $txt_url || $pdf_url || $html_url;

    push @output, join ",", $filename, $txt_url, $pdf_url, $html_url;
}
close $fh;

print "$_\n" for @output;
I get this output:
file_output_101092,,, <a href="index.html">Home</a>
file_output_101092,,, <a href="filings.html">Filings Da
+tabase</a>
file_output_101092,,, <a href="resources.html">Resource
+s</a>
file_output_101092,,, <a href="litigation-activity-indi
+ces.html">Litigation Activity Indices</a>
file_output_101092,,, <a href="clearinghouse-research.h
+tml">Clearinghouse Research</a>
file_output_101092,,, <a href="about-the-scac.html">Abo
+ut</a>
file_output_101092,,, <!-- <div style="float: left;positi
+on: relative;top: 5px;"><a href="filings.html">Browse Filings Databas
+e</a></div> -->
file_output_101092,,, <!-- <div style="float: left;positi
+on: relative;top: 5px;"><a href="filings.html">Browse Filings Databas
+e</a></div> -->
file_output_101092,,,
+ <tr class
+="table-link" onclick="window.location='filings-documents/1010/ALTSE9
+8/001.html'" target="_blank">
file_output_101092,
+ <tr class="table-link" onclick="window.location='filings-document
+s/1010/ALTSE98/000.txt'" target="_blank">,,
file_output_101092,,
+ <tr class
+="table-link" onclick="window.location='filings-documents/1010/ALTSE9
+8/19981029_r04c_9800528.pdf'" target="_blank">,
file_output_101092,,
+ <tr class="table-link" onclick="window.location='filings-document
+s/1010/ALTSE98/1999730_r07o_98CV00528.pdf'" target="_blank">,
file_output_101092,,
+ <tr class="table-link" onclick="window.location='filings-document
+s/1010/ALTSE98/1999830_r04c_98CV00528.pdf'" target="_blank">,
file_output_101092,,
+ <tr class="table-link" onclick="window.location='filings-document
+s/1010/ALTSE98/2002726_r03k_9800528.pdf'" target="_blank">,
file_output_101092,,, <div class="span12"><p><a href="filings-case-p
+ages-url-map.html" style="color: #ffffff"><strong>Cases</strong></a><
+/p></div>
file_output_101092,,, <legend style="margin
+-left: 0px;margin-bottom: 0px"><a href="about-the-scac.html" style="c
+olor: #565656">About</a></legend>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#about" style="color: #151616">About Us</a>
+</dd>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#methodology" style="color: #151616">Method
+ology</a></dd>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#faq" style="color: #151616">FAQ</a></dd>
file_output_101092,,, <dd st
+yle="color: #565656"><a href="about-the-scac.html#sponsors" style="co
+lor: #151616">Sponsors & Partners</a></dd>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#register" style="color: #151616">Register<
+/a></dd>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#contacts" style="color: #151616">Contact U
+s</a></dd>
file_output_101092,,, <dd><a
+ href="about-the-scac.html#legal" style="color: #151616">Legal Notice
+s</a></dd>
file_output_101092,,, <!-- <
+dd><a href="about-the-scac.html#sitemap" style="color: #151616">Site
+Map</a></dd> -->
I assume that the few "duplicates" in the output come from multiple references to the same page, or from "#" fragment references.
|