#!/usr/bin/perl -w
# Fetch the leading part of selected SEC EDGAR filings and print one
# pipe-delimited record per filing:
#   cik|form_type|report_date|file_date|name|
use strict;
use warnings;

# NOTE(review): sec.gov may now expect https and a descriptive User-Agent
# header -- confirm against the current EDGAR access guidelines.
my $base_url = 'http://www.sec.gov/Archives';

# The download of each filing is aborted once more than this many bytes
# have arrived; only that leading part is searched for header fields.
my $max_header_bytes = 10_000;

# Filing paths, relative to $base_url.
my @aonly = qw{
    edgar/data/1122304/0001193125-15-118890.txt
    edgar/data/1640984/0001052918-16-000754.txt
    edgar/data/1640984/0001052918-16-000803.txt
    edgar/data/1084869/0001437749-16-024828.txt
    edgar/data/1084869/0001084869-16-000045.txt
    edgar/data/1084869/0001084869-16-000046.txt
    edgar/data/1084869/0001084869-16-000047.txt
    edgar/data/1084869/0000950162-16-000085.txt
    edgar/data/1511144/0001206774-16-004470.txt
    edgar/data/1665022/0001665022-16-000001.txt
    edgar/data/1662965/0001662965-16-000001.txt
    edgar/data/1651654/0001651654-16-000002.txt
    edgar/data/1664237/0001664237-16-000001.txt
    edgar/data/1664513/0001664513-16-000001.txt
    edgar/data/1665711/0001665711-16-000001.txt
    edgar/data/1665354/0001665354-16-000001.txt
    edgar/data/1664635/0001664635-16-000001.txt
    edgar/data/1625109/0001625109-16-000002.txt
    edgar/data/1658659/0001658659-16-000002.txt
    edgar/data/1666635/0001666635-16-000001.txt
    edgar/data/1614102/0001614102-16-000002.txt
    edgar/data/1665218/0001665218-16-000001.txt
    edgar/data/1663921/0000905729-16-000383.txt
    edgar/data/1666561/0001666561-16-000002.txt
    edgar/data/1668972/0001668972-16-000002.txt
    edgar/data/1540531/0000905718-16-001186.txt
    edgar/data/1540531/0000905718-16-001254.txt
};

my $FH_OUT = \*STDOUT;

# Output columns, in print order.
my @fields = qw/cik form_type report_date file_date name/;

# One pattern per output field.  Each is applied with /m to the whole
# downloaded text, so ^ and $ anchor at line boundaries inside it.
my %pattern_for = (
    cik         => qr/^\s*CENTRAL\s*INDEX\s*KEY:\s*(\d*)/m,
    form_type   => qr/^\s*FORM\s*TYPE:\s*(.*$)/m,
    report_date => qr/^\s*CONFORMED\s*PERIOD\s*OF\s*REPORT:\s*(\d*)/m,
    file_date   => qr/^\s*FILED\s*AS\s*OF\s*DATE:\s*(\d*)/m,
    name        => qr/^\s*COMPANY\s*CONFORMED\s*NAME:\s*(.*$)/m,
);

# Run the download loop only when executed as a script, so the subs below
# can be loaded (e.g. by a test) without touching the network.
main() unless caller;

# Fetch every filing in @aonly and print one record for each filing in
# which at least one header field was found.
sub main {
    for my $filetoget (@aonly) {
        my $res = get_process_trunc($filetoget);
        next if !%{$res};

        # Missing fields become empty columns; every column, including the
        # last, is followed by '|'.
        my @columns = map { defined $res->{$_} ? $res->{$_} : '' } @fields;
        print {$FH_OUT} join( '|', @columns ), "|\n";
    }
    return;
}

# get_process_trunc($filetoget)
#   $filetoget - filing path relative to $base_url.
# Downloads only the beginning of the filing and returns a hash reference
# with whichever of the keys cik, form_type, report_date, file_date and
# name were found (empty hash if nothing was found or the fetch failed).
# Technique from http://www.perlmonks.org/?node_id=1183107
sub get_process_trunc {
    my $filetoget = shift;
    my $fullfile  = "$base_url/$filetoget";

    # Loaded at run time so the parsing helper stays usable without LWP.
    require LWP::UserAgent;

    my $received_size = 0;
    my $partial       = '';
    my $ua            = LWP::UserAgent->new;
    my $response      = $ua->get(
        $fullfile,
        ':content_cb' => sub {
            my ($data) = @_;
            $partial       .= $data;
            $received_size += length $data;

            # die inside this callback interrupts the request, not the
            # program: LWP aborts the transfer and returns the response.
            die "header limit reached\n"
                if $received_size > $max_header_bytes;
        },
    );

    # Report HTTP-level failures instead of silently printing nothing.
    warn "failed to fetch $fullfile: " . $response->status_line . "\n"
        unless $response->is_success;

    return parse_edgar_header($partial);
}    # get_process_trunc

# parse_edgar_header($text)
#   $text - leading part of a filing, as one multi-line string.
# Returns a hash reference holding the first occurrence of each field in
# %pattern_for that appears in $text, with trailing whitespace (such as a
# carriage return) removed from the captured value.
sub parse_edgar_header {
    my ($text) = @_;
    my %value_of;
    return \%value_of if !defined $text || $text eq '';

    for my $field ( keys %pattern_for ) {
        if ( my ($value) = $text =~ $pattern_for{$field} ) {
            $value =~ s/\s+\z//;
            $value_of{$field} = $value;
        }
    }
    return \%value_of;
}    # parse_edgar_header
result
0001394872|4|20160119|20160121|Ambient Water Corp| 0001394872|5|20150422|20160209|Ambient Water Corp| 0001084869|10-Q|20151227|20160205|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160329|20160331|1 800 FLOWERS COM INC| 0001084869|4|20160202|20160331|1 800 FLOWERS COM INC| 0001084869|SC 13G/A||20160216|1 800 FLOWERS COM INC| 0001511144|13F-HR|20151231|20160212|10-15 ASSOCIATES, INC.| 0001665022|D||20160127|10-20 Channel Center REIT| 0001662965|D||20160106|100 Wall Investments LLC| 0001651654|D/A||20160127|100INSIGHTS, INC| 0001664237|D||20160212|1075 Weybridge Holding, LLC| 0001664513|D||20160205|10Stories, Inc.| 0001665711|D||20160210|11 Madison Investor II LLC| 0001665354|D||20160210|11 Madison Investor LLC| 0001664635|D||20160223|11 Pine, Inc.| 0001625109|D||20160226|11 Roniin, LLC| 0001658659|D/A||20160113|110 Corcoran Property Partners, LLC| 0001666635|D||20160210|1111 Broadway Distribution, LLC| 0001614102|D||20160301|1125 North Fairfax LLC| 0001665218|D||20160128|114 REIT LP| 0001663921|D||20160126|1143 Highland Drive, LLC| 0001666561|D||20160211|1155 Boulder, LLC| 0001668972|D||20160323|11619 Euclid, LLC| 0001540531|13F-HR|20151231|20160216|12 West Capital Management LP| 0001386301|SC 13G||20160307|Research Solutions, Inc.|
I thought I recognized this type of data.
Note the reference to @Discipulus at "Re: Split web page, first 30 lines only -- :content_cb trick".


In reply to Re^3: Getstore to avoid of memory? by huck
in thread Getstore to avoid of memory? by wrkrbeee

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post, it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, details, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, summary, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.