server: http://rswatch page1: /RSData.aspx # form with field 'miljoe' page2: /RSData.aspx?miljoe=UDV # form with field ''TextBoxProductID' and button 'Button1' #### POST /RSData.aspx?miljoe=UDV HTTP/1.1 TE: deflate,gzip;q=0.3 Connection: TE Authorization: Basic S01EXHo2YW5kOno2YW5keXl5 Host: rswatch User-Agent: libwww-perl/5.805 Content-Length: 151 Content-Type: application/x-www-form-urlencoded DropDownListType=-TextBoxGUID&-=TextBoxUserName&-=TextBoxKommunenr&-=TextBoxProductID&KMD.NI.DPSagsbehandler=TextBoxShortText&-=Button1&Opdater+filter= HTTP/1.1 200 OK Date: Mon, 25 Dec 2006 14:25:36 GMT Server: Microsoft-IIS/6.0 MicrosoftOfficeWebServer: 5.0_Pub X-Powered-By: ASP.NET X-AspNet-Version: 1.1.4322 Set-Cookie: ASP.NET_SessionId=4dbkrgn4idtdwbuptotubemu; path=/ Cache-Control: private Content-Type: text/html; charset=utf-8 Content-Length: 11883 RSWatch - UDV
RSWatch - UDV 
<--    -->      -->>       
#### POST /RSData.aspx?miljoe=UDV HTTP/1.1 Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */* Referer: http://rswatch/RSData.aspx?miljoe=UDV Accept-Language: da Content-Type: application/x-www-form-urlencoded Accept-Encoding: gzip, deflate User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; InfoPath.1) Host: rswatch Content-Length: 0 Connection: Keep-Alive Cache-Control: no-cache Cookie: ASP.NET_SessionId=0i4zi0q0uag51lypzvg4m0va Authorization: Negotiate TlRMTVNTUAABAAAAB4IIogAAAAAAAAAAAAAAAAAAAAAFASgKAAAAD0== HTTP/1.1 401 Unauthorized Content-Length: 83 Content-Type: text/html Server: Microsoft-IIS/6.0 WWW-Authenticate: Negotiate TlRMTVNTUAACAAAABgAGADgAAAAFgomixPDhPomZ5sYAAAAAAAAAAI4AjgA+AAAABQLODgAAAA9LAE0ARAACAAYASwBNAEQAAQAQAE8ARABTAFcARQBCADAAMQAEABoAaQBuAHQAZQByAG4ALgBrAG0AZAAuAGQAawADACwATwBEAFMAVwBFAEIAMAAxAC4AaQBuAHQAZQByAG4ALgBrAG0AZAAuAGQAawAFABoAaQBuAHQAZQByAG4ALgBrAG0AZAAuAGQAawAAAAAA MicrosoftOfficeWebServer: 5.0_Pub X-Powered-By: ASP.NET Date: Mon, 25 Dec 2006 17:43:04 GMT ErrorError: Access is Denied. 
POST /RSData.aspx?miljoe=UDV HTTP/1.1 Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */* Referer: http://rswatch/RSData.aspx?miljoe=UDV Accept-Language: da Content-Type: application/x-www-form-urlencoded Accept-Encoding: gzip, deflate User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; InfoPath.1) Host: rswatch Content-Length: 368 Connection: Keep-Alive Cache-Control: no-cache Cookie: ASP.NET_SessionId=0i4zi0q0uag51lypzvg4m0va Authorization: Negotiate TlRMTVNTUAADAAAAGAAYAGQAAAAYABgAfAAAAAYABgBIAAAACgAKAE4AAAAMAAwAWAAAAAAAAACUAAAABYKIogUBKAoAAAAPSwBNAEQAWgA2AEEATgBEAEgAMgA0ADkANgA0AG2gazXZgVp0AAAAAAAAAAAAAAAAAAAAAC4sefx6XWUzFigAY3IxHngpT+49JULFTA== __VIEWSTATE=dDwxMTA1MDg5NDkzO3Q8O2w8aTwxPjtpPDM%2BOz47bDx0PHA8cDxsPFRleHQ7PjtsPFw8dGl0bGVcPlJTV2F0Y2ggLSBVRFZcPC90aXRsZVw%2BOz4%2BOz47Oz47dDxwPHA8bDxUZXh0Oz47bDxVRFY7Pj47Pjs7Pjs%2BPjs%2BQtBaNAQOnC4Eqk2prlcPA4K8wqw%3D&DropDownListType=-&TextBoxGUID=-&TextBoxUserName=-&TextBoxKommunenr=-&TextBoxProductID=-&TextBoxShortText=KMD.NI.DPSagsbehandler&Button1=Opdater+filter HTTP/1.1 200 OK Date: Mon, 25 Dec 2006 17:43:18 GMT Server: Microsoft-IIS/6.0 MicrosoftOfficeWebServer: 5.0_Pub X-Powered-By: ASP.NET X-AspNet-Version: 1.1.4322 Cache-Control: private Content-Type: text/html; charset=utf-8 Content-Length: 11823 RSWatch - UDV
RSWatch - UDV 
<--          -->>       
####
### Arg parsing, Initialization, IO setup cut out here...
###
### The subs below share these variables, which are declared in the part of
### the script that was cut out:
###   $ua        shared LWP::UserAgent, created lazily by do_POST
###   %args      command-line arguments (E, S, F and T are read here)
###   $browsing  true while no in-range row has been reached yet
###   $done      true once traversal should stop

### ======================================================================
### do_POST -- Params:
###   the URL, (odsweb01.kmd.dk [172.31.88.103]: http://rswatch/RSData.aspx)
###   an arrayref or hashref for the key/value pairs,
###   optionally: any header lines: (key,value, key,value)
### Returns:
###   list context  : (content, status_line, is_success, response object)
###   scalar context: content on success, undef on failure
### ======================================================================
sub do_POST {
    # Create the user agent on first use only; every later call reuses it,
    # so all requests go through the same agent and cookie jar.
    if ( !$ua ) {
        $ua = LWP::UserAgent->new( keep_alive => 1, parse_head => 0 );
        $ua->credentials( 'rswatch:80', 'rswatch', "KMD\\z6and", 'xxxxxxx' );
        $ua->default_header(
            'Referer' => "http://rswatch/RSData.aspx?miljoe=$args{E}" );
        $ua->default_header( 'Accept-Language' => 'da' );

        # Let LWP follow redirects that answer a POST.
        push @{ $ua->requests_redirectable }, 'POST';
        $ua->cookie_jar( {} );
        $ua->env_proxy();
    }

    # All arguments are handed to LWP::UserAgent::post unchanged.
    my $resp = $ua->post(@_);

    return ( $resp->content, $resp->status_line, $resp->is_success, $resp )
        if wantarray;
    return unless $resp->is_success;
    return $resp->content;
}

### ======================================================================
### do_RSbase : Parse RSwatch DB by traversing <-- ('forrige') link chain
### ======================================================================
### Termination: sub not_interesting :'$done' when ($S < $args{T}), cf.
###              sub set_args        : $tw = "20051103151100"; # 1.log date
### do_page also raises $done when a page has no back-link.
sub do_RSbase {
    # Start in Browsing mode
    $browsing = 1;
    print "Browsing page:\n";

    # Parse the first page, then each previous page, until $done is raised.
    # NOTE(review): the start URL hard-codes miljoe=UDV while the Referer
    # header in do_POST uses $args{E} -- confirm whether this is intended.
    my $previous = "http://rswatch/RSData.aspx?miljoe=UDV";
    for ( my $p = 1; !$done; ) {
        print ">" . $p++ . "\n";
        usleep( $args{S} );                 # Pause and...
        $previous = do_page($previous);     # parse previous page.
    }
}

### ======================================================================
### do_page : Parse RSWatch page
### ======================================================================
### Param  : URL of the page to fetch.
### Returns: absolute URL of the previous page, or 0 (with $done raised)
###          when the page carries no back-link.
### Dies   : when the HTTP request fails or the log table is missing.
sub do_page {
    # --- Fetch page (1.page & back-links)
    my $url = shift;

    # A single empty arrayref: the page is POSTed with an empty form.
    my @parms = ( [] );

    # Disabled in the original (it sat between two '=cut' lines) with the
    # remark "this doesn't work...":
    #   my @parms = [
    #       'TextBoxProductID' => 'KMD.NI.DPSagsbehandler',
    #       'Button1'          => 'Opdater filter',
    #   ];

    my ( $content, $message, $is_success ) = do_POST( $url, @parms );
    die "***ERROR: HTTP to $url:\r\n\t$message\n" unless $is_success;

    # --- Decode & Parse page
    my $root = HTML::TreeBuilder->new;
    $content = decode( "utf8", $content );
    $root->parse($content);
    $root->eof;    # finish parsing BEFORE the tree is queried

    # --- Extract page backlink (absent on the last page)
    my $node_prev = $root->find_by_attribute( "id", "HyperLink2" );
    my $link_prev = $node_prev ? $node_prev->attr("href") : undef;

    # --- Process main log table (the third <table> of the page)
    my @tables = $root->find_by_tag_name('table');
    if ( !$tables[2] ) {
        $root->delete;
        die "***ERROR: no log table found in page $url\n";
    }
    my @table_rows = $tables[2]->find_by_tag_name('tr');
    do_summary( \@table_rows );

    # --- Free parse resources
    $root->delete;

    # --- Last page: nothing further to follow, so stop the traversal
    if ( !defined $link_prev ) {
        $done = 1;
        return 0;
    }

    # --- Return link to previous page
    return "http://rswatch/" . $link_prev;
}

### ======================================================================
### do_summary : Parse RSWatch log summary table
### ======================================================================

### ----------------------------------------------------------------------
### Raise flags: !browsing if past -f(rom); $done if past -t(o).
### Param  : ref to the array of <td> elements of one log row; cell [4]
###          holds the timestamp.
### Returns: 1 when the row must be skipped, empty/undef when it is wanted.
sub not_interesting {
    my $r_table_cells = shift;

    # Strip '-', ':' and ' ' so the timestamp compares as a plain number
    # (presumably YYYYMMDDhhmmss, like the $tw example -- TODO confirm).
    my $S = $r_table_cells->[4]->as_text;
    $S =~ s/[-: ]//g;

    # Newer than -f(rom): keep browsing, skip the row.
    if ( $S > $args{F} ) { $browsing ||= 1; return 1; }

    # Older than -t(o): skip the row and stop the whole traversal.
    if ( $args{T} > $S ) { $done = 1; return 1; }

    # First row inside the window: leave browsing mode.
    if ($browsing) { $browsing = 0; print "\n"; }

    return;    # 0: Interesting!
}

### ----------------------------------------------------------------------
### Parse each log $row to @log_record table on page
### Param  : ref to the array of <tr> elements of the log table (the first
###          row is the header and is discarded).
sub do_summary {
    my $r_table_rows = shift;                # ref param
    my @table_rows   = @{$r_table_rows};     # private copy; caller's array stays intact
    shift(@table_rows);                      # discard header row

  ROW:    # --- Process each $row to @log_record
    foreach my $row (@table_rows) {
        return if $done;

        my @table_cells = $row->find_by_tag_name('td');

        # Only rows whose sixth cell mentions the wanted component.
        next ROW
            unless exists( $table_cells[5] )
            && $table_cells[5]->as_text =~ /DPSagsbehandler/i;    # TODO:read from config

        # --- If interesting: build @log_record from HTML
        next ROW if not_interesting( \@table_cells );    # Skip out-of-bounds
        my @log_record = map { $_->as_text } @table_cells;

        # --- If E(rror): process row details and push on @log_record
        my $type = $table_cells[1]->as_text;    # [E(rror)|S|W|R|T]
        if ( $type =~ /E/i ) {
            my $anchor = $table_cells[0]->find_by_tag_name('a');
            my $href   = $anchor ? $anchor->attr('href') : undef;
            if ( defined $href ) {
                my $details = do_details( "http://rswatch/" . $href );
                push @log_record, $details;
            }
            else {
                # No link to follow: still print the record, without details.
                warn "***WARNING: error row without details link; details skipped\n";
            }
        }

        # --- Reformat and print @log_record to file (tee to STDOUT)
        print_record( \@log_record );
    }
}

### ======================================================================
### do_details : Parse RSWatch details
### ======================================================================
### Param  : URL of the details page of one error row.
### Returns: text of the first valign="top" element in the second row of
###          the fourth <table> of that page.
### Dies   : when the HTTP request fails or that element cannot be found.
sub do_details {
    # --- Fetch details page for $url
    my $url = shift;
    my ( $content, $message, $is_success ) = do_POST( $url, [] );
    die "***ERROR: POST to $url:\r\n\t$message\n" unless $is_success;

    # --- Decode & Parse page
    my $root = HTML::TreeBuilder->new;
    $content = decode( "utf8", $content );
    $root->parse($content);
    $root->eof;    # finish parsing BEFORE the tree is queried

    # --- Retrieve details text
    my @tables     = $root->find_by_tag_name('table');
    my @table_rows = $tables[3] ? $tables[3]->find_by_tag_name('tr') : ();
    shift(@table_rows);    # discard table header
    my $cell = @table_rows
        ? $table_rows[0]->find_by_attribute( "valign", "top" )
        : undef;
    my $details = $cell ? $cell->as_text() : undef;

    # --- Free parse resources
    $root->delete;

    die "***ERROR: no details text found in $url\n" unless defined $details;
    return $details;
}
### ====================================================================== ### print_record : Print one log record ### ====================================================================== sub print_record { my $r_log_record = shift; # ref param my @log_record = @{$r_log_record}; # cast to array # --- Reformat log record $log_record[4] =~ s/ /#/; # seperate date,time in TimeStamp for my $i (1..2) { shift(@log_record); } # discard FejllogId & Type my @print_record; push @print_record, split('#', $log_record[2]); # TimeStamp date and time push @print_record, ""; # DPxxx -- Fill in push @print_record, $log_record[1]; # Municipality No. push @print_record, $log_record[0]; # User ID # --- Parse ShortText ### TO-BE-DONE ### push @print_record, "