in reply to Re^4: XML Parsing from URL
in thread SOLVED: XML Parsing from URL
I had some free time to play with LWP and server push. In fact, it is quite easy to make LWP call a callback whenever a complete document from a multipart container is received. It does not matter if the part is received in one chunk or in many chunks, and it does not matter if one chunk of data contains one or more parts.
The trick is to know that LWP uses HTTP::Response, which inherits from HTTP::Message. And HTTP::Message contains everything needed to handle multipart messages, both for server push and for multipart POST requests.
This is my code. Feel free to use it. Make it a CPAN module if you like it.
package MultipartFilter; use v5.12; use warnings; sub hookInto # $userAgent, onStart => sub { ... }, onDocument => sub { + ... }, onEnd => sub { ... } { my ($class,$ua,%args)=@_; my $onStart=$args{'onStart'}//sub {}; my $onDocument=$args{'onDocument'}//sub {}; my $onEnd=$args{'onEnd'}//sub {}; $ua->add_handler( response_header => sub { # my ($response,$ua,$h)=@_; my ($response,$ua)=@_; $onStart->($response,$ua); # Remember how many times we called the onDocument callbac +k $response->{'.multipartfilter'}=0; return; }, m_media_type => "multipart/*" ); my $flushDocuments=sub { my ($parts,$response,$ua)=@_; # get rid of all the parts that we have already processed for (my $i=0; $i<$response->{'.multipartfilter'}; $i++) { shift @$parts; } # call the onDocument callback for all new parts for my $part (@$parts) { $onDocument->($part,$response,$ua); $response->{'.multipartfilter'}++; } }; $ua->add_handler( response_data => sub { # my ($response,$ua,$h,$data)=@_; my ($response,$ua)=@_; my @parts=$response->parts(); # The last part is special, it may be not yet completely t +ransmitted. # All other parts are complete, but may still need the onD +ocument # callback call: my $lastpart=pop @parts; $flushDocuments->(\@parts,$response,$ua); my $clen=$lastpart->header('Content-Length'); defined($clen) or die "Missing Content-Length header in mu +ltipart response"; my $cref=$lastpart->content_ref(); # don't copy possibly l +arge content around # If the content is shorter than announced, we have not ye +t # received all of it, and must not call the callback now. unless (length($$cref)<$clen) { $onDocument->($lastpart,$response,$ua); $response->{'.multipartfilter'}++; } return 1; }, m_media_type => "multipart/*" ); $ua->add_handler( response_done => sub { # my ($response,$ua,$h)=@_; my ($response,$ua)=@_; my @parts=$response->parts(); # All parts are complete, including the last one. # Make sure the callback has been called for all of them. $flushDocuments->(\@parts,$response,$ua); $onEnd->($response,$ua); return; }, m_media_type => "multipart/*" ); return; } 1;
#!/usr/bin/perl use v5.12; use warnings; use Data::Dumper; use LWP::UserAgent; use MultipartFilter; $|=1; my $ua=LWP::UserAgent->new(); MultipartFilter->hookInto( $ua, onStart => sub { say "** Hey, I got a multi-part response!"; }, onDocument => sub { my ($part,$response,$ua)=@_; say "** Hey, I got a new document!"; say "Headers are:"; say $part->headers->as_string(); say "Content is:"; say $part->content(); }, onEnd => sub { say "** End of multipart response."; } ); say "*** single-part response ***"; my $resp=$ua->get("http://localhost/~alex/files/server-push/hello.cgi" +); say $resp->as_string(); say "*** multi-part response ***"; $resp=$ua->get("http://localhost/~alex/files/server-push/push-server.c +gi"); say $resp->as_string();
#!/usr/bin/perl use v5.12; use warnings; $|=1; print "Status: 200 OK\r\n", "Content-Type: text/plain\r\n", "\r\n", "Hello World!\r\n";
#!/usr/bin/perl use v5.12; use warnings; $|=1; my $boundary=join('-','cut','here',$$,time(),rand()); print "Status: 200 OK\r\n", "MIME-Version: 1.0\r\n", "Content-Type: multipart/mixed; boundary=\"$boundary\"\r\n", "\r\n"; for (1..3) { my $now=localtime; my $text1=<<__END_OF_HTML__; <!DOCTYPE html> <html> <head><title>Testing multipart/mixed</title></head> <body> <h1>Testing <code>multipart/mixed</code></h1> __END_OF_HTML__ my $text2=<<__END_OF_HTML__; <dl><dt>Counter</dt><dd>$_</dd><dt>Server time</dt><dd>$now</d +d></dl> </body> </html> __END_OF_HTML__ print "--$boundary\r\n", "Content-Type: text/html; charset=windows-1252\r\n", "Content-Length: ",length("$text1$text2"),"\r\n", "X-My-Counter: $_\r\n", "\r\n", $text1; sleep 1; print $text2; sleep 1; } print "--$boundary--\r\n";
So, to properly handle documents, use MultipartFilter;, remove the :content_cb handler, delete sub raw_handler, and insert the following code before my $response = $browser->get(...);:
MultipartFilter->hookInto($browser, onDocument => sub { my $part=shift; $twig->parse($part->content()); } );
Test enviroment: Apache 2.4.12 and Perl 5.18.1 on Slackware64 14.1
Alexander
|
---|