# Fetch a CV web page and print two of its sections ("Research Interests"
# and "Selected Publications") as plain text, each heading underlined
# with dashes.
use strict;
use warnings;

use LWP::UserAgent;
use HTML::Strip;

my $url = 'http://www.ntu.edu.sg/eee/eee4/cv/eekteoh.htm';

# Fetch the HTML source.
my $ua  = LWP::UserAgent->new;
my $res = $ua->get($url);

# Method calls are not interpolated inside double-quoted strings, so the
# status line has to be concatenated into the message explicitly.
die 'unable to fetch HTML source: ' . $res->status_line
    unless $res->is_success;

my $html = $res->content;
$html =~ s/\x0D//g;    # convert to unix format

# Grab the HTML fragments: everything from the first heading up to the
# second heading, and from the second heading up to "Projects".
my ($research_interest, $selected_publications)
    = $html =~ /(Research Interests.*)(Selected Publications.*)Projects/s;

# A failed match leaves both captures undefined; stop with a clear message
# instead of handing undef to the tag stripper.
die "unable to locate the expected sections in $url\n"
    unless defined $research_interest && defined $selected_publications;

my $hs = HTML::Strip->new();

$research_interest
    = _section_as_text($hs, $research_interest, 'Research Interests');
$selected_publications
    = _section_as_text($hs, $selected_publications, 'Selected Publications');

print "$research_interest\n\n$selected_publications";

# Strip the HTML tags from one fragment, squeeze runs of newlines into a
# single newline, and underline the first occurrence of the heading with a
# row of dashes of the same length.
#
#   $stripper - HTML::Strip instance used to remove the tags
#   $fragment - HTML text of the section, starting at its heading
#   $heading  - literal heading text to underline
#
# Returns the plain-text section.
sub _section_as_text {
    my ($stripper, $fragment, $heading) = @_;

    my $text = $stripper->parse($fragment);

    # Each fragment is an independent piece of HTML; eof() is called so
    # the next parse() starts from a clean parser state.
    $stripper->eof;

    $text =~ s/\n+/\n/g;

    my $underline = '-' x length $heading;
    $text =~ s/(\Q$heading\E)/$1\n$underline/;

    return $text;
}
Research Interests ------------------ Computer Vision and Pattern Recognition Autonomous Navigation of Outdoor AGVs Intelligent Systems Robotics Industrial Automation Selected Publications --------------------- DG Shen, Harace HS Ip, ....
In reply to Re: text extract
by Roger
in thread text extract
by shu
| For: | Use: |
| & | &amp; |
| < | &lt; |
| > | &gt; |
| [ | &#91; |
| ] | &#93; |