#!/usr/bin/env perl

use strict;
use warnings;

use FindBin;
use File::Spec;
use File::Util;
use Data::Dumper;
use Scalar::Util qw(blessed);
use HTML::TableExtract qw(tree);
use Lingua::EN::NameParse::Simple;

=head1 NAME

mm_subscriber_2csv.pl

=head1 VERSION

Version 0.01

=head1 SYNOPSIS

=over

mm_subscriber_2csv.pl > mm_subscriber_list.csv

=back

=head1 DESCRIPTION

This script assumes that it is located in a directory full of html
files downloaded from a Mailman listserv's Member Management tool.
These are the web forms which permit one to manage the list
subscription of each subscriber to a list.

This script will parse each html file and harvest the name and email
address of each subscriber, printing them on STDOUT in csv format,
ready to be redirected into a file for importation into a database.

Every field is wrapped in single quotes; a single quote inside a value
is escaped by doubling it.

=cut

print "'TITLE','FIRST','MIDDLE','LAST','SUFFIX','EMAIL'\n";

my $f          = File::Util->new();
my @html_files = $f->list_dir( "$FindBin::Bin", '--files-only', '--pattern=\.html' );

foreach my $html_file (@html_files) {

    # The files live beside the script, not necessarily in the current
    # working directory.  rel2abs() anchors a bare file name at the script
    # directory and leaves an already-absolute path untouched.
    my $path = File::Spec->rel2abs( $html_file, $FindBin::Bin );

    open( my $fh, '<', $path ) or die "Unable to open $path: $!\n";
    my $html = do { local $/; <$fh> };    # slurp the whole file
    close($fh);

    parse_subscriber_list($html);
}

exit;

# parse_subscriber_list( $html )
#
# Extracts the member table (identified by its column headers) from one
# Member Management page and prints one CSV record per subscriber row on
# STDOUT: TITLE, FIRST, MIDDLE, LAST, SUFFIX, EMAIL.
#
# Rows whose 'member' cell does not carry the expected form inputs are
# reported on STDERR and skipped rather than aborting the whole run.
# Returns nothing.
sub parse_subscriber_list {
    my $html = shift;
    return unless defined $html && length $html;

    my $te = HTML::TableExtract->new(
        headers => [
            'unsub',     'member',  'mod',    'hide',
            'nomail',    'ack',     'not metoo',
            'nodupes',   'digest',  'plain',  'language',
        ]
    );
    $te->parse($html);

    foreach my $ts ( $te->tables ) {
        ROW:
        foreach my $row ( $ts->rows ) {

            # Column 1 is the 'member' cell.  The code reads the real name
            # from its third child node and the address from its fourth.
            my $cell  = $row->[1];
            my $name  = _input_value( $cell, 2 );
            my $email = _input_value( $cell, 3 );

            unless ( defined $email ) {
                warn "Skipping a row without a subscriber address\n";
                next ROW;
            }

            # Decode every percent-escape, not only the '@' (%40).
            # NOTE(review): assumes the hidden field is fully URL-quoted;
            # confirm against the Mailman version that produced the pages.
            $email =~ s/%([0-9A-Fa-f]{2})/chr( hex($1) )/ge;

            my %name = Lingua::EN::NameParse::Simple::ParseName(
                defined $name ? $name : '' );

            my @record = ( @name{qw/ TITLE FIRST MIDDLE LAST SUFFIX /}, $email );
            print join( ',', map { _csv_quote($_) } @record ), "\n";
        }
    }

    return;
}

# _input_value( $cell, $index )
#
# Returns the 'value' attribute of the child node at position $index of
# an element-tree cell, or undef when the cell or that child is missing
# or is plain text rather than an element.
sub _input_value {
    my ( $cell, $index ) = @_;

    return unless blessed($cell) && $cell->can('content');

    my $children = $cell->content;
    return unless ref $children eq 'ARRAY';

    my $node = $children->[$index];
    return unless blessed($node) && $node->can('attr');

    return $node->attr('value');
}

# _csv_quote( $value )
#
# Wraps a field in single quotes, doubling any embedded single quote so
# that names such as O'Brien do not terminate the field early.  An
# undefined value becomes an empty field.
sub _csv_quote {
    my $value = shift;
    $value = '' unless defined $value;
    $value =~ s/'/''/g;
    return "'" . $value . "'";
}

=head1 ACKNOWLEDGEMENTS

I publish this with appreciation to the authors of the modules which
made it possible, and to choroba from the Czech Republic, who shared a
clue with me by way of PerlMonks.org on how to more effectively use
HTML::TableExtract.

Thanks again to the perl community who made cpan and perlmonks
available to us all.

=head1 LICENSE

This script is made available subject to the conditions of the GNU
General Public License, v2. You are welcome to use, and modify this
code so long as any redistribution is made subject to the same terms.

=head1 COPYRIGHT

2012, Hugh Esco, YMD Partners LLC; dba/ http://CampaignFoundations.com/

=cut