#!/usr/bin/env perl
use strict;
use warnings;

use FindBin;
use File::Util;
use Data::Dumper;
use HTML::TableExtract qw(tree);
use Lingua::EN::NameParse::Simple;

=head1 NAME 

mm_subscriber_2csv.pl

=head1 VERSION 

Version 0.01 

=head1 SYNOPSIS

=over

mm_subscriber_2csv.pl > mm_subscriber_list.csv

=back

This script assumes that it is located in a directory full of html
files downloaded from a Mailman listserv's Member Management tool.
These are the web forms which permit one to manage the list
subscription of each subscriber to a list.  

This script will parse each html file and harvest the name and email
address of each subscriber, printing them on STDOUT in csv format,
ready to be redirected into a file for importation into a database.

=cut

# Main driver: print the CSV header row, then feed the contents of every
# html file found in the script's own directory to parse_subscriber_list().
print "'TITLE','FIRST','MIDDLE','LAST','SUFFIX','EMAIL'\n";
my($f) = File::Util->new();
my (@html_files) = $f->list_dir(
    "$FindBin::Bin",'--files-only','--pattern=\.html');
foreach my $html_file ( @html_files ){
    # list_dir is called without --with-paths, so the entries are expected
    # to be bare file names; anchor them to the script's directory so the
    # script also works when started from a different working directory.
    my $html_path = "$FindBin::Bin/$html_file";
    open( my $html_fh, '<', $html_path )
        or die "Unable to open $html_path: $!\n";
    # Slurp the whole file in one read; $/ is only undefined inside the
    # do-block.
    my $html = do { local $/; <$html_fh> };
    close($html_fh);
    parse_subscriber_list( $html );
}

# parse_subscriber_list( $html )
#
# Parses one html page, locates every table whose header row matches the
# column titles listed below, and prints one CSV record per table row on
# the currently selected output handle.  Each record holds the parsed
# name parts (TITLE FIRST MIDDLE LAST SUFFIX) followed by the email.
#
# The name and email are read from the 'value' attribute of the third and
# fourth content elements of each row's second cell.
# NOTE(review): those positions presumably correspond to form inputs in
# the page's "member" column -- confirm against a sample page.
#
# Rows that do not have that structure are skipped instead of aborting
# the whole run.  Returns nothing.
sub parse_subscriber_list {
    my $html = shift;

    # Nothing to parse for an unreadable or empty file.
    return unless defined $html && length $html;

    my $te = HTML::TableExtract->new(
        headers => [ 'unsub', 'member', 'mod', 'hide',
                     'nomail', 'ack', 'not metoo',
                     'nodupes', 'digest', 'plain', 'language' ] );
    $te->parse($html);

    foreach my $ts ( $te->tables ) {
        ROW:
        foreach my $row ( $ts->rows ) {
            my $member_cell = $row->[1];
            next ROW unless _has_method( $member_cell, 'content' );

            my $content = $member_cell->content;
            next ROW unless ref $content eq 'ARRAY';

            # Content entries can be plain text nodes (strings), which
            # have no attr() method; only proceed for real elements.
            my ( $name_input, $email_input ) = @{$content}[ 2, 3 ];
            next ROW unless _has_method( $name_input,  'attr' )
                        and _has_method( $email_input, 'attr' );

            my $name  = $name_input->attr('value');
            my $email = $email_input->attr('value');
            $name  = '' unless defined $name;
            $email = '' unless defined $email;

            # The address is stored with its '@' URL-encoded.
            $email =~ s/%40/\@/;

            my %name = Lingua::EN::NameParse::Simple::ParseName($name);
            my @record = map { defined $name{$_} ? $name{$_} : '' }
                qw/ TITLE FIRST MIDDLE LAST SUFFIX /;
            push @record, $email;

            print _csv_record(@record) . "\n";
        }
    }
    return;
}

# Returns true when $thing is an object that responds to $method.
sub _has_method {
    my ( $thing, $method ) = @_;
    return 0 unless ref $thing;
    local $@;    # keep the caller's $@ intact
    # can() dies on unblessed references; treat that as "no such method".
    return eval { $thing->can($method) } ? 1 : 0;
}

# Joins @fields into one single-quoted, comma-separated record, doubling
# any embedded single quote so values such as O'Brien stay well-formed.
sub _csv_record {
    my @fields = @_;
    my @quoted;
    foreach my $field (@fields) {
        my $value = defined $field ? $field : '';
        $value =~ s/'/''/g;
        push @quoted, "'$value'";
    }
    return join ',', @quoted;
}

# Every html file has been processed; end the run with a success status.
exit 0;

=head1 ACKNOWLEDGEMENTS

I publish this with appreciation to the authors of the modules which made it possible, and to choroba from the Czech Republic, who shared a clue with me by way of PerlMonks.org on how to more effectively use HTML::TableExtract.  

Thanks again to the Perl community, who made CPAN and PerlMonks available to us all.

=head1 LICENSE

This script is made available subject to the conditions of the GNU General Public License, v2.  You are welcome to use and modify this code so long as any redistribution is made subject to the same terms.  

=head1 COPYRIGHT

2012, Hugh Esco, YMD Partners LLC; dba/ http://CampaignFoundations.com/

=cut