#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use HTML::Parser;
use WWW::Mechanize;
my ($date,$first_page,$last_page,@toc);
# Start-tag callback for HTML::Parser (despite the name it dispatches for
# every kind of metadata, not just the date).
#
# Arguments: the parser object, the tag name and a hashref of attributes.
# Depending on the tag and its class attribute, it installs a one-shot text
# handler that will consume the text that follows:
#   span.highwire-cite-metadata-date   -> next_text_to_date (only while
#                                         $date is still undefined)
#   span.highwire-cite-metadata-pages  -> parse_first_page the first time,
#                                         parse_last_page once $first_page
#                                         has been set
#   span.highwire-cite-metadata-doi    -> retrieve_doi
#   div with class issue-toc-section   -> next_text_to_toc
# Any other tag leaves the current text handler untouched.
sub get_date {
    my ($self, $tag, $attr) = @_;

    my $on_text;
    if ($tag eq 'span') {
        # Span classes are compared exactly; a missing class matches nothing.
        my $class = $attr->{class} || q{};
        if ($class eq 'highwire-cite-metadata-date') {
            # Only the first date on the page is wanted.
            $on_text = \&next_text_to_date unless defined $date;
        }
        elsif ($class eq 'highwire-cite-metadata-pages') {
            $on_text = defined $first_page
                     ? \&parse_last_page
                     : \&parse_first_page;
        }
        elsif ($class eq 'highwire-cite-metadata-doi') {
            $on_text = \&retrieve_doi;
        }
    }
    elsif ($tag eq 'div') {
        # The div may carry several classes, hence the word-boundary match.
        my $class = $attr->{class};
        $on_text = \&next_text_to_toc
            if $class and $class =~ /\bissue-toc-section\b/;
    }

    $self->handler(text => $on_text, 'self, text') if $on_text;
    return;
}
# One-shot text callback: stores the text chunk it receives, with leading
# and trailing whitespace removed, in the file-level $date and then clears
# the parser's text handler so no further text is captured.
sub next_text_to_date {
    my ($parser, $chunk) = @_;
    ($date = $chunk) =~ s/^\s+|\s+$//g;    # copy first, then trim the copy
    $parser->handler(text => undef);
}
# Text callback for the first page-range span.  Captures the first run of
# upper-case letters/digits in the text (the part before an optional
# "-<end>") into the file-level $first_page and clears the text handler.
# Text that does not match is ignored and the handler stays installed.
sub parse_first_page {
    my ($parser, $chunk) = @_;

    my ($start) = $chunk =~ /([A-Z0-9]+)(?:-[0-9A-Z]+)?/
        or return;

    $first_page = $start;
    $parser->handler(text => undef);
}
# Text callback for every page-range span after the first.  Captures the
# part after an optional "<start>-" prefix into the file-level $last_page
# (each later span overwrites the previous value) and clears the text
# handler.  Text that does not match is ignored and the handler stays
# installed.
sub parse_last_page {
    my ($parser, $chunk) = @_;

    my ($end) = $chunk =~ /(?:[A-Z0-9]+-)?([0-9A-Z]+)/
        or return;

    $last_page = $end;
    $parser->handler(text => undef);
}
# One-shot text callback: opens a new table-of-contents section in the
# file-level @toc.  Each section is an array ref whose first element is the
# text received here (stored as-is, untrimmed); later elements are appended
# by other callbacks.  The text handler is cleared afterwards.
sub next_text_to_toc {
    my ($parser, $heading) = @_;
    my @section = ($heading);
    push @toc, \@section;
    $parser->handler(text => undef);
}
# Text callback for a DOI span: appends the DOI to the most recent section
# of the file-level @toc and then clears the text handler.
#
# Arguments: the parser object and the text chunk.
#
# The text is trimmed *before* it is inspected, so that
#   - a label such as "DOI: " or "\nDOI:" is recognised and skipped, and
#   - whitespace-only chunks (e.g. indentation between tags) are skipped
#     instead of being recorded as an empty DOI.
# In both cases the handler stays installed and waits for the next chunk.
sub retrieve_doi {
    my ($self, $text) = @_;

    $text =~ s/^\s+|\s+$//g;
    return if $text eq q{} or $text eq 'DOI:';

    # A DOI seen before any section heading would make $toc[-1] refer to a
    # non-existent element and die; file it under an untitled section.
    push @toc, [q{}] unless @toc;

    push @{ $toc[-1] }, $text;
    $self->handler(text => undef);
    return;
}
# ---------------------------------------------------------------------------
# Main script: ask for an issue URL, download the page, feed it through the
# HTML::Parser callbacks and write the collected metadata to
# meta_issue_<issue>.xml in the current directory.
# ---------------------------------------------------------------------------
print STDERR 'Enter the URL: ';
my $url = <>;
defined $url or die "No URL given\n";    # EOF on input
chomp $url;
length $url or die "No URL given\n";

# The last two path components of the URL are taken as volume and issue.
# $volume is not used below but is kept so the slice stays self-explanatory.
my ($volume, $issue) = (split m(/), $url)[-2, -1];
defined $issue or die "Cannot determine the issue number from '$url'\n";

my $p = 'HTML::Parser'->new( api_version => 3,
    start_h => [ \&get_date, 'self, tagname, attr' ],
);
my $mech = 'WWW::Mechanize'->new(agent => 'Mozilla');
$mech->get($url);
my $contents = $mech->content;
$p->parse($contents);
$p->eof;

# Flatten the sections: a blank line, the heading, one line per remaining
# entry, and a closing blank line.
my $toc = q{};
for my $section (@toc) {
    my $heading = shift @$section;
    $toc .= join q(), "\n", "$heading\n", (map { "$_\n" } @$section), "\n";
}

# Report metadata the page did not provide instead of emitting
# "uninitialized value" warnings from the heredoc below.
for my $field ([ 'date'       => \$date ],
               [ 'first page' => \$first_page ],
               [ 'last page'  => \$last_page ]) {
    my ($label, $value_ref) = @$field;
    next if defined $$value_ref;
    warn "Warning: no $label found in $url\n";
    $$value_ref = q{};
}

# Three-argument open with a lexical handle: the file name is derived from
# user input, so it must never be parsed for an open mode.
my $out_file = "meta_issue_$issue.xml";
open my $out, '>', $out_file
    or die "Cannot open '$out_file' for writing: $!\n";
print {$out} <<"__HTML__";
Cadmus
$date
$first_page-$last_page
$toc
__HTML__
# Buffered write errors only surface at close time.
close $out or die "Cannot close '$out_file': $!\n";