#!/usr/bin/perl
# Fetch a journal issue's table-of-contents page, pull out the issue date,
# the first/last page numbers and the DOIs grouped by TOC section, and write
# them to "meta_issue_<issue>.xml" in the current directory.
#
# The URL is read from standard input (or from files named on the command
# line, as with any use of <>); <issue> is the last path segment of the URL.
use warnings;
use strict;
use feature qw{ say };

use HTML::Parser;
use WWW::Mechanize;

# State filled in by the parser callbacks below:
#   $date        - text of the first "highwire-cite-metadata-date" span
#   $first_page  - page token taken from the first "...-pages" span
#   $last_page   - page token taken from the most recent later "...-pages" span
#   @toc         - one array ref per section: [ heading text, doi, doi, ... ]
my ($date, $first_page, $last_page, @toc);

# Start-tag handler.  Inspects the tag name and its class attribute and, for
# the elements of interest, installs a one-shot text handler that captures
# the text that follows.
#
#   $self - the HTML::Parser object
#   $tag  - tag name
#   $attr - hash ref of the tag's attributes
sub get_date {
    my ($self, $tag, $attr) = @_;

    # Every element of interest is identified by its class attribute.
    my $class = $attr->{class} or return;

    my $text_handler;
    if ('span' eq $tag) {
        if ('highwire-cite-metadata-date' eq $class) {
            # Only the first date on the page is kept.
            $text_handler = \&next_text_to_date if not defined $date;
        }
        elsif ('highwire-cite-metadata-pages' eq $class) {
            # The first page range supplies the first page; every later
            # range overwrites the last page, so the final one wins.
            $text_handler = defined $first_page
                ? \&parse_last_page
                : \&parse_first_page;
        }
        elsif ('highwire-cite-metadata-doi' eq $class) {
            $text_handler = \&retrieve_doi;
        }
    }
    elsif ('div' eq $tag and $class =~ /\bissue-toc-section\b/) {
        $text_handler = \&next_text_to_toc;
    }

    $self->handler(text => $text_handler, 'self, text') if $text_handler;
    return;
}

# Text handler: store the whitespace-trimmed text as the issue date, then
# remove itself.
sub next_text_to_date {
    my ($self, $text) = @_;
    $text =~ s/^\s+|\s+$//g;
    $date = $text;
    $self->handler(text => undef);
    return;
}

# Text handler: take the first page token (the part before an optional
# "-suffix") from the text.  Stays installed until some text matches.
sub parse_first_page {
    my ($self, $text) = @_;
    if ($text =~ /([A-Z0-9]+)(?:-[0-9A-Z]+)?/) {
        $first_page = $1;
        $self->handler(text => undef);
    }
    return;
}

# Text handler: take the last page token (the part after an optional
# "prefix-") from the text.  Stays installed until some text matches.
sub parse_last_page {
    my ($self, $text) = @_;
    if ($text =~ /(?:[A-Z0-9]+-)?([0-9A-Z]+)/) {
        $last_page = $1;
        $self->handler(text => undef);
    }
    return;
}

# Text handler: open a new TOC section whose heading is the text, then
# remove itself.
sub next_text_to_toc {
    my ($self, $text) = @_;
    push @toc, [$text];
    $self->handler(text => undef);
    return;
}

# Text handler: append the DOI text to the current (most recent) TOC
# section.  The literal label "DOI:" is skipped; the handler stays installed
# until a different text chunk arrives.
sub retrieve_doi {
    my ($self, $text) = @_;
    return if 'DOI:' eq $text;

    $text =~ s/^\s+|\s+$//g;

    # A DOI seen before any section heading used to die on $toc[-1] of an
    # empty array; keep the DOI under an untitled section instead.
    if (not @toc) {
        warn "DOI '$text' found before any TOC section heading\n";
        push @toc, [q()];
    }

    push @{ $toc[-1] }, $text;
    $self->handler(text => undef);
    return;
}

print STDERR 'Enter the URL: ';
my $url = <>;
defined $url or die "No URL given\n";
$url =~ s/^\s+|\s+$//g;    # also removes the trailing newline
length $url or die "No URL given\n";

# NOTE(review): $volume is not used anywhere in the visible code; kept in
# case the output template is meant to reference it -- confirm.
my ($volume, $issue) = (split m(/), $url)[-2, -1];
defined $issue and length $issue
    or die "Cannot determine the issue from URL '$url'\n";

my $p = 'HTML::Parser'->new(
    api_version => 3,
    start_h     => [ \&get_date, 'self, tagname, attr' ],
);

my $mech = 'WWW::Mechanize'->new(agent => 'Mozilla');
$mech->get($url);
my $contents = $mech->content;
$p->parse($contents);
$p->eof;

# NOTE(review): the literals below and the template further down look as if
# XML element markup was stripped from them at some point; the emitted text
# is left exactly as found -- restore the intended tags if so.
my $toc = q();
for my $section (@toc) {
    my ($title, @dois) = @$section;
    $toc .= "\n";
    $toc .= "$title\n";
    $toc .= join q(), map { "$_\n" } @dois;
    $toc .= "\n";
}

# Report anything the page did not provide and fall back to an empty string
# so the template below interpolates cleanly.
for my $field ([ date => \$date ],
               [ 'first page' => \$first_page ],
               [ 'last page'  => \$last_page ]) {
    my ($label, $ref) = @$field;
    next if defined $$ref;
    warn "No $label found on the page\n";
    $$ref = q();
}

# Three-argument open with a lexical handle: the mode can no longer be
# influenced by the URL-derived file name, and failures are reported.
my $out_file = "meta_issue_$issue.xml";
open my $out, '>', $out_file
    or die "Cannot open '$out_file' for writing: $!\n";

print {$out} <<"__HTML__" or die "Cannot write to '$out_file': $!\n";
Cadmus
$date
$first_page-$last_page
$toc
__HTML__

# Buffered write errors only surface at close time.
close $out or die "Cannot close '$out_file': $!\n";