#!/usr/bin/perl
use strict;
use warnings;
use HTML::Entities qw(encode_entities);
use XML::Twig;
use URI::Escape;

# Driver script: builds a large in-memory list of dummy URLs, splits it over a
# fixed number of sitemap files, writes each file, gzips it and reports
# whether the uncompressed and compressed sizes stay below the size limit.

my $MAP_COUNT    = 3;                   # number of sitemap files to produce
my $URLS_PER_MAP = 49500;               # maximum number of URLs put into one file
my $MAX_BYTES    = 10 * 1024 * 1024;    # 10MB ceiling (10485760 bytes)

my @urls;
for (1 .. 150000) {
    push @urls, {
        url      => "http://www.example.com/some/really/interesting/file.html",
        priority => 0.5,
    };
}

for my $map_no (1 .. $MAP_COUNT) {
    my $outfile = "test_sitemap" . $map_no . ".xml";
    print "Working on $outfile, map $map_no of $MAP_COUNT (" . scalar(@urls) . " remaining URLs)\n";

    # Start at 0 so the summary line below never prints an undefined value.
    my $cnt = 0;

    # NOTE: 'file' is passed along but Sitemap->new only reads 'type'.
    my $sitemap = Sitemap->new(type => 'html', file => $outfile);

    # Move up to $URLS_PER_MAP URLs from the queue into this sitemap.
    for (1 .. $URLS_PER_MAP) {
        # Test for an empty queue BEFORE shifting; testing afterwards would
        # silently drop the very last URL of the list.
        last if !@urls;
        my $url = shift @urls;
        next if !defined $url->{url} || $url->{url} eq '';

        $sitemap->add({ loc => $url->{url}, priority => $url->{priority} });
        $cnt++;
    }

    my $output = $sitemap->output();
    print " Count: " . ($sitemap->{cnt}->{html} || 0) . " html urls, "
        . ($sitemap->{cnt}->{video} || 0) . " video urls.\n";
    $sitemap = undef;    # release the XML tree before the next round

    # Check the uncompressed size.
    if (length($output) < $MAX_BYTES) {
        print " This sitemap file is OK: (uncompressed under 10MB) " . length($output) . " bytes\n";
    }
    else {
        print " This sitemap file is TOO BIG (uncompressed over 10MB).\n";
    }

    # Three-argument open: the file name can never be interpreted as a mode.
    open(my $out_fh, '>', $outfile) or die "Could not open output file: $!";
    binmode($out_fh);
    print {$out_fh} $output;
    # Buffered write errors only surface at close time, so check it.
    close($out_fh) or die "Could not close output file: $!";
    $output = undef;

    print " $cnt URLs added to sitemap $outfile.\n";

    print " Doing gzip compress... ";
    # List form bypasses the shell; -f forces overwrite, -9 is best compression.
    if (system('gzip', '-f9', $outfile) != 0) {
        print "failed.\n";
        warn " gzip failed for $outfile: "
            . ($? == -1 ? $! : "exit status " . ($? >> 8)) . "\n";
        next;    # no .gz file to measure
    }
    print "done.\n";

    my $gz_size = -s "$outfile.gz";
    if (defined $gz_size && $gz_size < $MAX_BYTES) {
        print " This sitemap file is OK (compressed under 10MB) " . $gz_size . " bytes.\n";
    }
    else {
        print " This sitemap file is TOO BIG (compressed over 10MB).\n";
    }
}

package Sitemap;

use strict;
use warnings;
use XML::Twig;
use open OUT => ':utf8';
use HTML::Entities qw(encode_entities);
use Data::Dumper;

# Sitemap - minimal sitemap builder on top of XML::Twig::Elt.
#
# Object layout (blessed hash):
#   xml  - the root 'urlset' XML::Twig::Elt
#   type - 'video' or 'html' (default 'html')
#   cnt  - hash of per-type counters, e.g. $self->{cnt}->{html}

# new(%cfg)
#   type => 'video' | 'html'   (optional, defaults to 'html')
# Creates the root 'urlset' element with the namespace attributes that belong
# to the chosen type and returns the new object.
sub new {
    my ($class, %cfg) = @_;

    my $self = bless {
        xml  => undef,
        type => $cfg{type} || 'html',    # sitemaps are either "video" or "html"
        cnt  => {},
    }, $class;

    # Decide on the defaulted value: $cfg{type} may be undefined, and comparing
    # it directly would raise an "uninitialized value" warning.
    if ($self->{type} eq 'video') {
        # root element for a video sitemap
        $self->{xml} = XML::Twig::Elt->new('urlset', {
            'xmlns'       => 'http://www.sitemaps.org/schemas/sitemap/0.9',
            'xmlns:video' => 'http://www.google.com/schemas/sitemap-video/1.1',
        });
    }
    else {
        # root element for a standard sitemap
        $self->{xml} = XML::Twig::Elt->new('urlset', {
            'xmlns'              => 'http://www.google.com/schemas/sitemap/0.84',
            'xmlns:xsi'          => 'http://www.w3.org/2001/XMLSchema-instance',
            'xsi:schemaLocation' => join(' ',
                'http://www.google.com/schemas/sitemap/0.84',
                'http://www.google.com/schemas/sitemap/0.84/sitemap.xsd',
            ),
        });
    }

    return $self;
}

# add($ref)
# Dispatches the entry described by hash ref $ref to add_video() or
# add_html(), depending on the type this sitemap was created with.
sub add {
    my ($self, $ref) = @_;

    if ($self->{type} eq 'video') {
        return $self->add_video($ref);
    }
    return $self->add_html($ref);
}

# add_html($ref)
#   $ref->{loc}      - URL of the entry (mandatory)
#   $ref->{priority} - priority value of the entry
# Appends one <url> element (with <loc> and, when given, <priority>) as last
# child of the root element and increments $self->{cnt}->{html}.
# An entry without 'loc' is reported and skipped, because a <url> without
# <loc> would only produce an invalid sitemap.
sub add_html {
    my ($self, $ref) = @_;

    # Use defined/length instead of plain truth so that "0" is not mistaken
    # for a missing value.
    if (!defined $ref->{loc} || !length $ref->{loc}) {
        print "Error: 'loc' is a mandatory value and missing!\n";
        return;
    }

    # Children are collected in a fixed order: loc first, then priority.
    my @elements = (XML::Twig::Elt->new('loc', {}, $ref->{loc}));

    # A priority of 0 is a legitimate value, so test for presence, not truth.
    if (defined $ref->{priority} && length $ref->{priority}) {
        push @elements, XML::Twig::Elt->new('priority', {}, $ref->{priority});
    }
    else {
        print "Error: 'priority' is a mandatory value and missing!\n";
    }

    # Wrap the sub-elements into a "url" element and attach it to the document.
    my $elt = XML::Twig::Elt->new('url', {}, @elements);
    $elt->paste(last_child => $self->{xml});

    $self->{cnt}->{html}++;
    return;
}

# add_video($ref)
# Placeholder: the video implementation is omitted in this file.
sub add_video {
    my ($self, $ref) = @_;
    ## omitted
    return;
}

# output([$file])
# Serialises the document, pretty-printed as 'indented', preceded by the XML
# declaration. With $file the result is written to that file and nothing is
# returned; without it the XML is returned as a string.
# Dies when the file cannot be opened or closed.
sub output {
    my ($self, $file) = @_;

    $self->{xml}->set_pretty_print('indented');

    # The declaration must be the very first thing in the document. The
    # literal was empty before, which left the output starting with a blank
    # line and no declaration at all.
    my $header = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
    my $xml    = $header . $self->{xml}->sprint();

    return $xml if !$file;

    open(my $fh, '>', $file) or die "Could not open output file: $!";
    binmode($fh);
    print {$fh} $xml;
    close($fh) or die "Could not close output file: $!";
    return;
}

# Drops the reference to the XML tree when the object goes away.
sub DESTROY {
    my $self = shift;
    undef $self->{xml};
    return;
}

1;