The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
use strict;
use warnings;
package WWW::Sitemap::XML;
BEGIN {
  $WWW::Sitemap::XML::AUTHORITY = 'cpan:AJGB';
}
{
  $WWW::Sitemap::XML::VERSION = '1.121160';
}
#ABSTRACT: XML Sitemap protocol

use Moose;

use WWW::Sitemap::XML::URL;
use XML::LibXML 1.70;
use Scalar::Util qw( blessed );

use WWW::Sitemap::XML::Types qw( SitemapURL );


has '_rootcontainer' => (
    is => 'ro',
    traits => [qw( Array )],
    isa => 'ArrayRef',
    default => sub { [] },
    handles => {
        _add_entry => 'push',
        _count_entries => 'count',
        _entries => 'elements',
    }
);

has '_first_loc' => (
    is => 'rw',
);

has '_check_req_interface' => (
    is => 'ro',
    default => sub {
        sub {
            die 'object does not implement WWW::Sitemap::XML::URL::Interface'
                unless is_SitemapURL($_[0]);
        }
    }
);

has '_entry_class' => (
    is => 'ro',
    default => 'WWW::Sitemap::XML::URL'
);

has '_root_ns' => (
    is => 'ro',
    default => sub {
        {
            'xmlns' => "http://www.sitemaps.org/schemas/sitemap/0.9",
            'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
            'xsi:schemaLocation' => join(' ',
                'http://www.sitemaps.org/schemas/sitemap/0.9',
                'http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd'
            ),
        }
    },
);

has '_root_elem' => (
    is => 'ro',
    default => 'urlset',
);

sub _pre_check_add {
    my ($self, $entry) = @_;

    $self->_check_req_interface->($entry);

    die "Single file cannot contain more then 50 000 entries"
        if $self->_count_entries >= 50_000;

    my $loc = $entry->loc;

    die "URL cannot be longer then 2048 characters"
        unless length $loc < 2048;

    my($scheme, $authority) = $loc =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?|;
    my $new = "$scheme://$authority";
    if ( $self->_count_entries ) {
        my $first = $self->_first_loc;

        die "All URLs in same file should use the same protocol and reside on the "
            ."same host: $first, not $new" unless $first eq $new;
    } else {
        $self->_first_loc( $new );
    }
}


sub add {
    my $self = shift;

    my $class = $self->_entry_class;

    my $arg = @_ == 1 && blessed $_[0] ?
                shift @_ : $class->new(@_);

    $self->_pre_check_add($arg);

    $self->_add_entry( $arg );
}


sub urls { shift->_entries }


sub load {
    my $self = shift;

    $self->add($_) for $self->read(@_);
}


sub read {
    my ($self, %args) = @_;

    my @entries;
    my $class = $self->_entry_class;

    my $xml = XML::LibXML->load_xml( %args );

    for my $url ( $xml->getDocumentElement->nonBlankChildNodes() ) {
        push @entries,
            $class->new(
                map { $_->nodeName => $_->textContent } $url->nonBlankChildNodes
            );
    }

    return @entries;
}


sub write {
    my ($self, $fh, $format) = @_;

    $format ||= 0;

    my $writer = 'toFH';
    my $xml = $self->as_xml;

    unless ( ref $fh ) {
        $writer = 'toFile';
        if ( $fh =~ /\.gz$/i ) {
            $xml->setCompression(8);
        }
    }

    $xml->$writer( $fh, $format );
}


sub as_xml {
    my $self = shift;

    my $xml = XML::LibXML->createDocument('1.0','UTF-8');
    my $root = $xml->createElement($self->_root_elem);

    while (my ($k, $v) = each %{ $self->_root_ns() } ) {
        $root->setAttribute($k, $v);
    };

    $root->appendChild($_) for
        map {
            my $xml = $_->as_xml;
            blessed $xml ? $xml : XML::LibXML->load_xml(string => $xml)->documentElement()
        } $self->_entries;

    $xml->setDocumentElement($root);

    return $xml;
}


__PACKAGE__->meta->make_immutable;

1;


__END__
=pod

=encoding utf-8

=head1 NAME

WWW::Sitemap::XML - XML Sitemap protocol

=head1 VERSION

version 1.121160

=head1 SYNOPSIS

    use WWW::Sitemap::XML;

    my $map = WWW::Sitemap::XML->new();

    # add new url
    $map->add( 'http://mywebsite.com/' );

    # or
    $map->add(
        loc => 'http://mywebsite.com/',
        lastmod => '2010-11-22',
        changefreq => 'monthly',
        priority => 1.0,
    );

    # or
    $map->add(
        WWW::Sitemap::XML::URL->new(
            loc => 'http://mywebsite.com/',
            lastmod => '2010-11-22',
            changefreq => 'monthly',
            priority => 1.0,
        )
    );

    # read URLs from existing sitemap.xml file
    my @urls = $map->read( 'sitemap.xml' );

    # load urls from existing sitemap.xml file
    $map->load( 'sitemap.xml' );

    # get XML::LibXML object
    my $xml = $map->as_xml;

    print $xml->toString(1);

    # write to file
    $map->write( 'sitemap.xml', my $pretty_print = 1 );

    # write compressed
    $map->write( 'sitemap.xml.gz' );

=head1 DESCRIPTION

Read and write sitemap XML files as defined at L<http://www.sitemaps.org/>.

=head1 METHODS

=head2 add($url|%attrs)

    $map->add(
        WWW::Sitemap::XML::URL->new(
            loc => 'http://mywebsite.com/',
            lastmod => '2010-11-22',
            changefreq => 'monthly',
            priority => 1.0,
        )
    );

Add the C<$url> object representing single page in the sitemap.

Accepts blessed objects implementing L<WWW::Sitemap::XML::URL::Interface>.

Otherwise the arguments C<%attrs> are passed as-is to create new
L<WWW::Sitemap::XML::URL> object.

    $map->add(
        loc => 'http://mywebsite.com/',
        lastmod => '2010-11-22',
        changefreq => 'monthly',
        priority => 1.0,
    );

    # single url argument
    $map->add( 'http://mywebsite.com/' );

    # is same as
    $map->add( loc => 'http://mywebsite.com/' );

Performs basic validation of URLs added:

=over

=item * maximum of 50 000 URLs in single sitemap

=item * URL no longer then 2048 characters

=item * all URLs should use the same protocol and reside on same host

=back

=head2 urls

    my @urls = $map->urls;

Returns a list of all URL objects added to sitemap.

=head2 load(%sitemap_location)

    $map->load( location => $sitemap_file );

It is a shortcut for:

    $map->add($_) for $map->read( location => $sitemap_file );

Please see L<"read"> for details.

=head2 read(%sitemap_location)

    # file or url to sitemap
    my @urls = $map->read( location => $file_or_url );

    # file handle
    my @urls = $map->read( IO => $fh );

    # XML string
    my @urls = $map->read( string => $xml );

Read the sitemap from file, URL, open file handle or string and return the list of
L<WWW::Sitemap::XML::URL> objects representing C<E<lt>urlE<gt>> elements.

=head2 write($file, $format = 0)

    # write to file
    $map->write( 'sitemap.xml', my $pretty_print = 1);

    # or
    my $fh = IO::File->new();
    $fh->open('sitemap.xml', 'w');
    $map->write( $fh, my $pretty_print = 1);
    $cfh->close;

    # write compressed
    $map->write( 'sitemap.xml.gz' );

Write XML sitemap to C<$file> - a file name or L<IO::Handle> object.

If file names ends in C<.gz> then the output file will be compressed by
setting compression on XML object - please note that it requires I<libxml2> to
be compiled with I<zlib> support.

Optional C<$format> is passed to C<toFH> or C<toFile> methods
(depending on the type of C<$file>, respectively for file handle and file name)
as described in L<XML::LibXML>.

=head2 as_xml

    my $xml = $map->as_xml;

    # pretty print
    print $xml->toString(1);

    # write compressed
    $xml->setCompression(8);
    $xml->toFile( 'sitemap.xml.gz' );

Returns L<XML::LibXML::Document> object representing the sitemap in XML format.

The C<E<lt>urlE<gt>> elements are built by calling I<as_xml> on all URL objects
added into sitemap.

=head1 SEE ALSO

Please see those modules/websites for more information related to this module.

=over 4

=item *

L<WWW::SitemapIndex::XML>

=item *

L<http://www.sitemaps.org/>

=item *

L<Search::Sitemap>

=back

=head1 AUTHOR

Alex J. G. Burzyński <ajgb@cpan.org>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2010 by Alex J. G. Burzyński <ajgb@cpan.org>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut