Imported Upstream version 1.10
lib/WWW/Google/SiteMap.pm
package WWW::Google::SiteMap;
|
||||
use vars qw($VERSION); $VERSION = '1.10';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Google::SiteMap - DEPRECATED - See Search::Sitemap
|
||||
|
||||
=head1 DEPRECATED
|
||||
|
||||
Now that more search engines than just Google are supporting the Sitemap
|
||||
protocol, the WWW::Google::SiteMap module has been renamed to
|
||||
L<Search::Sitemap>.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::Google::SiteMap;
|
||||
|
||||
my $map = WWW::Google::SiteMap->new(file => 'sitemap.gz');
|
||||
|
||||
# Main page, changes a lot because of the blog
|
||||
$map->add(WWW::Google::SiteMap::URL->new(
|
||||
loc => 'http://www.jasonkohles.com/',
|
||||
lastmod => '2005-06-03',
|
||||
changefreq => 'daily',
|
||||
priority => 1.0,
|
||||
));
|
||||
|
||||
# Top level directories, don't change as much, and have a lower priority
|
||||
$map->add({
|
||||
loc => "http://www.jasonkohles.com/$_/",
|
||||
changefreq => 'weekly',
|
||||
priority => 0.9, # lower priority than the home page
|
||||
}) for qw(
|
||||
software gpg hamradio photos scuba snippets tools
|
||||
);
|
||||
|
||||
$map->write;
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The Sitemap Protocol allows you to inform search engine crawlers about URLs
|
||||
on your Web sites that are available for crawling. A Sitemap consists of a
|
||||
list of URLs and may also contain additional information about those URLs,
|
||||
such as when they were last modified, how frequently they change, etc.
|
||||
|
||||
This module allows you to create and modify sitemaps.
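As a small sketch of the "modify" case (assuming an existing sitemap.gz and a purely illustrative URL), an existing sitemap can be read in, adjusted, and written back out:

  use WWW::Google::SiteMap;

  my $map = WWW::Google::SiteMap->new(file => 'sitemap.gz');   # read() happens automatically
  foreach my $url ($map->urls) {
      # bump the lastmod on a page we know just changed (hypothetical URL)
      $url->lastmod(time) if $url->loc eq 'http://www.jasonkohles.com/news/';
  }
  $map->write;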
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use WWW::Google::SiteMap::URL qw();
|
||||
use XML::Twig qw();
|
||||
unless($IO::Zlib::VERSION) { eval "use IO::Zlib ()"; }
|
||||
my $ZLIB = $IO::Zlib::VERSION;
|
||||
use IO::File qw();
|
||||
require UNIVERSAL;
|
||||
use Carp qw(carp croak);
|
||||
use HTML::Entities qw(decode_entities);
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=item new()
|
||||
|
||||
Creates a new WWW::Google::SiteMap object.
|
||||
|
||||
my $map = WWW::Google::SiteMap->new(
|
||||
file => 'sitemap.gz',
|
||||
);
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my %opts = @_;
|
||||
my $self = bless({}, ref($class) || $class);
|
||||
while(my($key,$value) = each %opts) { $self->$key($value) }
|
||||
if($self->file && -e $self->file) { $self->read }
|
||||
return $self;
|
||||
}
|
||||
|
||||
=item read()
|
||||
|
||||
Read a sitemap in to this object. If a filename is specified, it will be
|
||||
read from that file, otherwise it will be read from the file that was
|
||||
specified with the file() method. Reading of compressed files is done
|
||||
automatically if the filename ends with .gz.
|
||||
|
||||
=cut
|
||||
|
||||
sub read {
|
||||
my $self = shift;
|
||||
my $file = shift || $self->file ||
|
||||
croak "No filename specified for ".(ref($self)||$self)."::read";
|
||||
|
||||
# don't try to parse missing or empty files
|
||||
# no errors for this, because we might be creating it
|
||||
return unless -f $file && -s $file;
|
||||
|
||||
# don't try to parse very small compressed files
|
||||
# (empty .gz files are 20 bytes)
|
||||
return if $file =~ /\.gz/ && -s $file < 50;
|
||||
|
||||
my $fh;
|
||||
if($file =~ /\.gz$/i) {
|
||||
croak "IO::Zlib not available, cannot read compressed sitemaps"
|
||||
unless $ZLIB;
|
||||
$fh = IO::Zlib->new($file,"rb");
|
||||
} else {
|
||||
$fh = IO::File->new($file,"r");
|
||||
}
|
||||
my @urls = ();
|
||||
my $urlparser = sub {
|
||||
my $self = shift;
|
||||
my $elt = shift;
|
||||
|
||||
my $url = WWW::Google::SiteMap::URL->new();
|
||||
foreach my $c ($elt->children) {
|
||||
my $var = $c->gi;
|
||||
if($var eq 'loc') {
|
||||
$url->$var(decode_entities($c->text));
|
||||
} else {
|
||||
$url->$var($c->text);
|
||||
}
|
||||
}
|
||||
$self->purge;
|
||||
push(@urls,$url);
|
||||
};
|
||||
my $twig = XML::Twig->new(
|
||||
twig_roots => {
|
||||
'urlset/url' => $urlparser,
|
||||
'sitemapindex/sitemap' => $urlparser,
|
||||
},
|
||||
);
|
||||
$twig->safe_parse(join('',$fh->getlines))
|
||||
or die "Could not parse $file ($@)";
|
||||
$self->urls(@urls);
|
||||
}
|
||||
|
||||
=item write([$file])
|
||||
|
||||
Write the sitemap out to the file. If a filename is specified, it will be
|
||||
written to that file, otherwise it will be written to the file that was
|
||||
specified with the file() method. Writing of compressed files is done
|
||||
automatically if the filename ends with .gz.
|
||||
|
||||
=cut
|
||||
|
||||
sub write {
|
||||
my $self = shift;
|
||||
my $file = shift || $self->file ||
|
||||
croak "No filename specified for ".(ref($self)||$self)."::write";
|
||||
|
||||
my $fh;
|
||||
if($file =~ /\.gz$/i) {
|
||||
croak "IO::Zlib not available, cannot write compressed sitemaps"
|
||||
unless $ZLIB;
|
||||
$fh = IO::Zlib->new($file,"wb9");
|
||||
} else {
|
||||
$fh = IO::File->new($file,"w");
|
||||
}
|
||||
croak "Could not create '$file'" unless $fh;
|
||||
$fh->print($self->xml);
|
||||
}
|
||||
|
||||
=item urls()
|
||||
|
||||
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap.
|
||||
|
||||
=cut
|
||||
|
||||
sub urls {
|
||||
my $self = shift;
|
||||
$self->{urls} = \@_ if @_;
|
||||
my @urls = grep { ref($_) && defined $_->loc } @{$self->{urls}};
|
||||
return wantarray ? @urls : \@urls;
|
||||
}
|
||||
|
||||
=item add($item,[$item...])
|
||||
|
||||
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap.
|
||||
|
||||
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
|
||||
will turn them into objects for you. If the first item you pass is a
|
||||
simple scalar that matches \w, it will assume that the values passed are
|
||||
a hash for a single object. If the first item passed matches m{^\w+://}
|
||||
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
|
||||
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
|
||||
the loc field will be populated.
|
||||
|
||||
This means you can do any of these:
|
||||
|
||||
# create the WWW::Google::SiteMap::URL object yourself
|
||||
my $url = WWW::Google::SiteMap::URL->new(
|
||||
loc => 'http://www.jasonkohles.com/',
|
||||
priority => 1.0,
|
||||
);
|
||||
$map->add($url);
|
||||
|
||||
# or
|
||||
$map->add(
|
||||
{ loc => 'http://www.jasonkohles.com/' },
|
||||
{ loc => 'http://www.jasonkohles.com/software/google-sitemap/' },
|
||||
{ loc => 'http://www.jasonkohles.com/software/geo-shapefile/' },
|
||||
);
|
||||
|
||||
# or
|
||||
$map->add(
|
||||
loc => 'http://www.jasonkohles.com/',
|
||||
priority => 1.0,
|
||||
);
|
||||
|
||||
# or even something funkier
|
||||
$map->add(qw(
|
||||
http://www.jasonkohles.com/
|
||||
http://www.jasonkohles.com/software/www-google-sitemap/
|
||||
http://www.jasonkohles.com/software/geo-shapefile/
|
||||
http://www.jasonkohles.com/software/text-fakedata/
|
||||
));
|
||||
foreach my $url ($map->urls) { $url->changefreq('daily') }
|
||||
|
||||
=cut
|
||||
|
||||
sub add {
|
||||
my $self = shift;
|
||||
if(ref($_[0])) {
|
||||
if(UNIVERSAL::isa($_[0],"WWW::Google::SiteMap::URL")) {
|
||||
push(@{$self->{urls}}, @_);
|
||||
} elsif(ref($_[0]) =~ /HASH/) {
|
||||
push(@{$self->{urls}},map {
|
||||
WWW::Google::SiteMap::URL->new($_)
|
||||
} @_);
|
||||
}
|
||||
} elsif($_[0] =~ /^\w+$/) {
|
||||
push(@{$self->{urls}}, WWW::Google::SiteMap::URL->new(@_));
|
||||
} elsif($_[0] =~ m{^\w+://}) {
|
||||
push(@{$self->{urls}}, map {
|
||||
WWW::Google::SiteMap::URL->new(loc => $_)
|
||||
} @_);
|
||||
} else {
|
||||
croak "Can't turn '".(
|
||||
ref($_[0]) || $_[0]
|
||||
)."' into WWW::Google::SiteMap::URL object";
|
||||
}
|
||||
}
|
||||
|
||||
=item xml();
|
||||
|
||||
Return the xml representation of the sitemap.
|
||||
|
||||
=cut
|
||||
|
||||
sub xml {
|
||||
my $self = shift;
|
||||
|
||||
my $xml = XML::Twig::Elt->new('urlset', {
|
||||
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
|
||||
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
|
||||
'xsi:schemaLocation' => join(' ',
|
||||
'http://www.google.com/schemas/sitemap/0.84',
|
||||
'http://www.google.com/schemas/sitemap/0.84/sitemap.xsd',
|
||||
),
|
||||
});
|
||||
foreach($self->urls) {
|
||||
$_->as_elt->paste(last_child => $xml);
|
||||
}
|
||||
$xml->set_pretty_print($self->pretty);
|
||||
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
|
||||
if($self->pretty) { $header .= "\n" }
|
||||
return $header.$xml->sprint();
|
||||
}
|
||||
|
||||
=item file()
|
||||
|
||||
Get or set the filename associated with this object. If you call read() or
|
||||
write() without a filename, this is the default.
|
||||
|
||||
=cut
|
||||
|
||||
sub file {
|
||||
my $self = shift;
|
||||
$self->{file} = shift if @_;
|
||||
return $self->{file};
|
||||
}
|
||||
|
||||
=item pretty()
|
||||
|
||||
Set this to a true value to enable 'pretty-printing' on the XML output. If
|
||||
false (the default) the XML will be more compact but not as easily readable
|
||||
for humans (Google and other computers won't care what you set this to).
|
||||
|
||||
If you set this to a 'word' (something that matches /[a-z]/i), then that
|
||||
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
|
||||
documentation). Otherwise if a true value is passed, it means 'nice', and a
|
||||
false value means 'none'.
|
||||
|
||||
Returns the value it was set to, or the current value if called with no
|
||||
arguments.
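A brief sketch of the forms described above:

  $map->pretty(1);           # any plain true value means 'nice'
  $map->pretty('indented');  # a word is handed straight to XML::Twig's pretty_print
  print $map->pretty, "\n";  # with no arguments, returns the current setting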
|
||||
|
||||
=cut
|
||||
|
||||
sub pretty {
|
||||
my $self = shift;
|
||||
return $self->{pretty} || 'none' unless @_;  # no argument: just report the current setting
my $val = shift;
|
||||
|
||||
if($val =~ /[a-z]/i) {
|
||||
$self->{pretty} = $val;
|
||||
} elsif($val) {
|
||||
$self->{pretty} = 'nice';
|
||||
} else {
|
||||
$self->{pretty} = 'none';
|
||||
}
|
||||
return $self->{pretty};
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 MODULE HOME PAGE
|
||||
|
||||
The home page of this module is
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
|
||||
can always find the latest version, development versions, and bug reports. You
|
||||
will also find a link there to report bugs.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Google::SiteMap::Index>
|
||||
|
||||
L<WWW::Google::SiteMap::Ping>
|
||||
|
||||
L<WWW::Google::SiteMap::Robot>
|
||||
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
|
||||
|
||||
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright (C) 2005 by Jason Kohles
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself, either Perl version 5.8.4 or,
|
||||
at your option, any later version of Perl 5 you may have available.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__
lib/WWW/Google/SiteMap/Index.pm
package WWW::Google::SiteMap::Index;
|
||||
use vars qw($VERSION); $VERSION = '1.10';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Google::SiteMap::Index - DEPRECATED - See Search::Sitemap
|
||||
|
||||
=head1 DEPRECATED
|
||||
|
||||
Now that more search engines than just Google are supporting the Sitemap
|
||||
protocol, the WWW::Google::SiteMap module has been renamed to
|
||||
L<Search::Sitemap>.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::Google::SiteMap::Index;
|
||||
|
||||
my $index = WWW::Google::SiteMap::Index->new(
|
||||
file => 'sitemap-index.gz',
|
||||
);
|
||||
|
||||
$index->add(WWW::Google::SiteMap::URL->new(
|
||||
loc => 'http://www.jasonkohles.com/sitemap1.gz',
|
||||
lastmod => '2005-11-01',
|
||||
));
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
A sitemap index is used to point Google at your sitemaps if you have more
|
||||
than one of them.
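A minimal sketch of tying several sitemaps together (the sitemap URLs are only placeholders):

  use WWW::Google::SiteMap::Index;

  my $index = WWW::Google::SiteMap::Index->new(file => 'sitemap-index.gz');
  $index->add({ loc => "http://www.jasonkohles.com/sitemap$_.gz" }) for 1 .. 3;
  $index->write;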
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use base 'WWW::Google::SiteMap';
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=item new()
|
||||
|
||||
Creates a new WWW::Google::SiteMap::Index object.
|
||||
|
||||
my $index = WWW::Google::SiteMap::Index->new(
|
||||
file => 'sitemap-index.gz',
|
||||
);
|
||||
|
||||
=item read()
|
||||
|
||||
Read a sitemap index in to this object. If a filename is specified, it will
|
||||
be read from that file, otherwise it will be read from the file that was
|
||||
specified with the file() method. Reading of compressed files is done
|
||||
automatically if the filename ends with .gz.
|
||||
|
||||
=item write([$file]);
|
||||
|
||||
Write the sitemap index out to the file. If a filename is specified, it will
|
||||
be written to that file, otherwise it will be written to the file that was
|
||||
specified with the file() method. Writing of compressed files is done
|
||||
automatically if the filename ends with .gz
|
||||
|
||||
=item urls()
|
||||
|
||||
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap index.
|
||||
|
||||
=item add($item,[$item...]);
|
||||
|
||||
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap index.
|
||||
|
||||
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
|
||||
will turn them into objects for you. If the first item you pass is a
|
||||
simple scalar that matches \w, it will assume that the values passed are
|
||||
a hash for a single object. If the first item passed matches m{^\w+://}
|
||||
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
|
||||
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
|
||||
the loc field will be populated.
|
||||
|
||||
This means you can do any of these:
|
||||
|
||||
# create the WWW::Google::SiteMap::URL object yourself
|
||||
my $url = WWW::Google::SiteMap::URL->new(
|
||||
loc => 'http://www.jasonkohles.com/sitemap1.gz',
|
||||
);
|
||||
$map->add($url);
|
||||
|
||||
# or
|
||||
$map->add(
|
||||
{ loc => 'http://www.jasonkohles.com/sitemap1.gz' },
|
||||
{ loc => 'http://www.jasonkohles.com/sitemap2.gz' },
|
||||
{ loc => 'http://www.jasonkohles.com/sitemap3.gz' },
|
||||
);
|
||||
|
||||
# or
|
||||
$map->add(
|
||||
loc => 'http://www.jasonkohles.com/sitemap1.gz',
|
||||
priority => 1.0,
|
||||
);
|
||||
|
||||
# or even something funkier
|
||||
$map->add(qw(
|
||||
http://www.jasonkohles.com/
|
||||
http://www.jasonkohles.com/software/www-google-sitemap/
|
||||
http://www.jasonkohles.com/software/geo-shapefile/
|
||||
http://www.jasonkohles.com/software/text-fakedata/
|
||||
));
|
||||
foreach my $url ($map->urls) { $url->lastmod('2005-11-01') }
|
||||
|
||||
=item xml();
|
||||
|
||||
Return the xml representation of the sitemap index.
|
||||
|
||||
=cut
|
||||
|
||||
sub xml {
|
||||
my $self = shift;
|
||||
|
||||
my $xml = XML::Twig::Elt->new('sitemapindex', {
|
||||
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
|
||||
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
|
||||
'xsi:schemaLocation' => join(' ',
|
||||
'http://www.google.com/schemas/sitemap/0.84',
|
||||
'http://www.google.com/schemas/sitemap/0.84/siteindex.xsd',
|
||||
),
|
||||
});
|
||||
foreach($self->urls) {
|
||||
$_->as_elt('sitemap',qw(loc lastmod))->paste(last_child => $xml);
|
||||
}
|
||||
$xml->set_pretty_print($self->pretty);
|
||||
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
|
||||
if($self->pretty) { $header .= "\n" }
|
||||
return $header.$xml->sprint();
|
||||
}
|
||||
|
||||
=item file();
|
||||
|
||||
Get or set the filename associated with this object. If you call read() or
|
||||
write() without a filename, this is the default.
|
||||
|
||||
=item pretty()
|
||||
|
||||
Set this to a true value to enable 'pretty-printing' on the XML output. If
|
||||
false (the default) the XML will be more compact but not as easily readable
|
||||
for humans (Google and other computers won't care what you set this to).
|
||||
|
||||
If you set this to a 'word' (something that matches /[a-z]/i), then that
|
||||
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
|
||||
documentation). Otherwise if a true value is passed, it means 'nice', and a
|
||||
false value means 'none'.
|
||||
|
||||
Returns the value it was set to, or the current value if called with no
|
||||
arguments.
|
||||
|
||||
=back
|
||||
|
||||
=head1 MODULE HOME PAGE
|
||||
|
||||
The home page of this module is
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
|
||||
can always find the latest version, development versions, and bug reports. You
|
||||
will also find a link there to report bugs.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Google::SiteMap>
|
||||
|
||||
L<WWW::Google::SiteMap::Ping>
|
||||
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
|
||||
|
||||
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html#sitemapFileRequirements>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright (C) 2005 by Jason Kohles
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself, either Perl version 5.8.4 or,
|
||||
at your option, any later version of Perl 5 you may have available.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__
lib/WWW/Google/SiteMap/Ping.pm
package WWW::Google::SiteMap::Ping;
|
||||
use vars qw($VERSION); $VERSION = '1.10';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Google::SiteMap::Ping - DEPRECATED - See Search::Sitemap
|
||||
|
||||
=head1 DEPRECATED
|
||||
|
||||
Now that more search engines than just Google are supporting the Sitemap
|
||||
protocol, the WWW::Google::SiteMap module has been renamed to
|
||||
L<Search::Sitemap>.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::Google::SiteMap::Ping;
|
||||
|
||||
my $ping = WWW::Google::SiteMap::Ping->new(
|
||||
'http://www.jasonkohles.com/sitemap.gz',
|
||||
);
|
||||
|
||||
$ping->submit;
|
||||
print "These pings succeeded:\n";
|
||||
foreach($ping->success) {
|
||||
print "$_: ".$ping->status($_)."\n";
|
||||
}
|
||||
print "These pings failed:\n";
|
||||
foreach($ping->failure) {
|
||||
print "$_: ".$ping->status($_)."\n";
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module makes it easy to notify Google that your sitemaps, or sitemap
|
||||
indexes, have been updated. See L<WWW::Google::SiteMap> and
|
||||
L<WWW::Google::SiteMap::Index> for tools to help you create sitemaps and
|
||||
indexes.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use LWP::UserAgent;
|
||||
use URI::Escape qw(uri_escape);
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=item new();
|
||||
|
||||
Create a new WWW::Google::SiteMap::Ping object. Can be given a list of
|
||||
URLs which refer to sitemaps or sitemap indexes; these URLs will simply
be passed to add_urls().
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my $self = bless({}, ref($class) || $class);
|
||||
$self->{urls} = {};
|
||||
$self->add_urls(@_);
|
||||
return $self;
|
||||
}
|
||||
|
||||
=item add_urls(@urls);
|
||||
|
||||
Add one or more urls to the list of URLs to submit to Google.
|
||||
|
||||
=cut
|
||||
|
||||
sub add_urls {
|
||||
my $self = shift;
|
||||
foreach(@_) {
|
||||
$self->{urls}->{$_} ||= 'PENDING';
|
||||
}
|
||||
}
|
||||
|
||||
=item urls();
|
||||
|
||||
Return the list of urls that will be (or were) submitted to google.
|
||||
|
||||
=cut
|
||||
|
||||
sub urls { return keys %{shift()->{urls}}; }
|
||||
|
||||
=item submit
|
||||
|
||||
Submit the urls to Google, returns the number of successful submissions. This
|
||||
module uses L<LWP::UserAgent> for the web-based submissions, and will honor
|
||||
proxy settings in the environment. See L<LWP::UserAgent> for more information.
|
||||
|
||||
=cut
|
||||
|
||||
sub submit {
|
||||
my $self = shift;
|
||||
|
||||
my $ua = $self->user_agent();
|
||||
my $success = 0;
|
||||
foreach my $url ($self->urls) {
|
||||
my $ping = "http://www.google.com/webmasters/sitemaps/ping?".
|
||||
"sitemap=".uri_escape($url);
|
||||
my $response = $ua->get($ping);
|
||||
if($response->is_success) {
|
||||
$self->{urls}->{$url} = 'SUCCESS';
|
||||
$success++;
|
||||
} else {
|
||||
$self->{urls}->{$url} = $response->status_line;
|
||||
}
|
||||
}
|
||||
|
||||
return $success;
|
||||
}
|
||||
|
||||
=item success();
|
||||
|
||||
Return the URLs that were successfully submitted. Note that success only
|
||||
means that the request was successfully received by Google; it does not
|
||||
mean your sitemap was found, loaded or parsed successfully. If you want
|
||||
to know whether your sitemap was loaded or parsed successfully, you have
|
||||
to go to L<http://www.google.com/webmasters/sitemaps> and check the status
|
||||
there.
|
||||
|
||||
=cut
|
||||
|
||||
sub success {
|
||||
my $self = shift;
|
||||
|
||||
return grep { $self->{urls}->{$_} eq 'SUCCESS' } keys %{$self->{urls}};
|
||||
}
|
||||
|
||||
=item failure();
|
||||
|
||||
Return the URLs that were not successfully submitted.
|
||||
|
||||
=cut
|
||||
|
||||
sub failure {
|
||||
my $self = shift;
|
||||
|
||||
return grep { $self->{urls}->{$_} ne 'SUCCESS' } keys %{$self->{urls}};
|
||||
}
|
||||
|
||||
=item user_agent();
|
||||
|
||||
If called with no arguments, will return the current L<LWP::UserAgent> object
|
||||
which will be used to access the web-based submission. If called with an
|
||||
argument, you can set the user agent that will be used in case you need to
give it special arguments. It must be an L<LWP::UserAgent> object.
|
||||
|
||||
If you call submit without having provided a user agent, one will be created
|
||||
for you that is a basic L<LWP::UserAgent> object, which honors proxy settings
|
||||
in the environment.
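As a sketch, you can hand it a pre-configured agent before submitting (the proxy URL is purely illustrative):

  use LWP::UserAgent;

  my $ua = LWP::UserAgent->new(agent => 'MySitemapPinger/1.0', timeout => 30);
  $ua->proxy('http', 'http://proxy.example.com:3128/');   # hypothetical proxy
  $ping->user_agent($ua);
  $ping->submit;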
|
||||
|
||||
=cut
|
||||
|
||||
sub user_agent {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{_ua} = shift }
|
||||
unless($self->{_ua}) {
|
||||
$self->{_ua} = LWP::UserAgent->new();
|
||||
$self->{_ua}->env_proxy;
|
||||
$self->{_ua}->timeout(10);
|
||||
}
|
||||
return $self->{_ua};
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 MODULE HOME PAGE
|
||||
|
||||
The home page of this module is
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
|
||||
can always find the latest version, development versions, and bug reports. You
|
||||
will also find a link there to report bugs.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<http://www.google.com/webmasters/sitemaps/docs/en/submit.html#ping>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright (C) 2005 by Jason Kohles
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself, either Perl version 5.8.4 or,
|
||||
at your option, any later version of Perl 5 you may have available.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__
lib/WWW/Google/SiteMap/Robot.pm
package WWW::Google::SiteMap::Robot;
|
||||
use vars qw($VERSION); $VERSION = '1.10';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Google::SiteMap::Robot - DEPRECATED - See Search::Sitemap
|
||||
|
||||
=head1 DEPRECATED
|
||||
|
||||
Now that more search engines than just Google are supporting the Sitemap
|
||||
protocol, the WWW::Google::SiteMap module has been renamed to
|
||||
L<Search::Sitemap>.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::Google::SiteMap::Robot;
|
||||
|
||||
my $robot = WWW::Google::SiteMap::Robot->new(
|
||||
domain => 'www.jasonkohles.com',
|
||||
restrict => qr{/cgi-bin/}, # URLs matching a restriction are skipped
|
||||
starting_url => ['/index.html','/google-me.html'],
|
||||
delay => 1, # delay in minutes
|
||||
sitemap_file => '/var/www/html/sitemap.gz',
|
||||
sitemap_url => 'http://www.jasonkohles.com/sitemap.gz',
|
||||
user_agent => 'MyOwnSpider/1.0',
|
||||
);
|
||||
$robot->run();
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This is a simple robot class which uses L<WWW::Mechanize> and L<WWW::RobotRules> to create a
|
||||
web-crawling spider. By giving it the URL to your home page, it will crawl
|
||||
all the pages it can find and create a sitemap for them.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use WWW::Mechanize;
|
||||
use WWW::RobotRules;
|
||||
use Carp qw(croak);
|
||||
use POSIX qw(strftime);
|
||||
use WWW::Google::SiteMap;
|
||||
use WWW::Google::SiteMap::Ping;
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=item new();
|
||||
|
||||
Create a new WWW::Google::SiteMap::Robot object.
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my %args = @_;
|
||||
|
||||
my $self = bless({},ref($class)||$class);
|
||||
|
||||
croak "No domain specified" unless $args{domain};
|
||||
|
||||
# These items have other methods that depend on them, so they need to
|
||||
# be called in this order:
|
||||
foreach my $x (qw(domain status_storage)) {
|
||||
$self->$x(delete($args{$x}));
|
||||
}
|
||||
|
||||
while(my($k,$v) = each %args) { $self->$k($v) }
|
||||
|
||||
return $self;
|
||||
}
|
||||
|
||||
=item domain();
|
||||
|
||||
Get/Set the domain name of the server you want to spider. This is used both
|
||||
to create the initial URLs to put in the TO-DO list, as well as to create a
|
||||
built-in restriction that prevents the robot from leaving your site.
|
||||
|
||||
Google doesn't allow a sitemap to refer to URLs that are outside the domain
|
||||
that the sitemap was retrieved for, so there really isn't any benefit in
|
||||
allowing the robot to cross multiple domains. If you really think you need
|
||||
to do this, you probably really just want more than one robot. If you are
|
||||
absolutely certain you want to cross domain boundaries, then you'll have to
|
||||
subclass this module, and Google will probably reject your sitemaps.
|
||||
|
||||
=cut
|
||||
|
||||
sub domain {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{domain} = shift }
|
||||
return $self->{domain};
|
||||
}
|
||||
|
||||
=item restrict();
|
||||
|
||||
Get/Set the url restrictions. The restriction list can be any of the
|
||||
following:
|
||||
|
||||
=over 4
|
||||
|
||||
=item A list reference (or a list)
|
||||
|
||||
A list reference is assumed to contain a list of any of the following types.
|
||||
When passed as an argument to the constructor it has to be a reference, but
|
||||
when you are calling restrict() as a method, you can pass it a list, and it
|
||||
will turn it into a list reference. If you provide more than one restrict
|
||||
item in a list, the first one to return true will cause the rest of them to
|
||||
be skipped, so the URL will be restricted (skipped) if any of the items are
|
||||
true (if you want more complexity than that, then just use a code reference
|
||||
by itself, which can do whatever it wants.)
|
||||
|
||||
=item A code reference
|
||||
|
||||
If you give restrict a code reference, it will be passed the URL that is
|
||||
about to be spidered, if the code returns a true value, the URL will be
|
||||
skipped. If it returns false, it will not be restricted.
|
||||
|
||||
=item A regexp reference
|
||||
|
||||
If you give it a regexp reference, then the regexp will be applied to the
|
||||
URL about to be spidered, if the regexp matches, then the URL will be
|
||||
skipped.
|
||||
|
||||
=back
|
||||
|
||||
If called with no arguments, it will return the current list of restrictions.
|
||||
|
||||
There are built-in restrictions that are always applied at the end of your
|
||||
restriction list. One is a url regexp that matches your domain name, to
|
||||
prevent the robot from leaving your site (it's qr{^\w+://YOUR_DOMAIN/}).
|
||||
The other is a restriction that excludes any URLs that are not allowed by
|
||||
your robots.txt. This module doesn't provide any method for ignoring the
|
||||
robots.txt restriction (because it's dangerous), you should really modify
|
||||
your robots.txt to allow this robot to bypass any of the restrictions you
|
||||
don't want it to honor.
|
||||
|
||||
For example, if your robots.txt contains:
|
||||
|
||||
User-Agent: *
|
||||
Disallow: /admin
|
||||
Disallow: /google-stuff
|
||||
|
||||
Then those two paths will not be included in your sitemap. If you decided
|
||||
you actually did want /google-stuff to appear in your sitemap, you could add
|
||||
this to your robots.txt:
|
||||
|
||||
User-Agent: WWWGoogleSiteMapRobot
|
||||
Disallow: /admin
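As a sketch of combining the restriction types described above (the patterns are only examples, and calling restrict() replaces any previously set restrictions):

  $robot->restrict(
      qr{\.(?:pdf|zip)$},              # skip binary downloads
      sub { $_[0] =~ /logout/ },       # skip anything that looks like a logout link
  );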
|
||||
|
||||
=cut
|
||||
|
||||
sub restrict {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{restrict} = \@_ }
|
||||
unless($self->{restrict}) { $self->{restrict} = [] }
|
||||
|
||||
return @{$self->{restrict}};
|
||||
}
|
||||
|
||||
=item starting_url();
|
||||
|
||||
If called with one or more arguments, they are assumed to be URLs which will
|
||||
seed the spider. The spider continues to run as long as there are URLs in
|
||||
it's "TO-DO" list, this method simply adds items to that list. The arguments
|
||||
to starting_url are just the filename part of the url, if you don't specify
|
||||
one, it defaults to '/'.
|
||||
|
||||
You can pass it either a list of URLs, or a list reference (so you can use
|
||||
a list reference in the constructor.)
|
||||
|
||||
=cut
|
||||
|
||||
sub starting_url {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) {
|
||||
$self->{starting_url} = \@_;
|
||||
$self->_populate_starting_urls;
|
||||
}
|
||||
unless($self->{starting_url}) { $self->{starting_url} = ['/'] }
|
||||
return $self->{starting_url};
|
||||
}
|
||||
|
||||
sub _populate_starting_urls {
|
||||
my $self = shift;
|
||||
my @populate = @_;
|
||||
|
||||
unless(@populate) { @populate = $self->starting_url() }
|
||||
|
||||
foreach(@populate) {
|
||||
next unless $_;
|
||||
if(ref($_)) { $self->_populate_starting_urls(@{$_}); next; }
|
||||
$self->{storage}->{"http://".$self->domain.$_} ||= '';
|
||||
}
|
||||
}
|
||||
|
||||
=item delay();
|
||||
|
||||
Get or set the delay (in minutes) to wait between requests. The default is
|
||||
1 minute, and if you want to hammer on your web server you can set this to
|
||||
a value less than 1.
|
||||
|
||||
=cut
|
||||
|
||||
sub delay {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{delay} = shift }
|
||||
|
||||
return $self->{delay} || 1;
|
||||
}
|
||||
|
||||
=item sitemap_file();
|
||||
|
||||
Sets the filename to save the L<WWW::Google::SiteMap> object to. This is
|
||||
required.
|
||||
|
||||
=cut
|
||||
|
||||
sub sitemap_file {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{sitemap_file} = shift }
|
||||
return $self->{sitemap_file};
|
||||
}
|
||||
|
||||
=item sitemap_url();
|
||||
|
||||
Sets the url for the sitemap. This is optional, but if you specify it, then
|
||||
the robot will notify Google (using L<WWW::Google::SiteMap::Ping>) after it
|
||||
writes a new sitemap.
|
||||
|
||||
=cut
|
||||
|
||||
sub sitemap_url {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{sitemap_url} = shift }
|
||||
return $self->{sitemap_url};
|
||||
}
|
||||
|
||||
=item user_agent();
|
||||
|
||||
Set the User Agent that this robot uses to identify itself. The default is
|
||||
'WWWGoogleSiteMapRobot/version' (unless you have subclassed this module, it's
|
||||
actually the class name with special characters removed.)
|
||||
Be careful about changing this while the robot is active (this includes
|
||||
changing it between runs if you are storing the state) as this affects how
|
||||
your robot interprets your robots.txt file.
|
||||
|
||||
=cut
|
||||
|
||||
sub user_agent {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{user_agent} = shift }
|
||||
unless($self->{user_agent}) {
|
||||
my $pkg = ref($self) || $self;
|
||||
$pkg =~ s/\W//g;
|
||||
$self->{user_agent} = join('/',$pkg,$VERSION);
|
||||
}
|
||||
return $self->{user_agent};
|
||||
}
|
||||
|
||||
=item robot_rules();
|
||||
|
||||
Get or set the L<WWW::RobotRules> object used to handle robots.txt.
|
||||
|
||||
=cut
|
||||
|
||||
sub robot_rules {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{robot_rules} = shift }
|
||||
unless($self->{robot_rules}) {
|
||||
$self->{robot_rules} = WWW::RobotRules->new($self->user_agent);
|
||||
my $url = "http://".$self->domain."/robots.txt";
|
||||
my $mech = $self->mechanize();
|
||||
$mech->get($url);
|
||||
$self->{robot_rules}->parse($url,$mech->content);
|
||||
}
|
||||
return $self->{robot_rules};
|
||||
}
|
||||
|
||||
=item mechanize();
|
||||
|
||||
Get or set the L<WWW::Mechanize> object used for retrieving web documents.
|
||||
|
||||
=cut
|
||||
|
||||
sub mechanize {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) { $self->{mech} = shift }
|
||||
unless($self->{mech}) {
|
||||
$self->{mech} = WWW::Mechanize->new(
|
||||
agent => $self->user_agent,
|
||||
stack_depth => 1,
|
||||
);
|
||||
}
|
||||
return $self->{mech};
|
||||
}
|
||||
|
||||
=item status_storage();
|
||||
|
||||
If you provide status_storage with a tied hash, it will be used to store the
|
||||
state of the TO-DO list which includes the data needed to build the sitemap,
|
||||
as well as the list of unvisited URLs. This means that the robot can continue
|
||||
where it left off if it is interrupted for some reason before finishing, so
|
||||
you don't have to re-spider the entire site. This is strongly recommended.
|
||||
|
||||
You can use this with basically anything that can be implemented as a tied
|
||||
hash, as long as it can handle fully-qualified URLs as keys, the values will
|
||||
be simple scalars (it won't try to store references or anything like that
|
||||
in the values.)
|
||||
|
||||
Example:
|
||||
|
||||
use WWW::Google::SiteMap::Robot;
|
||||
use GDBM::File;
|
||||
|
||||
tie my %storage, 'GDBM_File', '/tmp/my-robot-status', &GDBM_WRCREAT, 0640;
|
||||
my $robot = WWW::Google::SiteMap::Robot->new(
domain => 'www.jasonkohles.com',
status_storage => \%storage,
starting_url => '/index.html',
sitemap_file => '/var/www/html/sitemap.gz',
);
|
||||
|
||||
If you don't provide a tied hash to store the status in, it will be stored in
|
||||
a normal (in-memory) hash.
|
||||
|
||||
=cut
|
||||
|
||||
sub status_storage {
|
||||
my $self = shift;
|
||||
|
||||
if(@_) {
|
||||
$self->{storage} = shift;
|
||||
# If the storage is changed, we might have lost our starting urls
|
||||
$self->_populate_starting_urls;
|
||||
}
|
||||
unless($self->{storage}) {
|
||||
$self->{storage} = {};
|
||||
$self->_populate_starting_urls;
|
||||
}
|
||||
return $self->{storage};
|
||||
}
|
||||
|
||||
=item pending_urls();
|
||||
|
||||
Return a list of all the URLs that have been found, but have not yet been
|
||||
visited. This may include URLs that will later be restricted, and will not
|
||||
be visited.
|
||||
|
||||
=cut
|
||||
|
||||
sub pending_urls {
|
||||
my $self = shift;
|
||||
my $todo = $self->status_storage;
|
||||
return grep { ! $todo->{$_} } keys %{$todo};
|
||||
}
|
||||
|
||||
=item restricted_urls();
|
||||
|
||||
Return a list of all the URLs that are in the TO-DO list that have already
|
||||
been tried, but were skipped because they were restricted.
|
||||
|
||||
=cut
|
||||
|
||||
sub restricted_urls {
|
||||
my $self = shift;
|
||||
$self->_url_data_match(qr/^RESTRICTED/o); # statuses are stored as 'RESTRICTED' with no trailing space
|
||||
}
|
||||
|
||||
=item visited_urls();
|
||||
|
||||
Return a list of all the URLs that have already been visited, and will be
|
||||
included in the sitemap.
|
||||
|
||||
=cut
|
||||
|
||||
sub visited_urls {
|
||||
my $self = shift;
|
||||
$self->_url_data_match(qr/^SUCCESS /o); # run() records visited URLs with a 'SUCCESS <modtime>' status
|
||||
}
|
||||
|
||||
=item run();
|
||||
|
||||
Start the robot running. If you are building your robot into a larger
|
||||
program that has to handle other tasks as well, then you can pass an integer
|
||||
to run(), which will be the number of URLs to check (of course then you will
|
||||
have to call it again later, probably in a loop, to make sure you get them
|
||||
all.) Returns true if something was done, returns false if no pending URLs
|
||||
were found in the TO-DO list. Calling run() again after it has returned
|
||||
false is rather pointless. If you call it in a loop as part of a larger
|
||||
program, you are also responsible for calling write_sitemap() after all the
|
||||
data is collected.
|
||||
|
||||
If called with no arguments (or a false argument) it will run until there are
|
||||
no more URLs to process.
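A sketch of the incremental style described above, interleaving the robot with other work (do_other_work() is a hypothetical placeholder):

  while($robot->run(10)) {     # process up to ten URLs at a time
      do_other_work();         # whatever else your program needs to do
  }
  $robot->write_sitemap;       # your responsibility when limiting run()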
|
||||
|
||||
=cut
|
||||
|
||||
sub run {
|
||||
my $self = shift;
|
||||
my $count = shift;
|
||||
|
||||
my $counter = $count;
|
||||
my @waiting = $self->pending_urls;
|
||||
my $mech = $self->mechanize;
|
||||
while(1) {
|
||||
sleep($self->delay * 60); # sleep first, because of all the nexts
|
||||
unless(@waiting) { @waiting = $self->pending_urls }
|
||||
if(my $url = shift(@waiting)) {
|
||||
# first make sure we didn't already do it
|
||||
next if $self->{storage}->{$url};
|
||||
# Then make sure it isn't restricted
|
||||
if($self->_check_restrictions($url)) {
|
||||
$self->{storage}->{$url} = 'RESTRICTED';
|
||||
next;
|
||||
}
|
||||
$mech->get($url);
|
||||
if($mech->success) {
|
||||
# extract the last modification time from the page
|
||||
my $modtime = $mech->response->last_modified()
|
||||
|| (time - $mech->response->current_age);
|
||||
$self->{storage}->{$url} = "SUCCESS $modtime";
|
||||
# add any links in the page to our todo list
|
||||
foreach($mech->links) {
|
||||
my $url = $_->url_abs;
|
||||
$url =~ s/#[^#]+$//;
|
||||
$self->{storage}->{$url} ||= '';
|
||||
}
|
||||
} else {
|
||||
$self->{storage}->{$url} = 'ERROR '.$mech->status();
|
||||
}
|
||||
next;
|
||||
}
|
||||
|
||||
if($count) {
|
||||
last unless $counter--;
|
||||
} else {
|
||||
last unless @waiting;
|
||||
}
|
||||
}
|
||||
unless($count) { # if you are limiting, you have to do this part yourself
|
||||
$self->write_sitemap() if $self->sitemap_file();
|
||||
}
|
||||
}
|
||||
|
||||
sub _check_restrictions {
|
||||
my $self = shift;
|
||||
my $url = shift;
|
||||
|
||||
# some hard-coded restrictions for safety sake
|
||||
if($url !~ /^(http|https):/) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
foreach my $r ($self->restrict) {
|
||||
if(ref($r) eq 'Regexp' && $url =~ /$r/) {
|
||||
return 1;
|
||||
}
|
||||
if(ref($r) eq 'CODE' && $r->($url)) {
|
||||
return 1
|
||||
}
|
||||
}
|
||||
my $domain = $self->domain;
|
||||
if($url !~ m{^\w+://$domain}o) {
|
||||
return 1;
|
||||
}
|
||||
unless($self->robot_rules->allowed($url)) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
=item write_sitemap();
|
||||
|
||||
Write out the sitemap (if a sitemap file was specified), and optionally notify
|
||||
Google (if a sitemap url was specified).
|
||||
|
||||
=cut
|
||||
|
||||
sub write_sitemap {
|
||||
my $self = shift;
|
||||
|
||||
my $map = WWW::Google::SiteMap->new(
|
||||
file => $self->sitemap_file,
|
||||
pretty => 1,
|
||||
);
|
||||
while(my($url,$val) = each(%{$self->{storage}})) {
|
||||
next unless $val =~ /^SUCCESS /;
|
||||
my(undef,$lastmod) = split(' ',$val);
|
||||
$map->add(WWW::Google::SiteMap::URL->new(
|
||||
loc => $url,
|
||||
lastmod => $lastmod,
|
||||
));
|
||||
}
|
||||
$map->write;
|
||||
|
||||
if($self->sitemap_url) {
|
||||
my $ping = WWW::Google::SiteMap::Ping->new($self->sitemap_url);
|
||||
$ping->submit;
|
||||
}
|
||||
}
|
||||
|
||||
sub _url_data_match {
|
||||
my $self = shift;
|
||||
my $regexp = shift;
|
||||
|
||||
my $todo = $self->status_storage;
|
||||
return grep {
|
||||
$todo->{$_} && $todo->{$_} =~ /^$regexp/o
|
||||
} keys %{$todo};
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLE ROBOT
|
||||
|
||||
#!/usr/bin/perl -w
|
||||
##################
|
||||
use strict;
|
||||
use warnings;
|
||||
use lib 'lib';
|
||||
use WWW::Google::SiteMap::Robot;
|
||||
use GDBM_File;
|
||||
|
||||
foreach my $site (qw(www.example.com www.example2.com www.example3.com)) {
|
||||
my $status = "/tmp/sitemap-robot-status.$site.db";
|
||||
tie my %storage, 'GDBM_File', $status, &GDBM_WRCREAT, 0640;
|
||||
my $robot = WWW::Google::SiteMap::Robot->new(
|
||||
domain => $site,
|
||||
status_storage => \%storage,
|
||||
sitemap_file => "/var/www/$site/sitemap.gz",
|
||||
sitemap_url => "http://$site/sitemap.gz",
|
||||
);
|
||||
$robot->run();
|
||||
}
|
||||
|
||||
=head1 MODULE HOME PAGE
|
||||
|
||||
The home page of this module is
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
|
||||
can always find the latest version, development versions, and bug reports. You
|
||||
will also find a link there to report bugs.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Google::SiteMap>
|
||||
|
||||
L<WWW::Google::SiteMap::Index>
|
||||
|
||||
L<WWW::Google::SiteMap::Ping>
|
||||
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
|
||||
|
||||
L<WWW::Mechanize>
|
||||
|
||||
L<WWW::RobotRules>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright (C) 2005 by Jason Kohles
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself, either Perl version 5.8.4 or,
|
||||
at your option, any later version of Perl 5 you may have available.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__
lib/WWW/Google/SiteMap/URL.pm
package WWW::Google::SiteMap::URL;
|
||||
use vars qw($VERSION); $VERSION = '1.10';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Google::SiteMap::URL - DEPRECATED - See Search::Sitemap
|
||||
|
||||
=head1 DEPRECATED
|
||||
|
||||
Now that more search engines than just Google are supporting the Sitemap
|
||||
protocol, the WWW::Google::SiteMap module has been renamed to
|
||||
L<Search::Sitemap>.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::Google::SiteMap;
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This is a helper class that supports L<WWW::Google::SiteMap> and
|
||||
L<WWW::Google::SiteMap::Index>.
|
||||
|
||||
=cut
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Carp qw(carp croak);
|
||||
use XML::Twig qw();
|
||||
use POSIX qw(strftime);
|
||||
use HTML::Entities qw(encode_entities);
|
||||
|
||||
=item new()
|
||||
|
||||
=cut
|
||||
|
||||
sub new {
|
||||
my $class = shift;
|
||||
my %opts = ref($_[0]) ? %{$_[0]} : @_;
|
||||
|
||||
my $self = bless({}, $class);
|
||||
while(my($key,$value) = each %opts) { $self->$key($value) }
|
||||
return $self;
|
||||
}
|
||||
|
||||
=item loc()
|
||||
|
||||
Change the URL associated with this object. For a L<WWW::Google::SiteMap>
|
||||
this specifies the URL to add to the sitemap, for a
|
||||
L<WWW::Google::SiteMap::Index>, this is the URL to the sitemap.
|
||||
|
||||
=cut
|
||||
|
||||
sub loc {
|
||||
shift->_doval('loc', sub {
|
||||
local $_ = shift;
|
||||
return unless defined;
|
||||
return 'must be less than 2048 characters long' unless length($_) < 2048;
|
||||
return 'must be a fully qualified url' unless m{^https?://};
|
||||
return;
|
||||
}, @_);
|
||||
}
|
||||
|
||||
=item changefreq()
|
||||
|
||||
Set the change frequency of the object. This field is not used in sitemap
|
||||
indexes, only in sitemaps.
|
||||
|
||||
=cut
|
||||
|
||||
sub changefreq {
|
||||
shift->_doval('changefreq', sub {
|
||||
local $_ = shift;
|
||||
my @values = qw(always hourly daily weekly monthly yearly never);
|
||||
my $re = join('|',@values);
|
||||
return unless defined;
|
||||
return 'must be one of '.join(', ',@values) unless /^$re$/;
|
||||
return;
|
||||
}, @_);
|
||||
}
|
||||
|
||||
=item lastmod()
|
||||
|
||||
Set the last modified time. You have to provide this as one of the following:
|
||||
|
||||
=over 4
|
||||
|
||||
=item a complete ISO8601 time string
|
||||
|
||||
A complete time string will be accepted in exactly this format:
|
||||
|
||||
YYYY-MM-DDTHH:MM:SS+TZ:TZ
|
||||
YYYY - 4-digit year
|
||||
MM - 2-digit month (zero padded)
|
||||
DD - 2-digit day of the month (zero padded)
|
||||
T - literal character 'T'
|
||||
HH - 2-digit hour (24-hour, zero padded)
MM - 2-digit minute (zero padded)
SS - 2-digit second (zero padded)
|
||||
+TZ:TZ - Timezone offset (hours and minutes from GMT, 2-digit, zero padded)
|
||||
|
||||
=item epoch time
|
||||
|
||||
Seconds since the epoch, such as would be returned from time(). If you provide
|
||||
an epoch time, then an appropriate ISO8601 time will be constructed with
|
||||
gmtime() (which means the timezone offset will be +00:00). If anyone knows
|
||||
of a way to determine the timezone offset of the current host that is
|
||||
cross-platform and doesn't add dozens of dependencies then I might change this.
|
||||
|
||||
=item an ISO8601 date (YYYY-MM-DD)
|
||||
|
||||
A simple date in YYYY-MM-DD format. The time will be set to 00:00:00+00:00.
|
||||
|
||||
=item a L<DateTime> object.
|
||||
|
||||
If a L<DateTime> object is provided, then an appropriate timestamp will be
|
||||
constructed from it.
|
||||
|
||||
=item a L<HTTP::Response> object.
|
||||
|
||||
If given an L<HTTP::Response> object, the last modified time will be
|
||||
calculated from whatever time information is available in the response
|
||||
headers. Currently this means either the Last-Modified header, or the
|
||||
current time - the current_age() calculated by the response object.
|
||||
This is useful for building web crawlers.
|
||||
|
||||
=back
|
||||
|
||||
Note that in order to conserve memory, any of these items that you provide
|
||||
will be converted to a complete ISO8601 time string when they are stored.
|
||||
This means that if you pass an object to lastmod(), you can't get it back
|
||||
out. If anyone actually has a need to get the objects back out, then I
|
||||
might make a configuration option to store the objects internally.
|
||||
|
||||
If you have suggestions for other types of date/time objects or formats
|
||||
that would be useful, let me know and I'll consider them.
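A short sketch of the accepted forms listed above ($response stands for an L<HTTP::Response> object you already have, for example from a crawler):

  $url->lastmod('2005-11-01');                   # ISO8601 date
  $url->lastmod('2005-11-01T12:34:56+00:00');    # full ISO8601 timestamp
  $url->lastmod(time);                           # epoch seconds, converted with gmtime()
  $url->lastmod($response);                      # an HTTP::Response object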
|
||||
|
||||
=cut
|
||||
|
||||
sub lastmod {
|
||||
my $self = shift;
|
||||
|
||||
return $self->{lastmod} unless @_;
|
||||
|
||||
my $value = shift;
|
||||
if(ref($value)) {
|
||||
if($value->isa('DateTime')) { # DateTime object
|
||||
my($date,$tzoff) = $value->strftime("%Y-%m-%dT%T","%z");
|
||||
if($tzoff =~ /^([+-])?(\d\d):?(\d\d)/) {
|
||||
$tzoff = ($1 || '+').$2.':'.($3||'00');
|
||||
} else {
|
||||
$tzoff = '+00:00';
|
||||
}
|
||||
$self->{lastmod} = $date.$tzoff;
|
||||
} elsif($value->isa('HTTP::Response')) {
|
||||
my $modtime = $value->last_modified()
|
||||
|| (time - $value->current_age());
|
||||
$self->{lastmod} = strftime("%Y-%m-%dT%T+00:00",gmtime($modtime)); # use the extracted modification time
|
||||
}
|
||||
} else {
|
||||
local $_ = $value;
|
||||
if(/^\d+$/) { # epoch time
|
||||
$self->{lastmod} = strftime("%Y-%m-%dT%T+00:00",gmtime($_));
|
||||
} elsif(/^\d\d\d\d-\d\d-\d\d$/) {
|
||||
$self->{lastmod} = $_.'T00:00:00+00:00';
|
||||
} elsif(/^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\+\d\d:\d\d$/) {
|
||||
$self->{lastmod} = $_;
|
||||
}
|
||||
}
|
||||
|
||||
return $self->{lastmod} if $self->{lastmod};
|
||||
$self->_err("'$_' is not a valid value for lastmod");
|
||||
}
|
||||
|
||||
=item priority()
|
||||
|
||||
Set the priority. This field is not used in sitemap indexes, only in sitemaps.
|
||||
|
||||
=cut
|
||||
|
||||
sub priority {
|
||||
shift->_doval('priority', sub {
|
||||
local $_ = shift;
|
||||
return unless defined;
|
||||
return 'must be a number' unless /^[\d\.]+$/;
|
||||
return 'must be greater than 0.0' unless $_ >= 0.0;
|
||||
return 'must be less than 1.0' unless $_ <= 1.0;
|
||||
return;
|
||||
}, @_);
|
||||
}
|
||||
|
||||
sub _doval {
|
||||
my $self = shift;
|
||||
my $var = shift;
|
||||
my $valid = shift;
|
||||
return $self->{$var} unless @_;
|
||||
my $value = shift;
|
||||
if(my $res = $valid->($value)) {
|
||||
my $msg = "'$value' is not a valid value for $var: $res";
|
||||
if($self->{lenient}) { carp $msg } else { croak $msg }
|
||||
} else {
|
||||
$self->{$var} = $value;
|
||||
}
|
||||
}
|
||||
sub _err {
|
||||
my $self = shift;
|
||||
|
||||
if($self->{lenient}) { carp @_ } else { croak @_ }
|
||||
}
|
||||
|
||||
|
||||
=item delete()
|
||||
|
||||
Delete this object from the sitemap or the sitemap index.
|
||||
|
||||
=cut
|
||||
|
||||
sub delete {
|
||||
my $self = shift;
|
||||
for(keys %{$self}) { $self->{$_} = undef }
|
||||
}
|
||||
|
||||
=item lenient()
|
||||
|
||||
If lenient contains a true value, then errors will not be fatal.
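For example (a sketch), with lenient set an out-of-range priority only warns, and the old value is kept:

  my $url = WWW::Google::SiteMap::URL->new(loc => 'http://www.jasonkohles.com/');
  $url->lenient(1);
  $url->priority(5);   # carps "'5' is not a valid value for priority ..." instead of croaking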
|
||||
|
||||
=cut
|
||||
|
||||
sub lenient {
|
||||
my $self = shift;
|
||||
$self->{lenient} = shift if @_;
|
||||
return $self->{lenient};
|
||||
}
|
||||
|
||||
sub as_elt {
|
||||
my $self = shift;
|
||||
my $type = shift || 'url';
|
||||
my @fields = @_;
|
||||
unless(@fields) { @fields = qw(loc changefreq lastmod priority) }
|
||||
my @elements = ();
|
||||
foreach(@fields) {
|
||||
my $val = $self->$_() || next;
|
||||
if($_ eq 'loc') {
|
||||
$val = XML::Twig::Elt->new('#PCDATA' => encode_entities($val));
|
||||
$val->set_asis(1);
|
||||
} else {
|
||||
$val = XML::Twig::Elt->new('#PCDATA' => $val);
|
||||
}
|
||||
push(@elements,$val->wrap_in($_));
|
||||
}
|
||||
return XML::Twig::Elt->new($type, {}, @elements);
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 MODULE HOME PAGE
|
||||
|
||||
The home page of this module is
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
|
||||
can always find the latest version, development versions, and bug reports. You
|
||||
will also find a link there to report bugs.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Google::SiteMap>
|
||||
|
||||
L<WWW::Google::SiteMap::Index>
|
||||
|
||||
L<WWW::Google::SiteMap::Ping>
|
||||
|
||||
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap/>
|
||||
|
||||
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright (C) 2005 by Jason Kohles
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself, either Perl version 5.8.4 or,
|
||||
at your option, any later version of Perl 5 you may have available.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
__END__