Imported Upstream version 1.10

This commit is contained in:
Mario Fetka
2017-09-15 14:26:26 +02:00
commit ee1dc9fb9a
22 changed files with 4358 additions and 0 deletions

354
lib/WWW/Google/SiteMap.pm Normal file
View File

@@ -0,0 +1,354 @@
package WWW::Google::SiteMap;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap;
my $map = WWW::Google::SiteMap->new(file => 'sitemap.gz');
# Main page, changes a lot because of the blog
$map->add(WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/',
lastmod => '2005-06-03',
changefreq => 'daily',
priority => 1.0,
));
# Top level directories, don't change as much, and have a lower priority
$map->add({
loc => "http://www.jasonkohles.com/$_/",
changefreq => 'weekly',
priority => 0.9, # lower priority than the home page
}) for qw(
software gpg hamradio photos scuba snippets tools
);
$map->write;
=head1 DESCRIPTION
The Sitemap Protocol allows you to inform search engine crawlers about URLs
on your Web sites that are available for crawling. A Sitemap consists of a
list of URLs and may also contain additional information about those URLs,
such as when they were last modified, how frequently they change, etc.
This module allows you to create and modify sitemaps.
=cut
use strict;
use warnings;
use WWW::Google::SiteMap::URL qw();
use XML::Twig qw();
unless($IO::Zlib::VERSION) { eval "use IO::Zlib ()"; }
my $ZLIB = $IO::Zlib::VERSION;
use IO::File qw();
require UNIVERSAL;
use Carp qw(carp croak);
use HTML::Entities qw(decode_entities);
=head1 METHODS
=over 4
=item new()
Creates a new WWW::Google::SiteMap object.
my $map = WWW::Google::SiteMap->new(
file => 'sitemap.gz',
);
=cut
# Construct a new sitemap object.  Each constructor option is applied by
# calling the accessor of the same name (e.g. file => 'x.gz' calls
# $self->file('x.gz')).  If a file was given and already exists on disk,
# it is loaded immediately so add() appends to the existing map.
sub new {
    my $class = shift;
    my %opts  = @_;

    my $self = bless {}, ref($class) || $class;
    foreach my $opt (keys %opts) {
        $self->$opt($opts{$opt});
    }
    $self->read if $self->file && -e $self->file;
    return $self;
}
=item read()
Read a sitemap in to this object. If a filename is specified, it will be
read from that file, otherwise it will be read from the file that was
specified with the file() method. Reading of compressed files is done
automatically if the filename ends with .gz.
=cut
# Load a sitemap (or sitemap index) from a file into this object,
# replacing the current URL list.
sub read {
my $self = shift;
my $file = shift || $self->file ||
croak "No filename specified for ".(ref($self)||$self)."::read";
# don't try to parse missing or empty files
# no errors for this, because we might be creating it
return unless -f $file && -s $file;
# don't try to parse very small compressed files
# (empty .gz files are 20 bytes)
# NOTE(review): /\.gz/ is unanchored, so a name like 'foo.gz.xml' also
# takes this early return -- presumably should be /\.gz$/i as below.
return if $file =~ /\.gz/ && -s $file < 50;
my $fh;
if($file =~ /\.gz$/i) {
croak "IO::Zlib not available, cannot read compressed sitemaps"
unless $ZLIB;
$fh = IO::Zlib->new($file,"rb");
} else {
$fh = IO::File->new($file,"r");
}
my @urls = ();
# Handler invoked by XML::Twig for each <url>/<sitemap> element.  Note
# that $self here deliberately shadows the object: XML::Twig passes the
# twig itself as the first argument, so $self->purge purges the twig.
my $urlparser = sub {
my $self = shift;
my $elt = shift;
my $url = WWW::Google::SiteMap::URL->new();
foreach my $c ($elt->children) {
my $var = $c->gi;
if($var eq 'loc') {
# loc values are stored XML-escaped; decode them on the way in.
$url->$var(decode_entities($c->text));
} else {
$url->$var($c->text);
}
}
# Release the parsed chunk to keep memory flat on large sitemaps.
$self->purge;
push(@urls,$url);
};
# Accept both plain sitemaps and sitemap indexes with one parser.
my $twig = XML::Twig->new(
twig_roots => {
'urlset/url' => $urlparser,
'sitemapindex/sitemap' => $urlparser,
},
);
$twig->safe_parse(join('',$fh->getlines))
or die "Could not parse $file ($@)";
$self->urls(@urls);
}
=item write([$file])
Write the sitemap out to the file. If a filename is specified, it will be
written to that file, otherwise it will be written to the file that was
specified with the file() method. Writing of compressed files is done
automatically if the filename ends with .gz.
=cut
# Write the sitemap out to a file (gzip-compressed when the name ends in
# .gz).  Croaks if the file cannot be created, written, or closed --
# compressed and buffered output errors often only surface at close time,
# which the original silently ignored.
sub write {
    my $self = shift;
    my $file = shift || $self->file ||
        croak "No filename specified for ".(ref($self)||$self)."::write";
    my $fh;
    if($file =~ /\.gz$/i) {
        croak "IO::Zlib not available, cannot write compressed sitemaps"
            unless $ZLIB;
        $fh = IO::Zlib->new($file,"wb9");
    } else {
        $fh = IO::File->new($file,"w");
    }
    croak "Could not create '$file'" unless $fh;
    $fh->print($self->xml) or croak "Could not write to '$file'";
    # Explicit close so a failed flush (e.g. disk full) is reported.
    $fh->close or croak "Could not close '$file'";
}
=item urls()
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap.
=cut
# Get/set the list of WWW::Google::SiteMap::URL objects in the map.
# Entries that are not references or have no loc are filtered out.
# Returns a list in list context, an arrayref in scalar context.
sub urls {
    my $self = shift;
    $self->{urls} = \@_ if @_;
    # Default to an empty list so calling urls() on a fresh object does
    # not die dereferencing undef under strict refs.
    my @urls = grep { ref($_) && defined $_->loc } @{ $self->{urls} || [] };
    return wantarray ? @urls : \@urls;
}
=item add($item,[$item...])
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap.
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
will turn them into objects for you. If the first item you pass is a
simple scalar that matches \w, it will assume that the values passed are
a hash for a single object. If the first item passed matches m{^\w+://}
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
the loc field will be populated.
This means you can do any of these:
# create the WWW::Google::SiteMap::URL object yourself
my $url = WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/',
priority => 1.0,
);
$map->add($url);
# or
$map->add(
{ loc => 'http://www.jasonkohles.com/' },
{ loc => 'http://www.jasonkohles.com/software/google-sitemap/' },
{ loc => 'http://www.jasonkohles.com/software/geo-shapefile/' },
);
# or
$map->add(
loc => 'http://www.jasonkohles.com/',
priority => 1.0,
);
# or even something funkier
$map->add(qw(
http://www.jasonkohles.com/
http://www.jasonkohles.com/software/www-google-sitemap/
http://www.jasonkohles.com/software/geo-shapefile/
http://www.jasonkohles.com/software/text-fakedata/
));
foreach my $url ($map->urls) { $url->changefreq('daily') }
=cut
# Add items to the sitemap.  Accepts WWW::Google::SiteMap::URL objects,
# hashrefs of constructor args (one object each), a flattened key/value
# hash (first arg a bare word), or a list of plain URL strings (first
# arg matching ^\w+://, only loc populated).
sub add {
    my $self = shift;
    if(ref($_[0])) {
        if(UNIVERSAL::isa($_[0],"WWW::Google::SiteMap::URL")) {
            # Already URL objects; take them as-is.
            push(@{$self->{urls}}, @_);
        } elsif(ref($_[0]) =~ /HASH/) {
            # Hashrefs become one URL object each.
            push(@{$self->{urls}}, map {
                WWW::Google::SiteMap::URL->new($_)
            } @_);
        } else {
            # Previously any other ref type (e.g. an ARRAY ref) was
            # silently ignored; report it like other bad arguments.
            croak "Can't turn '".ref($_[0]).
                "' into WWW::Google::SiteMap::URL object";
        }
    } elsif(defined $_[0] && $_[0] =~ /^\w+$/) {
        # A bare word: treat all args as one flattened key/value hash.
        push(@{$self->{urls}}, WWW::Google::SiteMap::URL->new(@_));
    } elsif(defined $_[0] && $_[0] =~ m{^\w+://}) {
        # Plain URL strings: one object per URL with only loc set.
        push(@{$self->{urls}}, map {
            WWW::Google::SiteMap::URL->new(loc => $_)
        } @_);
    } else {
        # defined guard avoids an uninitialized-value warning on add().
        croak "Can't turn '".(
            defined $_[0] ? $_[0] : 'undef'
        )."' into WWW::Google::SiteMap::URL object";
    }
}
=item xml();
Return the xml representation of the sitemap.
=cut
# Serialize the sitemap to XML using the Google sitemap 0.84 schema,
# returning the complete document as a string.
sub xml {
my $self = shift;
# Root <urlset> element carrying the schema declarations.
my $xml = XML::Twig::Elt->new('urlset', {
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
'xsi:schemaLocation' => join(' ',
'http://www.google.com/schemas/sitemap/0.84',
'http://www.google.com/schemas/sitemap/0.84/sitemap.xsd',
),
});
# Each URL renders itself as an element and is appended in order.
foreach($self->urls) {
$_->as_elt->paste(last_child => $xml);
}
$xml->set_pretty_print($self->pretty);
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
# Only add a newline after the declaration when pretty-printing.
if($self->pretty) { $header .= "\n" }
return $header.$xml->sprint();
}
=item file()
Get or set the filename associated with this object. If you call read() or
write() without a filename, this is the default.
=cut
# Accessor/mutator for the filename used by read() and write() when they
# are called without an explicit filename argument.
sub file {
    my ($self, @args) = @_;
    if (@args) {
        $self->{file} = $args[0];
    }
    return $self->{file};
}
=item pretty()
Set this to a true value to enable 'pretty-printing' on the XML output. If
false (the default) the XML will be more compact but not as easily readable
for humans (Google and other computers won't care what you set this to).
If you set this to a 'word' (something that matches /[a-z]/i), then that
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
documentation). Otherwise if a true value is passed, it means 'nice', and a
false value means 'none'.
Returns the value it was set to, or the current value if called with no
arguments.
=cut
# Get/set the pretty-printing mode passed to XML::Twig.  A 'word' value
# (matching /[a-z]/i) is stored verbatim, any other true value means
# 'nice', and a false value means 'none'.  Returns the current value.
sub pretty {
    my $self = shift;
    # Distinguish "no argument" from a false argument: the original
    # (my $val = shift || return ...) made it impossible to switch
    # pretty-printing back off with $map->pretty(0).
    return $self->{pretty} || 'none' unless @_;
    my $val = shift;
    if(defined $val && $val =~ /[a-z]/i) {
        $self->{pretty} = $val;
    } elsif($val) {
        $self->{pretty} = 'nice';
    } else {
        $self->{pretty} = 'none';
    }
    return $self->{pretty};
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<WWW::Google::SiteMap::Robot>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,189 @@
package WWW::Google::SiteMap::Index;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Index - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Index;
my $index = WWW::Google::SiteMap::Index->new(
file => 'sitemap-index.gz',
);
$index->add(WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
lastmod => '2005-11-01',
));
=head1 DESCRIPTION
A sitemap index is used to point Google at your sitemaps if you have more
than one of them.
=cut
use strict;
use warnings;
use base 'WWW::Google::SiteMap';
=head1 METHODS
=over 4
=item new()
Creates a new WWW::Google::SiteMap::Index object.
my $index = WWW::Google::SiteMap::Index->new(
file => 'sitemap-index.gz',
);
=item read()
Read a sitemap index in to this object. If a filename is specified, it will
be read from that file, otherwise it will be read from the file that was
specified with the file() method. Reading of compressed files is done
automatically if the filename ends with .gz.
=item write([$file]);
Write the sitemap index out to the file. If a filename is specified, it will
be written to that file, otherwise it will be written to the file that was
specified with the file() method. Writing of compressed files is done
automatically if the filename ends with .gz
=item urls()
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap index.
=item add($item,[$item...]);
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap index.
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
will turn them into objects for you. If the first item you pass is a
simple scalar that matches \w, it will assume that the values passed are
a hash for a single object. If the first item passed matches m{^\w+://}
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
the loc field will be populated.
This means you can do any of these:
# create the WWW::Google::SiteMap::URL object yourself
my $url = WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
);
$map->add($url);
# or
$map->add(
{ loc => 'http://www.jasonkohles.com/sitemap1.gz' },
{ loc => 'http://www.jasonkohles.com/sitemap2.gz' },
{ loc => 'http://www.jasonkohles.com/sitemap3.gz' },
);
# or
$map->add(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
priority => 1.0,
);
# or even something funkier
$map->add(qw(
http://www.jasonkohles.com/
http://www.jasonkohles.com/software/www-google-sitemap/
http://www.jasonkohles.com/software/geo-shapefile/
http://www.jasonkohles.com/software/text-fakedata/
));
foreach my $url ($map->urls) { $url->lastmod('2005-11-01') }
=item xml();
Return the xml representation of the sitemap index.
=cut
# Serialize the sitemap index to XML.  Same structure as the parent
# class's xml(), but with a <sitemapindex> root, the siteindex.xsd
# schema, and <sitemap> children restricted to loc and lastmod fields.
sub xml {
my $self = shift;
my $xml = XML::Twig::Elt->new('sitemapindex', {
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
'xsi:schemaLocation' => join(' ',
'http://www.google.com/schemas/sitemap/0.84',
'http://www.google.com/schemas/sitemap/0.84/siteindex.xsd',
),
});
# Render each entry as <sitemap> with only loc/lastmod children.
foreach($self->urls) {
$_->as_elt('sitemap',qw(loc lastmod))->paste(last_child => $xml);
}
$xml->set_pretty_print($self->pretty);
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
if($self->pretty) { $header .= "\n" }
return $header.$xml->sprint();
}
=item file();
Get or set the filename associated with this object. If you call read() or
write() without a filename, this is the default.
=item pretty()
Set this to a true value to enable 'pretty-printing' on the XML output. If
false (the default) the XML will be more compact but not as easily readable
for humans (Google and other computers won't care what you set this to).
If you set this to a 'word' (something that matches /[a-z]/i), then that
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
documentation). Otherwise if a true value is passed, it means 'nice', and a
false value means 'none'.
Returns the value it was set to, or the current value if called with no
arguments.
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-Sitemap>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html#sitemapFileRequirements>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,197 @@
package WWW::Google::SiteMap::Ping;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Ping - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Ping;
my $ping = WWW::Google::SiteMap::Ping->new(
'http://www.jasonkohles.com/sitemap.gz',
);
$ping->submit;
print "These pings succeeded:\n";
foreach($ping->success) {
print "$_: ".$ping->status($_)."\n";
}
print "These pings failed:\n";
foreach($ping->failure) {
print "$_: ".$ping->status($_)."\n";
}
=head1 DESCRIPTION
This module makes it easy to notify Google that your sitemaps, or sitemap
indexes, have been updated. See L<WWW::Google::SiteMap> and
L<WWW::Google::SiteMap::Index> for tools to help you create sitemaps and
indexes.
=cut
use strict;
use warnings;
use LWP::UserAgent;
use URI::Escape qw(uri_escape);
=head1 METHODS
=over 4
=item new();
Create a new WWW::Google::SiteMap::Ping object. Can be given a list of
URLs which refer to sitemaps or sitemap indexes, these URLs will simply
be passed to url().
=cut
# Construct a new ping object.  Any arguments are sitemap/index URLs and
# are queued for submission via add_urls().
sub new {
    my ($class, @urls) = @_;
    my $self = bless {}, ref($class) || $class;
    $self->{urls} = {};
    $self->add_urls(@urls);
    return $self;
}
=item add_urls(@urls);
Add one or more urls to the list of URLs to submit to Google.
=cut
# Queue one or more URLs for submission, marking each as PENDING unless
# it already has a recorded status from a previous submit().
sub add_urls {
    my ($self, @urls) = @_;
    $self->{urls}->{$_} ||= 'PENDING' for @urls;
}
=item urls();
Return the list of urls that will be (or were) submitted to google.
=cut
# Return the list of URLs that will be (or were) submitted.
sub urls {
    my $self = shift;
    return keys %{ $self->{urls} };
}
=item submit
Submit the urls to Google, returns the number of successful submissions. This
module uses L<LWP::UserAgent> for the web-based submissions, and will honor
proxy settings in the environment. See L<LWP::UserAgent> for more information.
=cut
# Submit each queued URL to Google's ping endpoint.  Records a per-URL
# status ('SUCCESS' or the HTTP status line) and returns the number of
# successful submissions.
sub submit {
my $self = shift;
my $ua = $self->user_agent();
my $success = 0;
foreach my $url ($self->urls) {
# The sitemap URL is passed as an escaped query parameter.
my $ping = "http://www.google.com/webmasters/sitemaps/ping?".
"sitemap=".uri_escape($url);
my $response = $ua->get($ping);
if($response->is_success) {
$self->{urls}->{$url} = 'SUCCESS';
$success++;
} else {
# Keep the status line so failure()/status callers can see why.
$self->{urls}->{$url} = $response->status_line;
}
}
return $success;
}
=item success();
Return the URLs that were successfully submitted. Note that success only
means that the request was successfully received by Google, it does not
mean your sitemap was found, loaded or parsed successfully. If you want
to know whether your sitemap was loaded or parsed successfully, you have
to go to L<http://www.google.com/webmasters/sitemaps> and check the status
there.
=cut
# Return the URLs whose last submission was recorded as 'SUCCESS'.
sub success {
    my $self = shift;
    my $status_of = $self->{urls};
    return grep { $status_of->{$_} eq 'SUCCESS' } keys %{$status_of};
}
=item failure();
Return the URLs that were not successfully submitted.
=cut
# Return the URLs not recorded as 'SUCCESS' -- this includes URLs whose
# submission failed as well as PENDING ones never submitted.
sub failure {
    my $self = shift;
    my $status_of = $self->{urls};
    return grep { $status_of->{$_} ne 'SUCCESS' } keys %{$status_of};
}
=item user_agent();
If called with no arguments, will return the current L<LWP::UserAgent> object
which will be used to access the web-based submission. If called with an
argument, you can set the user agent that will be used in case you need to
give it special arguments. It must be a L<LWP::UserAgent> object.
If you call submit without having provided a user agent, one will be created
for you that is a basic L<LWP::UserAgent> object, which honors proxy settings
in the environment.
=cut
# Get/set the LWP::UserAgent used for submissions.  A default agent that
# honors proxy environment settings (with a 10 second timeout) is built
# lazily on first use.
sub user_agent {
    my $self = shift;
    $self->{_ua} = shift if @_;
    unless ($self->{_ua}) {
        my $ua = LWP::UserAgent->new();
        $ua->env_proxy;
        $ua->timeout(10);
        $self->{_ua} = $ua;
    }
    return $self->{_ua};
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<http://www.google.com/webmasters/sitemaps/docs/en/submit.html#ping>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,582 @@
package WWW::Google::SiteMap::Robot;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Robot - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Robot;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => 'www.jasonkohles.com',
restrict => qr{^http://www.jasonkohles.com/},
starting_url => ['/index.html','/google-me.html'],
delay => 1, # delay in minutes
sitemap_file => '/var/www/html/sitemap.gz',
sitemap_url => 'http://www.jasonkohles.com/sitemap.gz',
user_agent => 'MyOwnSpider/1.0',
);
$robot->run();
=head1 DESCRIPTION
This is a simple robot class which subclasses L<LWP::RobotUA> to create a
web-crawling spider. By giving it the URL to your home page, it will crawl
all the pages it can find and create a sitemap for them.
=cut
use strict;
use warnings;
use WWW::Mechanize;
use WWW::RobotRules;
use Carp qw(croak);
use POSIX qw(strftime);
use WWW::Google::SiteMap;
use WWW::Google::SiteMap::Ping;
=head1 METHODS
=over 4
=item new();
Create a new WWW::Google::SiteMap::Robot object.
=cut
# Construct a new robot.  'domain' is required.  domain() and
# status_storage() are applied first because other accessors (seeding
# the TO-DO list, the built-in domain restriction) depend on them.
sub new {
    my ($class, %args) = @_;
    my $self = bless {}, ref($class) || $class;
    croak "No domain specified" unless $args{domain};
    for my $early (qw(domain status_storage)) {
        $self->$early(delete $args{$early});
    }
    $self->$_($args{$_}) for keys %args;
    return $self;
}
=item domain();
Get/Set the domain name of the server you want to spider. This is used both
to create the initial URLs to put in the TO-DO list, as well as to create a
built-in restriction that prevents the robot from leaving your site.
Google doesn't allow a sitemap to refer to URL's that are outside the domain
that the sitemap was retrieved for, so there really isn't any benefit in
allowing the robot to cross multiple domains. If you really think you need
to do this, you probably really just want more than one robot. If you are
absolutely certain you want to cross domain boundaries, then you'll have to
subclass this module, and Google will probably reject your sitemaps.
=cut
# Accessor/mutator for the domain being spidered; used both to seed the
# starting URLs and for the built-in stay-on-site restriction.
sub domain {
    my $self = shift;
    $self->{domain} = shift if @_;
    return $self->{domain};
}
=item restrict();
Get/Set the url restrictions. The restriction list can be any of the
following:
=over 4
=item A list reference (or a list)
A list reference is assumed to contain a list of any of the following types.
When passed as an argument to the constructor it has to be a reference, but
when you are calling restrict() as a method, you can pass it a list, and it
will turn it into a list reference. If you provide more than one restrict
item in a list, the first one to return true will cause the rest of them to
be skipped, so the URL will be restricted (skipped) if any of the items are
true (if you want more complexity than that, then just use a code reference
by itself, which can do whatever it wants.)
=item A code reference
If you give restrict a code reference, it will be passed the URL that is
about to be spidered, if the code returns a true value, the URL will be
skipped. If it returns false, it will not be restricted.
=item A regexp reference
If you give it a regexp reference, then the regexp will be applied to the
URL about to be spidered, if the regexp matches, then the URL will be
skipped.
=back
If called with no arguments, it will return the current list of restrictions.
There are built-in restrictions that are always applied at the end of your
restriction list. One is a url regexp that matches your domain name, to
prevent the robot from leaving your site (it's qr{^\w+://YOUR_DOMAIN/}).
The other is a restriction that excludes any URLs that are not allowed by
your robots.txt. This module doesn't provide any method for ignoring the
robots.txt restriction (because it's dangerous), you should really modify
your robots.txt to allow this robot to bypass any of the restrictions you
don't want it to honor.
For example, if your robots.txt contains:
User-Agent: *
Disallow: /admin
Disallow: /google-stuff
Then those two paths will not be included in your sitemap. If you decided
you actually did want /google-stuff to appear in your sitemap, you could add
this to your robots.txt:
User-Agent: WWWGoogleSiteMapRobot
Disallow: /admin
=cut
# Get/set the list of URL restrictions (regexp refs and/or code refs).
# Setting replaces the whole list; always returns the current list.
sub restrict {
    my $self = shift;
    if (@_) { $self->{restrict} = \@_ }
    $self->{restrict} = [] unless $self->{restrict};
    return @{ $self->{restrict} };
}
=item starting_url();
If called with one or more arguments, they are assumed to be URLs which will
seed the spider. The spider continues to run as long as there are URLs in
its "TO-DO" list, this method simply adds items to that list. The arguments
to starting_url are just the filename part of the url, if you don't specify
one, it defaults to '/'.
You can pass it either a list of URLs, or a list reference (so you can use
a list reference in the constructor.)
=cut
# Get/set the starting URL path(s) used to seed the TO-DO list.  Setting
# immediately seeds the status storage.  Defaults to ['/'].
sub starting_url {
    my $self = shift;
    if (@_) {
        $self->{starting_url} = \@_;
        $self->_populate_starting_urls;
    }
    $self->{starting_url} ||= ['/'];
    return $self->{starting_url};
}
# Seed the status storage with the starting URLs (prefixed with
# http://<domain>), marking each as unvisited (empty status).  List
# references are flattened recursively; false/empty entries are skipped.
sub _populate_starting_urls {
    my ($self, @items) = @_;
    @items = $self->starting_url() unless @items;
    for my $item (@items) {
        next unless $item;
        if (ref $item) {
            $self->_populate_starting_urls(@$item);
        } else {
            my $url = "http://" . $self->domain . $item;
            $self->{storage}->{$url} ||= '';
        }
    }
}
=item delay();
Get or set the delay (in minutes) to wait between requests. The default is
1 minute, and if you want to hammer on your web server you can set this to
a value less than 1.
=cut
# Get/set the delay between requests, in minutes (may be fractional).
# Defaults to 1 minute when unset or set to a false value.
sub delay {
    my $self = shift;
    $self->{delay} = shift if @_;
    return $self->{delay} || 1;
}
=item sitemap_file();
Sets the filename to save the L<WWW::Google::SiteMap> object to. This is
required.
=cut
# Accessor/mutator for the file the generated sitemap is written to.
sub sitemap_file {
    my $self = shift;
    $self->{sitemap_file} = shift if @_;
    return $self->{sitemap_file};
}
=item sitemap_url();
Sets the url for the sitemap. This is optional, but if you specify it, then
the robot will notify Google (using L<WWW::Google::SiteMap::Ping>) after it
writes a new sitemap.
=cut
# Accessor/mutator for the public sitemap URL; when set, write_sitemap()
# pings Google after writing a new sitemap.
sub sitemap_url {
    my $self = shift;
    $self->{sitemap_url} = shift if @_;
    return $self->{sitemap_url};
}
=item user_agent();
Set the User Agent that this robot uses to identify itself. The default is
'WWWGoogleSiteMapRobot/version' (unless you have subclassed this module, it's
actually the class name with special characters removed.)
Be careful about changing this while the robot is active (this includes
changing it between runs if you are storing the state) as this affects how
your robot interprets your robots.txt file.
=cut
# Get/set the User-Agent string this robot identifies itself with.
# Defaults to the class name with non-word characters stripped, plus the
# module version (e.g. 'WWWGoogleSiteMapRobot/1.10').  Note this string
# also determines which robots.txt rules apply to the robot.
sub user_agent {
    my $self = shift;
    $self->{user_agent} = shift if @_;
    unless ($self->{user_agent}) {
        (my $agent = ref($self) || $self) =~ s/\W//g;
        $self->{user_agent} = join('/', $agent, $VERSION);
    }
    return $self->{user_agent};
}
=item robot_rules();
Get or set the L<WWW::RobotRules> object used to handle robots.txt.
=cut
# Lazily fetch and parse the site's robots.txt into a WWW::RobotRules
# object keyed on our user_agent() string.
sub robot_rules {
my $self = shift;
if(@_) { $self->{robot_rules} = shift }
unless($self->{robot_rules}) {
$self->{robot_rules} = WWW::RobotRules->new($self->user_agent);
my $url = "http://".$self->domain."/robots.txt";
my $mech = $self->mechanize();
$mech->get($url);
# NOTE(review): the fetch result is not checked here; a failed GET
# would parse the error page body as robots.txt -- confirm whether a
# $mech->success guard is wanted.
$self->{robot_rules}->parse($url,$mech->content);
}
return $self->{robot_rules};
}
=item mechanize();
Get or set the L<WWW::Mechanize> object used for retrieving web documents.
=cut
# Get/set the WWW::Mechanize object used to retrieve web documents.  A
# default is built lazily with our user agent and a shallow history
# stack (stack_depth 1) so crawling doesn't accumulate memory.
sub mechanize {
    my $self = shift;
    $self->{mech} = shift if @_;
    $self->{mech} ||= WWW::Mechanize->new(
        agent       => $self->user_agent,
        stack_depth => 1,
    );
    return $self->{mech};
}
=item status_storage();
If you provide status_storage with a tied hash, it will be used to store the
state of the TO-DO list which includes the data needed to build the sitemap,
as well as the list of unvisited URLs. This means that the robot can continue
where it left off if it is interrupted for some reason before finishing, then
you don't have to re-spider the entire site. This is strongly recommended.
You can use this with basically anything that can be implemented as a tied
hash, as long as it can handle fully-qualified URLs as keys, the values will
be simple scalars (it won't try to store references or anything like that
in the values.)
Example:
use WWW::Google::SiteMap::Robot;
use GDBM::File;
tie my %storage, 'GDBM_File', '/tmp/my-robot-status', &GDBM_WRCREAT, 0640;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => 'www.jasonkohles.com',
status_storage => \%storage,
restrict => qr{^http://www.jasonkohles.com/},
starting_url => '/index.html',
sitemap_file => '/var/www/html/sitemap.gz',
);
If you don't provide a tied hash to store the status in, it will be stored in
a normal (in-memory) hash.
=cut
# Get/set the (possibly tied) hash used to persist crawl status.  Keys
# are fully-qualified URLs; values are simple status scalars.  Falls
# back to a plain in-memory hash.  Either way the starting URLs are
# (re)seeded, since a replacement hash may not contain them.
sub status_storage {
    my $self = shift;
    if (@_) {
        $self->{storage} = shift;
        $self->_populate_starting_urls;
    }
    unless ($self->{storage}) {
        $self->{storage} = {};
        $self->_populate_starting_urls;
    }
    return $self->{storage};
}
=item pending_urls();
Return a list of all the URLs that have been found, but have not yet been
visited. This may include URLs that will later be restricted, and will not
be visited.
=cut
# Return every URL in the TO-DO list that has not been visited yet
# (status still empty/false).  May include URLs that will later be
# skipped as restricted.
sub pending_urls {
    my $self = shift;
    my $status_of = $self->status_storage;
    return grep { !$status_of->{$_} } keys %{$status_of};
}
=item restricted_urls();
Return a list of all the URLs that are in the TO-DO list that have already
been tried, but were skipped because they were restricted.
=cut
# Return the URLs that were tried but skipped because they matched a
# restriction.
sub restricted_urls {
    my $self = shift;
    # run() stores the bare string 'RESTRICTED' (no trailing detail), so
    # the old pattern qr/^RESTRICTED /o (trailing space) never matched
    # anything and this method always returned an empty list.
    $self->_url_data_match(qr/^RESTRICTED\b/);
}
=item visited_urls();
Return a list of all the URLs that have already been visited, and will be
included in the sitemap.
=cut
# Return the URLs that were successfully fetched and will be included
# in the sitemap.
sub visited_urls {
    my $self = shift;
    # run() records successful fetches as "SUCCESS <modtime>", so the
    # old qr/^OK /o pattern never matched and this method always
    # returned an empty list.
    $self->_url_data_match(qr/^SUCCESS /);
}
=item run();
Start the robot running. If you are building your robot into a larger
program that has to handle other tasks as well, then you can pass an integer
to run(), which will be the number of URLs to check (of course then you will
have to call it again later, probably in a loop, to make sure you get them
all.) Returns true if something was done, returns false if no pending URLs
were found in the TO-DO list. Calling start() again after it has returned
false is rather pointless. If you call it in a loop as part of a larger
program, you are also responsible for calling write_sitemap() after all the
data is collected.
If called with no arguments (or a false argument) it will run until there are
no more URLs to process.
=cut
# Main crawl loop.  Pulls pending URLs from the status storage, skips
# already-done or restricted ones, fetches the rest, records status, and
# enqueues newly discovered links.  With a numeric argument, processes a
# limited number of iterations (caller must then call write_sitemap()).
sub run {
my $self = shift;
my $count = shift;
my $counter = $count;
my @waiting = $self->pending_urls;
my $mech = $self->mechanize;
while(1) {
sleep($self->delay * 60); # sleep first, because of all the nexts
unless(@waiting) { @waiting = $self->pending_urls }
if(my $url = shift(@waiting)) {
# first make sure we didn't already do it
next if $self->{storage}->{$url};
# Then make sure it isn't restricted
if($self->_check_restrictions($url)) {
$self->{storage}->{$url} = 'RESTRICTED';
next;
}
$mech->get($url);
if($mech->success) {
# extract the last modification time from the page
my $modtime = $mech->response->last_modified()
|| (time - $mech->response->current_age);
$self->{storage}->{$url} = "SUCCESS $modtime";
# add any links in the page to our todo list
foreach($mech->links) {
my $url = $_->url_abs;
$url =~ s/#[^#]+$//;
$self->{storage}->{$url} ||= '';
}
} else {
$self->{storage}->{$url} = 'ERROR '.$mech->status();
}
# NOTE(review): this 'next' restarts the loop before the $count
# bookkeeping below, so when run($n) is used the counter is only
# decremented on iterations that processed no URL -- the documented
# URL limit is not actually enforced.  Confirm intent before fixing.
next;
}
if($count) {
last unless $counter--;
} else {
last unless @waiting;
}
}
unless($count) { # if you are limiting, you have to do this part yourself
$self->write_sitemap() if $self->sitemap_file();
}
}
# Return true if $url must be skipped: non-http(s) scheme, matching a
# user restriction (regexp or code ref), leaving the configured domain,
# or disallowed by robots.txt.
sub _check_restrictions {
    my $self = shift;
    my $url = shift;
    # some hard-coded restrictions for safety sake
    if($url !~ /^(http|https):/) {
        return 1;
    }
    foreach my $r ($self->restrict) {
        if(ref($r) eq 'Regexp' && $url =~ /$r/) {
            return 1;
        }
        if(ref($r) eq 'CODE' && $r->($url)) {
            return 1;
        }
    }
    # Never leave the configured domain.  \Q...\E so dots in the domain
    # can't match arbitrary characters, and no /o so several robots with
    # different domains don't share one frozen compiled pattern.
    my $domain = $self->domain;
    if($url !~ m{^\w+://\Q$domain\E}) {
        return 1;
    }
    unless($self->robot_rules->allowed($url)) {
        return 1;
    }
    return 0;
}
=item write_sitemap();
Write out the sitemap (if a sitemap file was specified), and optionally notify
Google (if a sitemap url was specified).
=cut
# Build a WWW::Google::SiteMap from every successfully-crawled URL in
# storage, write it to sitemap_file(), and (if sitemap_url() is set)
# ping Google about the update.
sub write_sitemap {
my $self = shift;
my $map = WWW::Google::SiteMap->new(
file => $self->sitemap_file,
pretty => 1,
);
while(my($url,$val) = each(%{$self->{storage}})) {
# Only entries recorded as "SUCCESS <modtime>" by run() are included.
next unless $val =~ /^SUCCESS /;
# NOTE(review): <modtime> is a raw epoch value from run(); presumably
# WWW::Google::SiteMap::URL::lastmod formats it -- confirm, since the
# sitemap protocol expects W3C datetime strings.
my(undef,$lastmod) = split(' ',$val);
$map->add(WWW::Google::SiteMap::URL->new(
loc => $url,
lastmod => $lastmod,
));
}
$map->write;
if($self->sitemap_url) {
my $ping = WWW::Google::SiteMap::Ping->new($self->sitemap_url);
$ping->submit;
}
}
# Return every URL in storage whose status value matches $regexp.
sub _url_data_match {
    my $self = shift;
    my $regexp = shift;
    my $todo = $self->status_storage;
    # The original matched with /^$regexp/o: the /o flag froze the
    # compiled pattern to whichever $regexp arrived first, so e.g.
    # visited_urls() called after restricted_urls() silently matched
    # against the wrong pattern.  Callers pass pre-anchored qr// objects,
    # so matching directly is both correct and faster.
    return grep {
        $todo->{$_} && $todo->{$_} =~ $regexp
    } keys %{$todo};
}
=back
=head1 EXAMPLE ROBOT
#!/usr/bin/perl -w
##################
use strict;
use warnings;
use lib 'lib';
use WWW::Google::SiteMap::Robot;
use GDBM_File;
foreach my $site (qw(www.example.com www.example2.com www.example3.com)) {
my $status = "/tmp/sitemap-robot-status.$site.db";
tie my %storage, 'GDBM_File', $status, &GDBM_WRCREAT, 0640;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => $site,
status_storage => \%storage,
sitemap_file => "/var/www/$site/sitemap.gz",
sitemap_url => "http://$site/sitemap.gz",
);
$robot->run();
}
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
L<WWW::Mechanize>
L<WWW::RobotRules>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,294 @@
package WWW::Google::SiteMap::URL;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::URL - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap;
=head1 DESCRIPTION
This is a helper class that supports L<WWW::Google::SiteMap> and
L<WWW::Google::SiteMap::Index>.
=cut
=head1 METHODS
=over 4
=cut
use strict;
use warnings;
use Carp qw(carp croak);
use XML::Twig qw();
use POSIX qw(strftime);
use HTML::Entities qw(encode_entities);
=item new()
=cut
# Construct a URL object.  Accepts either a hashref or a flat list of
# key/value pairs; each pair is applied through its accessor method so
# that the same validation runs as for later mutations.
sub new {
    my $class = shift;
    my %args  = ref($_[0]) ? %{ $_[0] } : @_;

    my $self = bless {}, $class;
    $self->$_( $args{$_} ) for keys %args;
    return $self;
}
=item loc()
Change the URL associated with this object. For a L<WWW::Google::SiteMap>
this specifies the URL to add to the sitemap, for a
L<WWW::Google::SiteMap::Index>, this is the URL to the sitemap.
=cut
# Get/set the URL this object describes.  The value must be a fully
# qualified http(s) URL under 2048 characters; validation errors are
# handled by _doval (warn when lenient, die otherwise).
sub loc {
    my $self = shift;
    return $self->_doval('loc', sub {
        my $url = shift;
        return unless defined $url;
        return 'must be less than 2048 characters long'
            if length($url) >= 2048;
        return 'must be a fully qualified url'
            if $url !~ m{^https?://};
        return;
    }, @_);
}
=item changefreq()
Set the change frequency of the object. This field is not used in sitemap
indexes, only in sitemaps.
=cut
# Get/set the expected change frequency.  Must be one of the fixed set of
# keywords defined by the sitemap protocol.
sub changefreq {
    shift->_doval('changefreq', sub {
        local $_ = shift;
        my @values = qw(always hourly daily weekly monthly yearly never);
        my $re = join('|', @values);
        return unless defined;
        # BUG FIX: the alternation must be grouped before anchoring.
        # The old /^$re$/ expanded to /^always|hourly|...|never$/, where
        # '^' bound only to 'always' and '$' only to 'never', so any
        # string merely *containing* e.g. 'hourly' was accepted.
        return 'must be one of ' . join(', ', @values)
            unless /^(?:$re)$/;
        return;
    }, @_);
}
=item lastmod()
Set the last modified time. You have to provide this as one of the following:
=over 4
=item a complete ISO8601 time string
A complete time string will be accepted in exactly this format:
YYYY-MM-DDTHH:MM:SS+TZ:TZ
YYYY   - 4-digit year
MM     - 2-digit month (zero padded)
DD     - 2-digit day (zero padded)
T      - literal character 'T'
HH     - 2-digit hour (24-hour, zero padded)
MM     - 2-digit minute (zero padded)
SS     - 2-digit second (zero padded)
+TZ:TZ - Timezone offset (hours and minutes from GMT, 2-digit, zero padded)
=item epoch time
Seconds since the epoch, such as would be returned from time(). If you provide
an epoch time, then an appropriate ISO8601 time will be constructed with
gmtime() (which means the timezone offset will be +00:00). If anyone knows
of a way to determine the timezone offset of the current host that is
cross-platform and doesn't add dozens of dependencies then I might change this.
=item an ISO8601 date (YYYY-MM-DD)
A simple date in YYYY-MM-DD format. The time will be set to 00:00:00+00:00.
=item a L<DateTime> object.
If a L<DateTime> object is provided, then an appropriate timestamp will be
constructed from it.
=item a L<HTTP::Response> object.
If given an L<HTTP::Response> object, the last modified time will be
calculated from whatever time information is available in the response
headers. Currently this means either the Last-Modified header, or the
current time minus the current_age() calculated by the response object.
This is useful for building web crawlers.
=back
Note that in order to conserve memory, any of these items that you provide
will be converted to a complete ISO8601 time string when they are stored.
This means that if you pass an object to lastmod(), you can't get it back
out. If anyone actually has a need to get the objects back out, then I
might make a configuration option to store the objects internally.
If you have suggestions for other types of date/time objects or formats
that would be useful, let me know and I'll consider them.
=cut
# Get/set the last-modified time.  Accepts a DateTime object, an
# HTTP::Response object, epoch seconds, a YYYY-MM-DD date, or a complete
# ISO8601 timestamp; whatever is given is normalized to a full ISO8601
# string before being stored.  Invalid values go through _err (warn when
# lenient, die otherwise).
sub lastmod {
    my $self = shift;
    return $self->{lastmod} unless @_;
    my $value = shift;
    if (ref($value)) {
        if ($value->isa('DateTime')) { # DateTime object
            # Normalize strftime's '+hhmm' zone offset to '+hh:mm'.
            my ($date, $tzoff) = $value->strftime("%Y-%m-%dT%T", "%z");
            if ($tzoff =~ /^([+-])?(\d\d):?(\d\d)/) {
                $tzoff = ($1 || '+') . $2 . ':' . ($3 || '00');
            } else {
                $tzoff = '+00:00';
            }
            $self->{lastmod} = $date . $tzoff;
        } elsif ($value->isa('HTTP::Response')) {
            my $modtime = $value->last_modified()
                || (time - $value->current_age());
            # BUG FIX: this previously formatted gmtime($_), but $_ is
            # never set in this branch; the computed $modtime is what
            # must be formatted.
            $self->{lastmod} = strftime("%Y-%m-%dT%T+00:00", gmtime($modtime));
        }
    } else {
        local $_ = $value;
        if (/^\d+$/) { # epoch time
            $self->{lastmod} = strftime("%Y-%m-%dT%T+00:00", gmtime($_));
        } elsif (/^\d\d\d\d-\d\d-\d\d$/) { # bare ISO8601 date
            $self->{lastmod} = $_ . 'T00:00:00+00:00';
        } elsif (/^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\+\d\d:\d\d$/) {
            $self->{lastmod} = $_; # already a complete timestamp
        }
    }
    return $self->{lastmod} if $self->{lastmod};
    # BUG FIX: report $value rather than $_ -- $_ is unset when a
    # reference of an unsupported class was passed.
    $self->_err("'$value' is not a valid value for lastmod");
}
=item priority()
Set the priority. This field is not used in sitemap indexes, only in sitemaps.
=cut
# Get/set the priority, a number between 0.0 and 1.0 inclusive.
# Validation errors are handled by _doval (warn when lenient, die
# otherwise).  Not used in sitemap indexes, only in sitemaps.
sub priority {
    my $self = shift;
    return $self->_doval('priority', sub {
        my $p = shift;
        return unless defined $p;
        return 'must be a number'         if $p !~ /^[\d\.]+$/;
        return 'must be greater than 0.0' if $p < 0.0;
        return 'must be less than 1.0'    if $p > 1.0;
        return;
    }, @_);
}
# Generic validated accessor.  With no value argument, returns the stored
# attribute.  With a value, runs the validator: an error string triggers
# carp (lenient) or croak (strict); an empty return stores the value.
sub _doval {
    my $self      = shift;
    my $attr      = shift;
    my $validator = shift;
    return $self->{$attr} unless @_;

    my $candidate = shift;
    my $problem   = $validator->($candidate);
    if ($problem) {
        my $msg = "'$candidate' is not a valid value for $attr: $problem";
        $self->{lenient} ? carp($msg) : croak($msg);
    } else {
        $self->{$attr} = $candidate;
    }
}
# Report an error: lenient objects warn (carp), strict objects die (croak).
sub _err {
    my $self = shift;
    return carp(@_) if $self->{lenient};
    croak(@_);
}
=item delete()
Delete this object from the sitemap or the sitemap index.
=cut
# Blank every attribute on this object so it is omitted from the sitemap.
# Note: values are set to undef; the keys themselves remain.
sub delete {
    my $self = shift;
    $self->{$_} = undef for keys %{$self};
}
=item lenient()
If lenient contains a true value, then errors will not be fatal.
=cut
# Get/set the lenient flag: when true, validation errors warn instead of
# dying.
sub lenient {
    my $self = shift;
    ($self->{lenient}) = @_ if @_;
    return $self->{lenient};
}
# Render this object as an XML::Twig::Elt of the given element type
# (default 'url'), containing one child element per populated field.
# The loc value is entity-encoded and marked asis so the encoded form is
# emitted verbatim.
sub as_elt {
    my $self   = shift;
    my $type   = shift || 'url';
    my @fields = @_;
    unless (@fields) { @fields = qw(loc changefreq lastmod priority) }

    my @elements = ();
    foreach my $field (@fields) {
        my $value = $self->$field();
        # BUG FIX: skip only unset/empty fields.  The old '|| next' test
        # was on truth, which silently dropped a valid priority of 0
        # from the generated XML.
        next unless defined $value && length $value;
        my $elt;
        if ($field eq 'loc') {
            $elt = XML::Twig::Elt->new('#PCDATA' => encode_entities($value));
            $elt->set_asis(1);
        } else {
            $elt = XML::Twig::Elt->new('#PCDATA' => $value);
        }
        push(@elements, $elt->wrap_in($field));
    }
    return XML::Twig::Elt->new($type, {}, @elements);
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap/>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__