Imported Upstream version 1.10

This commit is contained in:
Mario Fetka
2017-09-15 14:26:26 +02:00
commit ee1dc9fb9a
22 changed files with 4358 additions and 0 deletions

354
lib/WWW/Google/SiteMap.pm Normal file
View File

@@ -0,0 +1,354 @@
package WWW::Google::SiteMap;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap;
my $map = WWW::Google::SiteMap->new(file => 'sitemap.gz');
# Main page, changes a lot because of the blog
$map->add(WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/',
lastmod => '2005-06-03',
changefreq => 'daily',
priority => 1.0,
));
# Top level directories, don't change as much, and have a lower priority
$map->add({
loc => "http://www.jasonkohles.com/$_/",
changefreq => 'weekly',
priority => 0.9, # lower priority than the home page
}) for qw(
software gpg hamradio photos scuba snippets tools
);
$map->write;
=head1 DESCRIPTION
The Sitemap Protocol allows you to inform search engine crawlers about URLs
on your Web sites that are available for crawling. A Sitemap consists of a
list of URLs and may also contain additional information about those URLs,
such as when they were last modified, how frequently they change, etc.
This module allows you to create and modify sitemaps.
=cut
use strict;
use warnings;
use WWW::Google::SiteMap::URL qw();
use XML::Twig qw();
unless($IO::Zlib::VERSION) { eval "use IO::Zlib ()"; }
my $ZLIB = $IO::Zlib::VERSION;
use IO::File qw();
require UNIVERSAL;
use Carp qw(carp croak);
use HTML::Entities qw(decode_entities);
=head1 METHODS
=over 4
=item new()
Creates a new WWW::Google::SiteMap object.
my $map = WWW::Google::SiteMap->new(
file => 'sitemap.gz',
);
=cut
# Construct a new sitemap object.  Each constructor option is applied by
# calling the accessor of the same name (e.g. file => 'x.gz' calls
# $self->file('x.gz')).  If a file was given and already exists on disk,
# it is loaded immediately so add() appends to the existing map.
sub new {
    my $class = shift;
    my %opts  = @_;

    my $self = bless {}, ref($class) || $class;
    foreach my $opt (keys %opts) {
        $self->$opt($opts{$opt});
    }
    $self->read if $self->file && -e $self->file;
    return $self;
}
=item read()
Read a sitemap in to this object. If a filename is specified, it will be
read from that file, otherwise it will be read from the file that was
specified with the file() method. Reading of compressed files is done
automatically if the filename ends with .gz.
=cut
# Load a sitemap (or sitemap index) from a file into this object,
# replacing the current URL list.
sub read {
my $self = shift;
my $file = shift || $self->file ||
croak "No filename specified for ".(ref($self)||$self)."::read";
# don't try to parse missing or empty files
# no errors for this, because we might be creating it
return unless -f $file && -s $file;
# don't try to parse very small compressed files
# (empty .gz files are 20 bytes)
# NOTE(review): /\.gz/ is unanchored, so a name like 'foo.gz.xml' also
# takes this early return -- presumably should be /\.gz$/i as below.
return if $file =~ /\.gz/ && -s $file < 50;
my $fh;
if($file =~ /\.gz$/i) {
croak "IO::Zlib not available, cannot read compressed sitemaps"
unless $ZLIB;
$fh = IO::Zlib->new($file,"rb");
} else {
$fh = IO::File->new($file,"r");
}
my @urls = ();
# Handler invoked by XML::Twig for each <url>/<sitemap> element.  Note
# that $self here deliberately shadows the object: XML::Twig passes the
# twig itself as the first argument, so $self->purge purges the twig.
my $urlparser = sub {
my $self = shift;
my $elt = shift;
my $url = WWW::Google::SiteMap::URL->new();
foreach my $c ($elt->children) {
my $var = $c->gi;
if($var eq 'loc') {
# loc values are stored XML-escaped; decode them on the way in.
$url->$var(decode_entities($c->text));
} else {
$url->$var($c->text);
}
}
# Release the parsed chunk to keep memory flat on large sitemaps.
$self->purge;
push(@urls,$url);
};
# Accept both plain sitemaps and sitemap indexes with one parser.
my $twig = XML::Twig->new(
twig_roots => {
'urlset/url' => $urlparser,
'sitemapindex/sitemap' => $urlparser,
},
);
$twig->safe_parse(join('',$fh->getlines))
or die "Could not parse $file ($@)";
$self->urls(@urls);
}
=item write([$file])
Write the sitemap out to the file. If a filename is specified, it will be
written to that file, otherwise it will be written to the file that was
specified with the file() method. Writing of compressed files is done
automatically if the filename ends with .gz.
=cut
# Write the sitemap out to a file (gzip-compressed when the name ends in
# .gz).  Croaks if the file cannot be created, written, or closed --
# compressed and buffered output errors often only surface at close time,
# which the original silently ignored.
sub write {
    my $self = shift;
    my $file = shift || $self->file ||
        croak "No filename specified for ".(ref($self)||$self)."::write";
    my $fh;
    if($file =~ /\.gz$/i) {
        croak "IO::Zlib not available, cannot write compressed sitemaps"
            unless $ZLIB;
        $fh = IO::Zlib->new($file,"wb9");
    } else {
        $fh = IO::File->new($file,"w");
    }
    croak "Could not create '$file'" unless $fh;
    $fh->print($self->xml) or croak "Could not write to '$file'";
    # Explicit close so a failed flush (e.g. disk full) is reported.
    $fh->close or croak "Could not close '$file'";
}
=item urls()
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap.
=cut
# Get/set the list of WWW::Google::SiteMap::URL objects in the map.
# Entries that are not references or have no loc are filtered out.
# Returns a list in list context, an arrayref in scalar context.
sub urls {
    my $self = shift;
    $self->{urls} = \@_ if @_;
    # Default to an empty list so calling urls() on a fresh object does
    # not die dereferencing undef under strict refs.
    my @urls = grep { ref($_) && defined $_->loc } @{ $self->{urls} || [] };
    return wantarray ? @urls : \@urls;
}
=item add($item,[$item...])
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap.
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
will turn them into objects for you. If the first item you pass is a
simple scalar that matches \w, it will assume that the values passed are
a hash for a single object. If the first item passed matches m{^\w+://}
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
the loc field will be populated.
This means you can do any of these:
# create the WWW::Google::SiteMap::URL object yourself
my $url = WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/',
priority => 1.0,
);
$map->add($url);
# or
$map->add(
{ loc => 'http://www.jasonkohles.com/' },
{ loc => 'http://www.jasonkohles.com/software/google-sitemap/' },
{ loc => 'http://www.jasonkohles.com/software/geo-shapefile/' },
);
# or
$map->add(
loc => 'http://www.jasonkohles.com/',
priority => 1.0,
);
# or even something funkier
$map->add(qw(
http://www.jasonkohles.com/
http://www.jasonkohles.com/software/www-google-sitemap/
http://www.jasonkohles.com/software/geo-shapefile/
http://www.jasonkohles.com/software/text-fakedata/
));
foreach my $url ($map->urls) { $url->changefreq('daily') }
=cut
# Add items to the sitemap.  Accepts WWW::Google::SiteMap::URL objects,
# hashrefs of constructor args (one object each), a flattened key/value
# hash (first arg a bare word), or a list of plain URL strings (first
# arg matching ^\w+://, only loc populated).
sub add {
    my $self = shift;
    if(ref($_[0])) {
        if(UNIVERSAL::isa($_[0],"WWW::Google::SiteMap::URL")) {
            # Already URL objects; take them as-is.
            push(@{$self->{urls}}, @_);
        } elsif(ref($_[0]) =~ /HASH/) {
            # Hashrefs become one URL object each.
            push(@{$self->{urls}}, map {
                WWW::Google::SiteMap::URL->new($_)
            } @_);
        } else {
            # Previously any other ref type (e.g. an ARRAY ref) was
            # silently ignored; report it like other bad arguments.
            croak "Can't turn '".ref($_[0]).
                "' into WWW::Google::SiteMap::URL object";
        }
    } elsif(defined $_[0] && $_[0] =~ /^\w+$/) {
        # A bare word: treat all args as one flattened key/value hash.
        push(@{$self->{urls}}, WWW::Google::SiteMap::URL->new(@_));
    } elsif(defined $_[0] && $_[0] =~ m{^\w+://}) {
        # Plain URL strings: one object per URL with only loc set.
        push(@{$self->{urls}}, map {
            WWW::Google::SiteMap::URL->new(loc => $_)
        } @_);
    } else {
        # defined guard avoids an uninitialized-value warning on add().
        croak "Can't turn '".(
            defined $_[0] ? $_[0] : 'undef'
        )."' into WWW::Google::SiteMap::URL object";
    }
}
=item xml();
Return the xml representation of the sitemap.
=cut
# Serialize the sitemap to XML using the Google sitemap 0.84 schema,
# returning the complete document as a string.
sub xml {
my $self = shift;
# Root <urlset> element carrying the schema declarations.
my $xml = XML::Twig::Elt->new('urlset', {
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
'xsi:schemaLocation' => join(' ',
'http://www.google.com/schemas/sitemap/0.84',
'http://www.google.com/schemas/sitemap/0.84/sitemap.xsd',
),
});
# Each URL renders itself as an element and is appended in order.
foreach($self->urls) {
$_->as_elt->paste(last_child => $xml);
}
$xml->set_pretty_print($self->pretty);
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
# Only add a newline after the declaration when pretty-printing.
if($self->pretty) { $header .= "\n" }
return $header.$xml->sprint();
}
=item file()
Get or set the filename associated with this object. If you call read() or
write() without a filename, this is the default.
=cut
# Accessor/mutator for the filename used by read() and write() when they
# are called without an explicit filename argument.
sub file {
    my ($self, @args) = @_;
    if (@args) {
        $self->{file} = $args[0];
    }
    return $self->{file};
}
=item pretty()
Set this to a true value to enable 'pretty-printing' on the XML output. If
false (the default) the XML will be more compact but not as easily readable
for humans (Google and other computers won't care what you set this to).
If you set this to a 'word' (something that matches /[a-z]/i), then that
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
documentation). Otherwise if a true value is passed, it means 'nice', and a
false value means 'none'.
Returns the value it was set to, or the current value if called with no
arguments.
=cut
# Get/set the pretty-printing mode passed to XML::Twig.  A 'word' value
# (matching /[a-z]/i) is stored verbatim, any other true value means
# 'nice', and a false value means 'none'.  Returns the current value.
sub pretty {
    my $self = shift;
    # Distinguish "no argument" from a false argument: the original
    # (my $val = shift || return ...) made it impossible to switch
    # pretty-printing back off with $map->pretty(0).
    return $self->{pretty} || 'none' unless @_;
    my $val = shift;
    if(defined $val && $val =~ /[a-z]/i) {
        $self->{pretty} = $val;
    } elsif($val) {
        $self->{pretty} = 'nice';
    } else {
        $self->{pretty} = 'none';
    }
    return $self->{pretty};
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<WWW::Google::SiteMap::Robot>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,189 @@
package WWW::Google::SiteMap::Index;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Index - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Index;
my $index = WWW::Google::SiteMap::Index->new(
file => 'sitemap-index.gz',
);
$index->add(WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
lastmod => '2005-11-01',
));
=head1 DESCRIPTION
A sitemap index is used to point Google at your sitemaps if you have more
than one of them.
=cut
use strict;
use warnings;
use base 'WWW::Google::SiteMap';
=head1 METHODS
=over 4
=item new()
Creates a new WWW::Google::SiteMap::Index object.
my $index = WWW::Google::SiteMap::Index->new(
file => 'sitemap-index.gz',
);
=item read()
Read a sitemap index in to this object. If a filename is specified, it will
be read from that file, otherwise it will be read from the file that was
specified with the file() method. Reading of compressed files is done
automatically if the filename ends with .gz.
=item write([$file]);
Write the sitemap index out to the file. If a filename is specified, it will
be written to that file, otherwise it will be written to the file that was
specified with the file() method. Writing of compressed files is done
automatically if the filename ends with .gz
=item urls()
Return the L<WWW::Google::SiteMap::URL> objects that make up the sitemap index.
=item add($item,[$item...]);
Add the L<WWW::Google::SiteMap::URL> items listed to the sitemap index.
If you pass hashrefs instead of L<WWW::Google::SiteMap::URL> objects, it
will turn them into objects for you. If the first item you pass is a
simple scalar that matches \w, it will assume that the values passed are
a hash for a single object. If the first item passed matches m{^\w+://}
(i.e. it looks like a URL) then all the arguments will be treated as URLs,
and L<WWW::Google::SiteMap::URL> objects will be constructed for them, but only
the loc field will be populated.
This means you can do any of these:
# create the WWW::Google::SiteMap::URL object yourself
my $url = WWW::Google::SiteMap::URL->new(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
);
$map->add($url);
# or
$map->add(
{ loc => 'http://www.jasonkohles.com/sitemap1.gz' },
{ loc => 'http://www.jasonkohles.com/sitemap2.gz' },
{ loc => 'http://www.jasonkohles.com/sitemap3.gz' },
);
# or
$map->add(
loc => 'http://www.jasonkohles.com/sitemap1.gz',
priority => 1.0,
);
# or even something funkier
$map->add(qw(
http://www.jasonkohles.com/
http://www.jasonkohles.com/software/www-google-sitemap/
http://www.jasonkohles.com/software/geo-shapefile/
http://www.jasonkohles.com/software/text-fakedata/
));
foreach my $url ($map->urls) { $url->lastmod('2005-11-01') }
=item xml();
Return the xml representation of the sitemap index.
=cut
# Serialize the sitemap index to XML.  Same structure as the parent
# class's xml(), but with a <sitemapindex> root, the siteindex.xsd
# schema, and <sitemap> children restricted to loc and lastmod fields.
sub xml {
my $self = shift;
my $xml = XML::Twig::Elt->new('sitemapindex', {
'xmlns' => 'http://www.google.com/schemas/sitemap/0.84',
'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
'xsi:schemaLocation' => join(' ',
'http://www.google.com/schemas/sitemap/0.84',
'http://www.google.com/schemas/sitemap/0.84/siteindex.xsd',
),
});
# Render each entry as <sitemap> with only loc/lastmod children.
foreach($self->urls) {
$_->as_elt('sitemap',qw(loc lastmod))->paste(last_child => $xml);
}
$xml->set_pretty_print($self->pretty);
my $header = '<?xml version="1.0" encoding="UTF-8"?>';
if($self->pretty) { $header .= "\n" }
return $header.$xml->sprint();
}
=item file();
Get or set the filename associated with this object. If you call read() or
write() without a filename, this is the default.
=item pretty()
Set this to a true value to enable 'pretty-printing' on the XML output. If
false (the default) the XML will be more compact but not as easily readable
for humans (Google and other computers won't care what you set this to).
If you set this to a 'word' (something that matches /[a-z]/i), then that
value will be passed to XML::Twig directly (see the L<XML::Twig> pretty_print
documentation). Otherwise if a true value is passed, it means 'nice', and a
false value means 'none'.
Returns the value it was set to, or the current value if called with no
arguments.
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-Sitemap>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html#sitemapFileRequirements>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,197 @@
package WWW::Google::SiteMap::Ping;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Ping - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Ping;
my $ping = WWW::Google::SiteMap::Ping->new(
'http://www.jasonkohles.com/sitemap.gz',
);
$ping->submit;
print "These pings succeeded:\n";
foreach($ping->success) {
print "$_: ".$ping->status($_)."\n";
}
print "These pings failed:\n";
foreach($ping->failure) {
print "$_: ".$ping->status($_)."\n";
}
=head1 DESCRIPTION
This module makes it easy to notify Google that your sitemaps, or sitemap
indexes, have been updated. See L<WWW::Google::SiteMap> and
L<WWW::Google::SiteMap::Index> for tools to help you create sitemaps and
indexes.
=cut
use strict;
use warnings;
use LWP::UserAgent;
use URI::Escape qw(uri_escape);
=head1 METHODS
=over 4
=item new();
Create a new WWW::Google::SiteMap::Ping object. Can be given a list of
URLs which refer to sitemaps or sitemap indexes, these URLs will simply
be passed to url().
=cut
# Construct a new ping object.  Any arguments are sitemap/index URLs and
# are queued for submission via add_urls().
sub new {
    my ($class, @urls) = @_;
    my $self = bless {}, ref($class) || $class;
    $self->{urls} = {};
    $self->add_urls(@urls);
    return $self;
}
=item add_urls(@urls);
Add one or more urls to the list of URLs to submit to Google.
=cut
# Queue one or more URLs for submission, marking each as PENDING unless
# it already has a recorded status from a previous submit().
sub add_urls {
    my ($self, @urls) = @_;
    $self->{urls}->{$_} ||= 'PENDING' for @urls;
}
=item urls();
Return the list of urls that will be (or were) submitted to google.
=cut
# Return the list of URLs that will be (or were) submitted.
sub urls {
    my $self = shift;
    return keys %{ $self->{urls} };
}
=item submit
Submit the urls to Google, returns the number of successful submissions. This
module uses L<LWP::UserAgent> for the web-based submissions, and will honor
proxy settings in the environment. See L<LWP::UserAgent> for more information.
=cut
# Submit each queued URL to Google's ping endpoint.  Records a per-URL
# status ('SUCCESS' or the HTTP status line) and returns the number of
# successful submissions.
sub submit {
my $self = shift;
my $ua = $self->user_agent();
my $success = 0;
foreach my $url ($self->urls) {
# The sitemap URL is passed as an escaped query parameter.
my $ping = "http://www.google.com/webmasters/sitemaps/ping?".
"sitemap=".uri_escape($url);
my $response = $ua->get($ping);
if($response->is_success) {
$self->{urls}->{$url} = 'SUCCESS';
$success++;
} else {
# Keep the status line so failure()/status callers can see why.
$self->{urls}->{$url} = $response->status_line;
}
}
return $success;
}
=item success();
Return the URLs that were successfully submitted. Note that success only
means that the request was successfully received by Google, it does not
mean your sitemap was found, loaded or parsed successfully. If you want
to know whether your sitemap was loaded or parsed successfully, you have
to go to L<http://www.google.com/webmasters/sitemaps> and check the status
there.
=cut
# Return the URLs whose last submission was recorded as 'SUCCESS'.
sub success {
    my $self = shift;
    my $status_of = $self->{urls};
    return grep { $status_of->{$_} eq 'SUCCESS' } keys %{$status_of};
}
=item failure();
Return the URLs that were not successfully submitted.
=cut
# Return the URLs not recorded as 'SUCCESS' -- this includes URLs whose
# submission failed as well as PENDING ones never submitted.
sub failure {
    my $self = shift;
    my $status_of = $self->{urls};
    return grep { $status_of->{$_} ne 'SUCCESS' } keys %{$status_of};
}
=item user_agent();
If called with no arguments, will return the current L<LWP::UserAgent> object
which will be used to access the web-based submission. If called with an
argument, you can set the user agent that will be used in case you need to
give it special arguments. It must be a L<LWP::UserAgent> object.
If you call submit without having provided a user agent, one will be created
for you that is a basic L<LWP::UserAgent> object, which honors proxy settings
in the environment.
=cut
# Get/set the LWP::UserAgent used for submissions.  A default agent that
# honors proxy environment settings (with a 10 second timeout) is built
# lazily on first use.
sub user_agent {
    my $self = shift;
    $self->{_ua} = shift if @_;
    unless ($self->{_ua}) {
        my $ua = LWP::UserAgent->new();
        $ua->env_proxy;
        $ua->timeout(10);
        $self->{_ua} = $ua;
    }
    return $self->{_ua};
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<http://www.google.com/webmasters/sitemaps/docs/en/submit.html#ping>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,582 @@
package WWW::Google::SiteMap::Robot;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::Robot - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap::Robot;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => 'www.jasonkohles.com',
restrict => qr{^http://www.jasonkohles.com/},
starting_url => ['/index.html','/google-me.html'],
delay => 1, # delay in minutes
sitemap_file => '/var/www/html/sitemap.gz',
sitemap_url => 'http://www.jasonkohles.com/sitemap.gz',
user_agent => 'MyOwnSpider/1.0',
);
$robot->run();
=head1 DESCRIPTION
This is a simple robot class which subclasses L<LWP::RobotUA> to create a
web-crawling spider. By giving it the URL to your home page, it will crawl
all the pages it can find and create a sitemap for them.
=cut
use strict;
use warnings;
use WWW::Mechanize;
use WWW::RobotRules;
use Carp qw(croak);
use POSIX qw(strftime);
use WWW::Google::SiteMap;
use WWW::Google::SiteMap::Ping;
=head1 METHODS
=over 4
=item new();
Create a new WWW::Google::SiteMap::Robot object.
=cut
# Construct a new robot.  'domain' is required.  domain() and
# status_storage() are applied first because other accessors (seeding
# the TO-DO list, the built-in domain restriction) depend on them.
sub new {
    my ($class, %args) = @_;
    my $self = bless {}, ref($class) || $class;
    croak "No domain specified" unless $args{domain};
    for my $early (qw(domain status_storage)) {
        $self->$early(delete $args{$early});
    }
    $self->$_($args{$_}) for keys %args;
    return $self;
}
=item domain();
Get/Set the domain name of the server you want to spider. This is used both
to create the initial URLs to put in the TO-DO list, as well as to create a
built-in restriction that prevents the robot from leaving your site.
Google doesn't allow a sitemap to refer to URL's that are outside the domain
that the sitemap was retrieved for, so there really isn't any benefit in
allowing the robot to cross multiple domains. If you really think you need
to do this, you probably really just want more than one robot. If you are
absolutely certain you want to cross domain boundaries, then you'll have to
subclass this module, and Google will probably reject your sitemaps.
=cut
# Accessor/mutator for the domain being spidered; used both to seed the
# starting URLs and for the built-in stay-on-site restriction.
sub domain {
    my $self = shift;
    $self->{domain} = shift if @_;
    return $self->{domain};
}
=item restrict();
Get/Set the url restrictions. The restriction list can be any of the
following:
=over 4
=item A list reference (or a list)
A list reference is assumed to contain a list of any of the following types.
When passed as an argument to the constructor it has to be a reference, but
when you are calling restrict() as a method, you can pass it a list, and it
will turn it into a list reference. If you provide more than one restrict
item in a list, the first one to return true will cause the rest of them to
be skipped, so the URL will be restricted (skipped) if any of the items are
true (if you want more complexity than that, then just use a code reference
by itself, which can do whatever it wants.)
=item A code reference
If you give restrict a code reference, it will be passed the URL that is
about to be spidered, if the code returns a true value, the URL will be
skipped. If it returns false, it will not be restricted.
=item A regexp reference
If you give it a regexp reference, then the regexp will be applied to the
URL about to be spidered, if the regexp matches, then the URL will be
skipped.
=back
If called with no arguments, it will return the current list of restrictions.
There are built-in restrictions that are always applied at the end of your
restriction list. One is a url regexp that matches your domain name, to
prevent the robot from leaving your site (it's qr{^\w+://YOUR_DOMAIN/}).
The other is a restriction that excludes any URLs that are not allowed by
your robots.txt. This module doesn't provide any method for ignoring the
robots.txt restriction (because it's dangerous), you should really modify
your robots.txt to allow this robot to bypass any of the restrictions you
don't want it to honor.
For example, if your robots.txt contains:
User-Agent: *
Disallow: /admin
Disallow: /google-stuff
Then those two paths will not be included in your sitemap. If you decided
you actually did want /google-stuff to appear in your sitemap, you could add
this to your robots.txt:
User-Agent: WWWGoogleSiteMapRobot
Disallow: /admin
=cut
# Get/set the list of URL restrictions (regexp refs and/or code refs).
# Setting replaces the whole list; always returns the current list.
sub restrict {
    my $self = shift;
    if (@_) { $self->{restrict} = \@_ }
    $self->{restrict} = [] unless $self->{restrict};
    return @{ $self->{restrict} };
}
=item starting_url();
If called with one or more arguments, they are assumed to be URLs which will
seed the spider. The spider continues to run as long as there are URLs in
its "TO-DO" list, this method simply adds items to that list. The arguments
to starting_url are just the filename part of the url, if you don't specify
one, it defaults to '/'.
You can pass it either a list of URLs, or a list reference (so you can use
a list reference in the constructor.)
=cut
# Get/set the starting URL path(s) used to seed the TO-DO list.  Setting
# immediately seeds the status storage.  Defaults to ['/'].
sub starting_url {
    my $self = shift;
    if (@_) {
        $self->{starting_url} = \@_;
        $self->_populate_starting_urls;
    }
    $self->{starting_url} ||= ['/'];
    return $self->{starting_url};
}
# Seed the status storage with the starting URLs (prefixed with
# http://<domain>), marking each as unvisited (empty status).  List
# references are flattened recursively; false/empty entries are skipped.
sub _populate_starting_urls {
    my ($self, @items) = @_;
    @items = $self->starting_url() unless @items;
    for my $item (@items) {
        next unless $item;
        if (ref $item) {
            $self->_populate_starting_urls(@$item);
        } else {
            my $url = "http://" . $self->domain . $item;
            $self->{storage}->{$url} ||= '';
        }
    }
}
=item delay();
Get or set the delay (in minutes) to wait between requests. The default is
1 minute, and if you want to hammer on your web server you can set this to
a value less than 1.
=cut
# Get/set the delay between requests, in minutes (may be fractional).
# Defaults to 1 minute when unset or set to a false value.
sub delay {
    my $self = shift;
    $self->{delay} = shift if @_;
    return $self->{delay} || 1;
}
=item sitemap_file();
Sets the filename to save the L<WWW::Google::SiteMap> object to. This is
required.
=cut
# Accessor/mutator for the file the generated sitemap is written to.
sub sitemap_file {
    my $self = shift;
    $self->{sitemap_file} = shift if @_;
    return $self->{sitemap_file};
}
=item sitemap_url();
Sets the url for the sitemap. This is optional, but if you specify it, then
the robot will notify Google (using L<WWW::Google::SiteMap::Ping>) after it
writes a new sitemap.
=cut
# Accessor/mutator for the public sitemap URL; when set, write_sitemap()
# pings Google after writing a new sitemap.
sub sitemap_url {
    my $self = shift;
    $self->{sitemap_url} = shift if @_;
    return $self->{sitemap_url};
}
=item user_agent();
Set the User Agent that this robot uses to identify itself. The default is
'WWWGoogleSiteMapRobot/version' (unless you have subclassed this module, it's
actually the class name with special characters removed.)
Be careful about changing this while the robot is active (this includes
changing it between runs if you are storing the state) as this affects how
your robot interprets your robots.txt file.
=cut
# Get/set the User-Agent string this robot identifies itself with.
# Defaults to the class name with non-word characters stripped, plus the
# module version (e.g. 'WWWGoogleSiteMapRobot/1.10').  Note this string
# also determines which robots.txt rules apply to the robot.
sub user_agent {
    my $self = shift;
    $self->{user_agent} = shift if @_;
    unless ($self->{user_agent}) {
        (my $agent = ref($self) || $self) =~ s/\W//g;
        $self->{user_agent} = join('/', $agent, $VERSION);
    }
    return $self->{user_agent};
}
=item robot_rules();
Get or set the L<WWW::RobotRules> object used to handle robots.txt.
=cut
# Lazily fetch and parse the site's robots.txt into a WWW::RobotRules
# object keyed on our user_agent() string.
sub robot_rules {
my $self = shift;
if(@_) { $self->{robot_rules} = shift }
unless($self->{robot_rules}) {
$self->{robot_rules} = WWW::RobotRules->new($self->user_agent);
my $url = "http://".$self->domain."/robots.txt";
my $mech = $self->mechanize();
$mech->get($url);
# NOTE(review): the fetch result is not checked here; a failed GET
# would parse the error page body as robots.txt -- confirm whether a
# $mech->success guard is wanted.
$self->{robot_rules}->parse($url,$mech->content);
}
return $self->{robot_rules};
}
=item mechanize();
Get or set the L<WWW::Mechanize> object used for retrieving web documents.
=cut
# Get/set the WWW::Mechanize object used to retrieve web documents.  A
# default is built lazily with our user agent and a shallow history
# stack (stack_depth 1) so crawling doesn't accumulate memory.
sub mechanize {
    my $self = shift;
    $self->{mech} = shift if @_;
    $self->{mech} ||= WWW::Mechanize->new(
        agent       => $self->user_agent,
        stack_depth => 1,
    );
    return $self->{mech};
}
=item status_storage();
If you provide status_storage with a tied hash, it will be used to store the
state of the TO-DO list which includes the data needed to build the sitemap,
as well as the list of unvisited URLs. This means that the robot can continue
where it left off if it is interrupted for some reason before finishing, then
you don't have to re-spider the entire site. This is strongly recommended.
You can use this with basically anything that can be implemented as a tied
hash, as long as it can handle fully-qualified URLs as keys, the values will
be simple scalars (it won't try to store references or anything like that
in the values.)
Example:
use WWW::Google::SiteMap::Robot;
use GDBM::File;
tie my %storage, 'GDBM_File', '/tmp/my-robot-status', &GDBM_WRCREAT, 0640;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => 'www.jasonkohles.com',
status_storage => \%storage,
restrict => qr{^http://www.jasonkohles.com/},
starting_url => '/index.html',
sitemap_file => '/var/www/html/sitemap.gz',
);
If you don't provide a tied hash to store the status in, it will be stored in
a normal (in-memory) hash.
=cut
# Get/set the (possibly tied) hash used to persist crawl status.  Keys
# are fully-qualified URLs; values are simple status scalars.  Falls
# back to a plain in-memory hash.  Either way the starting URLs are
# (re)seeded, since a replacement hash may not contain them.
sub status_storage {
    my $self = shift;
    if (@_) {
        $self->{storage} = shift;
        $self->_populate_starting_urls;
    }
    unless ($self->{storage}) {
        $self->{storage} = {};
        $self->_populate_starting_urls;
    }
    return $self->{storage};
}
=item pending_urls();
Return a list of all the URLs that have been found, but have not yet been
visited. This may include URLs that will later be restricted, and will not
be visited.
=cut
# Return every URL in the TO-DO list that has not been visited yet
# (status still empty/false).  May include URLs that will later be
# skipped as restricted.
sub pending_urls {
    my $self = shift;
    my $status_of = $self->status_storage;
    return grep { !$status_of->{$_} } keys %{$status_of};
}
=item restricted_urls();
Return a list of all the URLs that are in the TO-DO list that have already
been tried, but were skipped because they were restricted.
=cut
# Return the URLs that were tried but skipped because they matched a
# restriction.
sub restricted_urls {
    my $self = shift;
    # run() stores the bare string 'RESTRICTED' (no trailing detail), so
    # the old pattern qr/^RESTRICTED /o (trailing space) never matched
    # anything and this method always returned an empty list.
    $self->_url_data_match(qr/^RESTRICTED\b/);
}
=item visited_urls();
Return a list of all the URLs that have already been visited, and will be
included in the sitemap.
=cut
# Return the URLs that were successfully fetched and will be included
# in the sitemap.
sub visited_urls {
    my $self = shift;
    # run() records successful fetches as "SUCCESS <modtime>", so the
    # old qr/^OK /o pattern never matched and this method always
    # returned an empty list.
    $self->_url_data_match(qr/^SUCCESS /);
}
=item run();
Start the robot running. If you are building your robot into a larger
program that has to handle other tasks as well, then you can pass an integer
to run(), which will be the number of URLs to check (of course then you will
have to call it again later, probably in a loop, to make sure you get them
all.) Returns true if something was done, returns false if no pending URLs
were found in the TO-DO list. Calling start() again after it has returned
false is rather pointless. If you call it in a loop as part of a larger
program, you are also responsible for calling write_sitemap() after all the
data is collected.
If called with no arguments (or a false argument) it will run until there are
no more URLs to process.
=cut
# Main crawl loop.  Pulls pending URLs from the status storage, skips
# already-done or restricted ones, fetches the rest, records status, and
# enqueues newly discovered links.  With a numeric argument, processes a
# limited number of iterations (caller must then call write_sitemap()).
sub run {
my $self = shift;
my $count = shift;
my $counter = $count;
my @waiting = $self->pending_urls;
my $mech = $self->mechanize;
while(1) {
sleep($self->delay * 60); # sleep first, because of all the nexts
unless(@waiting) { @waiting = $self->pending_urls }
if(my $url = shift(@waiting)) {
# first make sure we didn't already do it
next if $self->{storage}->{$url};
# Then make sure it isn't restricted
if($self->_check_restrictions($url)) {
$self->{storage}->{$url} = 'RESTRICTED';
next;
}
$mech->get($url);
if($mech->success) {
# extract the last modification time from the page
my $modtime = $mech->response->last_modified()
|| (time - $mech->response->current_age);
$self->{storage}->{$url} = "SUCCESS $modtime";
# add any links in the page to our todo list
foreach($mech->links) {
my $url = $_->url_abs;
$url =~ s/#[^#]+$//;
$self->{storage}->{$url} ||= '';
}
} else {
$self->{storage}->{$url} = 'ERROR '.$mech->status();
}
# NOTE(review): this 'next' restarts the loop before the $count
# bookkeeping below, so when run($n) is used the counter is only
# decremented on iterations that processed no URL -- the documented
# URL limit is not actually enforced.  Confirm intent before fixing.
next;
}
if($count) {
last unless $counter--;
} else {
last unless @waiting;
}
}
unless($count) { # if you are limiting, you have to do this part yourself
$self->write_sitemap() if $self->sitemap_file();
}
}
# Return true if $url must be skipped: non-http(s) scheme, matching a
# user restriction (regexp or code ref), leaving the configured domain,
# or disallowed by robots.txt.
sub _check_restrictions {
    my $self = shift;
    my $url = shift;
    # some hard-coded restrictions for safety sake
    if($url !~ /^(http|https):/) {
        return 1;
    }
    foreach my $r ($self->restrict) {
        if(ref($r) eq 'Regexp' && $url =~ /$r/) {
            return 1;
        }
        if(ref($r) eq 'CODE' && $r->($url)) {
            return 1;
        }
    }
    # Never leave the configured domain.  \Q...\E so dots in the domain
    # can't match arbitrary characters, and no /o so several robots with
    # different domains don't share one frozen compiled pattern.
    my $domain = $self->domain;
    if($url !~ m{^\w+://\Q$domain\E}) {
        return 1;
    }
    unless($self->robot_rules->allowed($url)) {
        return 1;
    }
    return 0;
}
=item write_sitemap();
Write out the sitemap (if a sitemap file was specified), and optionally notify
Google (if a sitemap url was specified).
=cut
# Build a WWW::Google::SiteMap from every successfully-crawled URL in
# storage, write it to sitemap_file(), and (if sitemap_url() is set)
# ping Google about the update.
sub write_sitemap {
my $self = shift;
my $map = WWW::Google::SiteMap->new(
file => $self->sitemap_file,
pretty => 1,
);
while(my($url,$val) = each(%{$self->{storage}})) {
# Only entries recorded as "SUCCESS <modtime>" by run() are included.
next unless $val =~ /^SUCCESS /;
# NOTE(review): <modtime> is a raw epoch value from run(); presumably
# WWW::Google::SiteMap::URL::lastmod formats it -- confirm, since the
# sitemap protocol expects W3C datetime strings.
my(undef,$lastmod) = split(' ',$val);
$map->add(WWW::Google::SiteMap::URL->new(
loc => $url,
lastmod => $lastmod,
));
}
$map->write;
if($self->sitemap_url) {
my $ping = WWW::Google::SiteMap::Ping->new($self->sitemap_url);
$ping->submit;
}
}
# Return every URL in storage whose status value matches $regexp.
sub _url_data_match {
    my $self = shift;
    my $regexp = shift;
    my $todo = $self->status_storage;
    # The original matched with /^$regexp/o: the /o flag froze the
    # compiled pattern to whichever $regexp arrived first, so e.g.
    # visited_urls() called after restricted_urls() silently matched
    # against the wrong pattern.  Callers pass pre-anchored qr// objects,
    # so matching directly is both correct and faster.
    return grep {
        $todo->{$_} && $todo->{$_} =~ $regexp
    } keys %{$todo};
}
=back
=head1 EXAMPLE ROBOT
#!/usr/bin/perl -w
##################
use strict;
use warnings;
use lib 'lib';
use WWW::Google::SiteMap::Robot;
use GDBM_File;
foreach my $site (qw(www.example.com www.example2.com www.example3.com)) {
my $status = "/tmp/sitemap-robot-status.$site.db";
tie my %storage, 'GDBM_File', $status, &GDBM_WRCREAT, 0640;
my $robot = WWW::Google::SiteMap::Robot->new(
domain => $site,
status_storage => \%storage,
sitemap_file => "/var/www/$site/sitemap.gz",
sitemap_url => "http://$site/sitemap.gz",
);
$robot->run();
}
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>
L<WWW::Mechanize>
L<WWW::RobotRules>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__

View File

@@ -0,0 +1,294 @@
package WWW::Google::SiteMap::URL;
use vars qw($VERSION); $VERSION = '1.10';
=head1 NAME
WWW::Google::SiteMap::URL - DEPRECATED - See Search::Sitemap
=head1 DEPRECATED
Now that more search engines than just Google are supporting the Sitemap
protocol, the WWW::Google::SiteMap module has been renamed to
L<Search::Sitemap>.
=head1 SYNOPSIS
use WWW::Google::SiteMap;
=head1 DESCRIPTION
This is a helper class that supports L<WWW::Google::SiteMap> and
L<WWW::Google::SiteMap::Index>.
=cut
=head1 METHODS
=over 4
=cut
use strict;
use warnings;
use Carp qw(carp croak);
use XML::Twig qw();
use POSIX qw(strftime);
use HTML::Entities qw(encode_entities);
=item new()
=cut
# Construct a URL object.  Accepts either a hashref or a flat list of
# key/value pairs; each pair is applied through its accessor method so
# that the same validation runs as for later mutations.
sub new {
    my $class = shift;
    my %args  = ref($_[0]) ? %{ $_[0] } : @_;

    my $self = bless {}, $class;
    $self->$_( $args{$_} ) for keys %args;
    return $self;
}
=item loc()
Change the URL associated with this object. For a L<WWW::Google::SiteMap>
this specifies the URL to add to the sitemap, for a
L<WWW::Google::SiteMap::Index>, this is the URL to the sitemap.
=cut
# Get/set the URL this object describes.  The value must be a fully
# qualified http(s) URL under 2048 characters; validation errors are
# handled by _doval (warn when lenient, die otherwise).
sub loc {
    my $self = shift;
    return $self->_doval('loc', sub {
        my $url = shift;
        return unless defined $url;
        return 'must be less than 2048 characters long'
            if length($url) >= 2048;
        return 'must be a fully qualified url'
            if $url !~ m{^https?://};
        return;
    }, @_);
}
=item changefreq()
Set the change frequency of the object. This field is not used in sitemap
indexes, only in sitemaps.
=cut
# Get/set the expected change frequency.  Must be one of the fixed set of
# keywords defined by the sitemap protocol.
sub changefreq {
    shift->_doval('changefreq', sub {
        local $_ = shift;
        my @values = qw(always hourly daily weekly monthly yearly never);
        my $re = join('|', @values);
        return unless defined;
        # BUG FIX: the alternation must be grouped before anchoring.
        # The old /^$re$/ expanded to /^always|hourly|...|never$/, where
        # '^' bound only to 'always' and '$' only to 'never', so any
        # string merely *containing* e.g. 'hourly' was accepted.
        return 'must be one of ' . join(', ', @values)
            unless /^(?:$re)$/;
        return;
    }, @_);
}
=item lastmod()
Set the last modified time. You have to provide this as one of the following:
=over 4
=item a complete ISO8601 time string
A complete time string will be accepted in exactly this format:
YYYY-MM-DDTHH:MM:SS+TZ:TZ
YYYY   - 4-digit year
MM     - 2-digit month (zero padded)
DD     - 2-digit day (zero padded)
T      - literal character 'T'
HH     - 2-digit hour (24-hour, zero padded)
MM     - 2-digit minute (zero padded)
SS     - 2-digit second (zero padded)
+TZ:TZ - Timezone offset (hours and minutes from GMT, 2-digit, zero padded)
=item epoch time
Seconds since the epoch, such as would be returned from time(). If you provide
an epoch time, then an appropriate ISO8601 time will be constructed with
gmtime() (which means the timezone offset will be +00:00). If anyone knows
of a way to determine the timezone offset of the current host that is
cross-platform and doesn't add dozens of dependencies then I might change this.
=item an ISO8601 date (YYYY-MM-DD)
A simple date in YYYY-MM-DD format. The time will be set to 00:00:00+00:00.
=item a L<DateTime> object.
If a L<DateTime> object is provided, then an appropriate timestamp will be
constructed from it.
=item a L<HTTP::Response> object.
If given an L<HTTP::Response> object, the last modified time will be
calculated from whatever time information is available in the response
headers. Currently this means either the Last-Modified header, or the
current time minus the current_age() calculated by the response object.
This is useful for building web crawlers.
=back
Note that in order to conserve memory, any of these items that you provide
will be converted to a complete ISO8601 time string when they are stored.
This means that if you pass an object to lastmod(), you can't get it back
out. If anyone actually has a need to get the objects back out, then I
might make a configuration option to store the objects internally.
If you have suggestions for other types of date/time objects or formats
that would be useful, let me know and I'll consider them.
=cut
# Get/set the last-modified time.  Accepts a DateTime object, an
# HTTP::Response object, epoch seconds, a YYYY-MM-DD date, or a complete
# ISO8601 timestamp; whatever is given is normalized to a full ISO8601
# string before being stored.  Invalid values go through _err (warn when
# lenient, die otherwise).
sub lastmod {
    my $self = shift;
    return $self->{lastmod} unless @_;
    my $value = shift;
    if (ref($value)) {
        if ($value->isa('DateTime')) { # DateTime object
            # Normalize strftime's '+hhmm' zone offset to '+hh:mm'.
            my ($date, $tzoff) = $value->strftime("%Y-%m-%dT%T", "%z");
            if ($tzoff =~ /^([+-])?(\d\d):?(\d\d)/) {
                $tzoff = ($1 || '+') . $2 . ':' . ($3 || '00');
            } else {
                $tzoff = '+00:00';
            }
            $self->{lastmod} = $date . $tzoff;
        } elsif ($value->isa('HTTP::Response')) {
            my $modtime = $value->last_modified()
                || (time - $value->current_age());
            # BUG FIX: this previously formatted gmtime($_), but $_ is
            # never set in this branch; the computed $modtime is what
            # must be formatted.
            $self->{lastmod} = strftime("%Y-%m-%dT%T+00:00", gmtime($modtime));
        }
    } else {
        local $_ = $value;
        if (/^\d+$/) { # epoch time
            $self->{lastmod} = strftime("%Y-%m-%dT%T+00:00", gmtime($_));
        } elsif (/^\d\d\d\d-\d\d-\d\d$/) { # bare ISO8601 date
            $self->{lastmod} = $_ . 'T00:00:00+00:00';
        } elsif (/^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\+\d\d:\d\d$/) {
            $self->{lastmod} = $_; # already a complete timestamp
        }
    }
    return $self->{lastmod} if $self->{lastmod};
    # BUG FIX: report $value rather than $_ -- $_ is unset when a
    # reference of an unsupported class was passed.
    $self->_err("'$value' is not a valid value for lastmod");
}
=item priority()
Set the priority. This field is not used in sitemap indexes, only in sitemaps.
=cut
# Get/set the priority, a number between 0.0 and 1.0 inclusive.
# Validation errors are handled by _doval (warn when lenient, die
# otherwise).  Not used in sitemap indexes, only in sitemaps.
sub priority {
    my $self = shift;
    return $self->_doval('priority', sub {
        my $p = shift;
        return unless defined $p;
        return 'must be a number'         if $p !~ /^[\d\.]+$/;
        return 'must be greater than 0.0' if $p < 0.0;
        return 'must be less than 1.0'    if $p > 1.0;
        return;
    }, @_);
}
# Generic validated accessor.  With no value argument, returns the stored
# attribute.  With a value, runs the validator: an error string triggers
# carp (lenient) or croak (strict); an empty return stores the value.
sub _doval {
    my $self      = shift;
    my $attr      = shift;
    my $validator = shift;
    return $self->{$attr} unless @_;

    my $candidate = shift;
    my $problem   = $validator->($candidate);
    if ($problem) {
        my $msg = "'$candidate' is not a valid value for $attr: $problem";
        $self->{lenient} ? carp($msg) : croak($msg);
    } else {
        $self->{$attr} = $candidate;
    }
}
# Report an error: lenient objects warn (carp), strict objects die (croak).
sub _err {
    my $self = shift;
    return carp(@_) if $self->{lenient};
    croak(@_);
}
=item delete()
Delete this object from the sitemap or the sitemap index.
=cut
# Blank every attribute on this object so it is omitted from the sitemap.
# Note: values are set to undef; the keys themselves remain.
sub delete {
    my $self = shift;
    $self->{$_} = undef for keys %{$self};
}
=item lenient()
If lenient contains a true value, then errors will not be fatal.
=cut
# Get/set the lenient flag: when true, validation errors warn instead of
# dying.
sub lenient {
    my $self = shift;
    ($self->{lenient}) = @_ if @_;
    return $self->{lenient};
}
# Render this object as an XML::Twig::Elt of the given element type
# (default 'url'), containing one child element per populated field.
# The loc value is entity-encoded and marked asis so the encoded form is
# emitted verbatim.
sub as_elt {
    my $self   = shift;
    my $type   = shift || 'url';
    my @fields = @_;
    unless (@fields) { @fields = qw(loc changefreq lastmod priority) }

    my @elements = ();
    foreach my $field (@fields) {
        my $value = $self->$field();
        # BUG FIX: skip only unset/empty fields.  The old '|| next' test
        # was on truth, which silently dropped a valid priority of 0
        # from the generated XML.
        next unless defined $value && length $value;
        my $elt;
        if ($field eq 'loc') {
            $elt = XML::Twig::Elt->new('#PCDATA' => encode_entities($value));
            $elt->set_asis(1);
        } else {
            $elt = XML::Twig::Elt->new('#PCDATA' => $value);
        }
        push(@elements, $elt->wrap_in($field));
    }
    return XML::Twig::Elt->new($type, {}, @elements);
}
=back
=head1 MODULE HOME PAGE
The home page of this module is
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap>. This is where you
can always find the latest version, development versions, and bug reports. You
will also find a link there to report bugs.
=head1 SEE ALSO
L<WWW::Google::SiteMap>
L<WWW::Google::SiteMap::Index>
L<WWW::Google::SiteMap::Ping>
L<http://www.jasonkohles.com/software/WWW-Google-SiteMap/>
L<https://www.google.com/webmasters/sitemaps/docs/en/protocol.html>
=head1 AUTHOR
Jason Kohles, E<lt>email@jasonkohles.comE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2005 by Jason Kohles
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.
=cut
1;
__END__