The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

# $Id: Wishlist.pm,v 2.6 2009-08-26 22:04:49 Martin Exp $

package WWW::Amazon::Wishlist;

use strict;
use vars qw( @ISA @EXPORT @EXPORT_OK );

use Carp;
use Data::Dumper;
use HTML::TreeBuilder;
use LWP::UserAgent;

use constant COM => 0;
use constant UK  => 1;

use constant DEBUG => 0;
use constant DEBUG_HTML => 0;

require Exporter;

@ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
@EXPORT = qw(
);

@EXPORT_OK = qw(
        get_list
        UK
        COM
);

our
$VERSION = do { my @r = (q$Revision: 2.6 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r };


=pod

=head1 NAME

WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist

=head1 SYNOPSIS

  use WWW::Amazon::Wishlist qw(get_list COM UK);

  my @wishlist;

  @wishlist = get_list($my_amazon_com_id);       # gets it from amazon.com
  @wishlist = get_list($my_amazon_com_id,  COM); # same, explicitly
  @wishlist = get_list($my_amazon_couk_id, UK);  # gets it from amazon.co.uk

  # Or, if you didn't import the COM and UK constants:
  @wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);

  # The elements of @wishlist are hashrefs that contain the following elements:
  foreach my $book (@wishlist)
    {
    print $book->{title}, # the, err, title
    $book->{author},      # and the author(s) 
    $book->{asin},        # the asin number, its unique id on Amazon
    $book->{price},       # how much it will set you back
    $book->{quantity},    # how many you said you want
    $book->{priority},    # how urgently you said you want it (1-5)
    $book->{type};        # Hardcover/Paperback/CD/DVD etc (not available in the US)
    } # foreach

=head1 DESCRIPTION

Goes to amazon.(com|co.uk), scrapes your wishlist, and returns it
in a array of hashrefs so that you can fiddle with it to your heart's
content.

=head1 GETTING YOUR AMAZON ID

The best way to do this is to search for your own wishlist in the search
tools.

Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
the URL something like

   http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...

there's some more cruft after that last string of numbers and letters
but it's the

   2EAJG83WS7YZM

bit that's important.

Doing the same for amazon.co.uk is just as easy.

Apparently, some people have had problems getting to their wishlist right
after it gets set up.  You may have to wait a while for it to become
browseable.

=head1 SHOWING YOUR APPRECIATION

There was a thread on london.pm mailing list about working in a vacuum -
that it was a bit depressing to keep writing modules but never get any
feedback.  So, if you use and like this module then please send me an
email and make my day.

All it takes is a few little bytes.


=head1 BUGS

B<IMPORTANT>

C<WWW::Amazon::Wishlist> is a screen scraper and is there for
is vulnerable to any changes that Amazon make to their HTML.

If it starts returning no items then this is very likely the reason
and I will get roudn to fixing it as soon as possible.

You might want to look at the C<Net::Amazon> module instead.

It doesn't cope with anything apart from the UK and USA versions of Amazon.

I don't think it likes unavailable items - trying to work around this
breaks UK compatability.

The code has accumulated lots of cruft.

Lack of testing.  It works for the pages I've tried it for but that's
no guarantee.

=head1 LICENSE

Copyright (c) 2003 Simon Wistow

Distributed under the same terms as Perl itself.

This software is under no warranty and will probably destroy your wish
list, kill your friends, burn your house and bring about the apocalypse

=head1 AUTHOR

Simon Wistow <simon@thegestalt.org>

=head1 SEE ALSO

L<perl>, L<LWP::UserAgent>, L<amazonwish>

=cut

my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

sub get_list
  {
  my ($id, $uk, $test) = @_;
  if (! defined $id)
    {
    croak "No ID given to get_list function\n";
    return undef;
    } # if
  # note to self ... should we UC the id? Nahhhh. Not yet.
  # Default is amazon.com:
  $uk |= 0;
  $test |= 0;
  # fairly self explanatory
  my $domain = ($uk)? "co.uk" : "com";
  # set up some variables
  my $iPage = 1;
  my @items;
  # and awaaaaaaaaaaaaay we go ....
 INFINITE:
  while (1)
    {
    my $url = $uk ? "http://www.amazon.co.uk/gp/registry/wishlist/$id/?page=$iPage" :
    "http://www.amazon.com/gp/registry/wishlist/$id/?page=$iPage";
    # This is a typical complete .com URL as of 2008-12:
    # http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
    DEBUG_HTML && print STDERR " DDD fetching wishlist for $id, page $iPage...\n";
    # Don't overwhelm the server:
    sleep(3) if (1 < $iPage);
    my $content = _fetch_page($url, $domain);
    # As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
    $content =~ s/(<tbody\s[^>\r\n]+)(\s+<)/$1>\n$2/g;
    # There seems to be a bug in HTML::TreeBuilder that causes
    # abutting tags to be skpped!?!
    $content =~ s!><!> <!g;
    if (0)
      {
      eval "use File::Slurp";
      write_file('PAGES/fetched.html', $content);
      exit 88;
      } # if
    my $iLen = length($content);
    # print STDERR " DDD fetched $iLen bytes.\n";
    return undef unless ($content);
    # print STDERR " DDD call _extract()\n";
    my $result = _extract($uk, $content);
    # print Dumper($result);
    # exit 88;
    last INFINITE if (! defined $result);
    last INFINITE if (! ref $result->{items});
 ITEM:
    foreach my $item (@{$result->{items}})
      {
      $item->{'author'} =~ s!\n!!g;
      $item->{'author'} =~ s!^\s*by\s+!!g;
      $item->{'author'} =~ s!</span></b><br />\n*!!s;
      $item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*</b>\s*(\d+)!i);
      $item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*</b>\s*(\d)!i);
      if (
          $uk
          &&
          $item->{image}
          &&
          ($item->{image} !~ m!^http:!)
         )
        {
        $item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
        } # if
      push @items, $item;
      } # foreach ITEM
    # For debug purposes:
    last INFINITE if $test;
    my $sURLNext = $result->{next};
    if (! defined $sURLNext)
      {
      # DEBUG && print STDERR " DDD content===$content===\n";
      # exit 88;
      # Use brute force to find it:
      if ($content =~ m!(&(amp;)?page=\d+)">\s*(<[^>]+>)?Next!)
        {
        DEBUG && print STDERR " DDD found next URL with brute force\n";
        $sURLNext = $1;
        } # if
      } # if
    # Paranoia:
    if (! defined $sURLNext)
      {
      DEBUG && print STDERR " WWW did not find next url\n";
      last INFINITE;
      } # if
    my $iNext = 0;
    if ($sURLNext !~ m/[;&]page=(\d+)/)
      {
      DEBUG && print STDERR " WWW next url =$sURLNext= does not contain page#\n";
      last INFINITE;
      } # if
    $iNext = $1;
    # More paranoia:
    if ($iNext <= $iPage)
      {
      DEBUG && print STDERR " WWW next url page=$iNext is not greater than current page=$iPage\n";
      last INFINITE;
      } # if
    # ...and update:
    $iPage = $iNext;
    } # while
  return @items;
  } # get_list


sub _fetch_page
  {
    my ($url, $domain) = @_;
    if (0)
      {
      eval "use File::Slurp";
      # For debugging UK site:
      return read_file('Pages/uk-2008-12-page1.html');
      # For debugging USA site:
      return read_file('Pages/2008-12.html');
      } # if 0
    # Setting up the UA here is slower but makes the code easier to read
    # really, the slow bit will not be setting up the UA each time

    # set up the UA
    my $ua = new LWP::UserAgent( keep_alive => 1, timeout => 30, agent => $USER_AGENT, );

    # setting it in the 'new' seems not to work sometimes
    $ua->agent($USER_AGENT);
    # for some reason this makes stuff work
    $ua->max_redirect( 0 );


    # make a full set of headers
    my $h = new HTTP::Headers(
                'Host'            => "www.amazon.$domain",
                'Referer'         => $url,
                'User-Agent'      => $USER_AGENT,
                'Accept'          => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
                'Accept-Language' => 'en-us,en;q=0.5',
                'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                #'Accept-Encoding' => 'gzip,deflate',
                'Keep-Alive'      =>  '300',
                'Connection'      =>  'keep-alive',
    );
    $h->referer("$url");



    my $request  =  HTTP::Request->new ( 'GET', $url, $h );
    my $response;

    my $times = 0;

    # LWP should be able to do this but seemingly fails sometimes
    while ($times++<3) {
        $response =  $ua->request($request);
        last if $response->is_success;
        if ($response->is_redirect) {
               $url = $response->header("Location");
            #$h->header("Referer", $url); 
            $h->referer("$url");
            $request  =  HTTP::Request->new ( 'GET', $url, $h );
        }
    }

    if (!$response->is_success)     {
        croak "Failed to retrieve $url";
        return undef;
    }

    return $response->content;

} # _fetch_page

# This is the HTML parsing version written by Martin Thurn:

sub _extract
  {
  # Required arg1 = whether we are parsing the UK site or not (Boolean):
  my $iUK = shift || 0;
  # Required arg2 = the HTML contents of the webpage:
  my $s = shift || '';
  DEBUG_HTML && print STDERR " DDD start _extract()\n";
  my $rh;
  my $oTree = new HTML::TreeBuilder;
  $oTree->parse($s);
  $oTree->eof;
  my @aoSPAN = $iUK ? $oTree->look_down(_tag => 'td',
                                        class => 'small',
                                       )
                    : $oTree->look_down(_tag => 'span',
                                        class => 'small productTitle',
                                       );
 SPAN_TAG:
  foreach my $oSPAN (@aoSPAN)
    {
    next SPAN_TAG unless ref $oSPAN;
    DEBUG_HTML && print STDERR " DDD found TD...\n";
    my $oA = $oSPAN->look_down(_tag => 'a');
    next SPAN_TAG unless ref $oA;
    DEBUG_HTML && print STDERR " DDD   found A...\n";
    my $sTitle = $oA->as_text;
    # Strip leading whitespace:
    $sTitle =~ s!\A\s+!!;
    # Strip trailing whitespace:
    $sTitle =~ s!\s+\Z!!;
    next SPAN_TAG unless ($sTitle =~ m!\S!);
    # Strip out zero-width spaces scattered about randomly in item titles
    $sTitle =~ s/\x{200b}//g;
    DEBUG_HTML && print STDERR " DDD found item named '$sTitle'\n";
    my $sURL = $oA->attr('href');
    DEBUG_HTML && print STDERR " DDD   URL ==$sURL==\n";
    my $sASIN = '';
    if (
        ($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
        ||
        ($sURL =~ m!/gp/product/(.+?)/ref!)
        ||
        ($sURL =~ m!/dp/(.+?)/ref!)
       )
      {
      $sASIN = $1;
      } # if
    else
      {
      DEBUG_HTML && print STDERR " EEE   url does not contain asin\n";
      }
    DEBUG_HTML && print STDERR " DDD   ASIN ==$sASIN==\n";
    # Grab the smallest-containing ancestor of this item:
    my $oParent = $oSPAN->look_up(_tag => 'tbody',
                                 class => 'itemWrapper',
                                );
    if (! ref $oParent)
      {
      DEBUG_HTML && print STDERR " WWW did not find ancestor TBODY\n";
      next SPAN_TAG;
      } # if
    my $sParent = $oParent->as_text;
    my $sParentHTML = $oParent->as_HTML;
    print STDERR " DDD   parent TR HTML ==$sParentHTML==\n" if (4 < DEBUG_HTML);
    print STDERR " DDD   parent TR ==$sParent==\n" if (4 < DEBUG_HTML);
    # Find the quantity desired, and the priority.  These are the defaults:
    my $iDesired = 1;
    my $sPriority = 'medium';
    # my @aoSPAN = $oParent->look_down(_tag => 'span',
    #                                  class => 'wl-iter-heading',
    #                                 );
    my @aoSPAN = $oParent->look_down(_tag => 'td',
                                     class => 'tiny',
                                    );
 SPAN_TAG:
    foreach my $oSPAN (@aoSPAN)
      {
      next SPAN_TAG unless ref $oSPAN;
      my $sSpan = $oSPAN->as_text;
      DEBUG_HTML && print STDERR " DDD   span=$sSpan=\n";
      if ($sSpan =~ m'DESIRED(\d+)')
        {
        $iDesired = $1;
        DEBUG_HTML && print STDERR " DDD     desired=$iDesired=\n";
        } # if
      if ($sSpan =~ m'PRIORITY(.+)\z')
        {
        $sPriority = $1;
        DEBUG_HTML && print STDERR " DDD     priority=$sPriority=\n";
        } # if
      } # foreach SPAN_TAG
    if (! $iDesired || ! $sPriority)
      {
      # See if they are encoded in a FORM:
      # Find the priority:
      if ($sParentHTML =~ m!<option selected="yes" value=([-0-9]+)>!)
        {
        $sPriority = $1;
        DEBUG_HTML && print STDERR " DDD   priority=$sPriority=\n";
        } # if
      else
        {
        DEBUG_HTML && print STDERR " WWW   did not find <option> for priority\n";
        }
      # Find the quantity desired:
      if ($sParentHTML =~ m!<input class="tiny" name="requestedQty.+?" size=\d+ type="text" value=(\d+)>!)
        {
        $iDesired = $1;
        DEBUG_HTML && print STDERR " DDD   desired=$iDesired=\n";
        } # if
      else
        {
        DEBUG_HTML && print STDERR " WWW   did not find <input> for desired-quantity\n";
        }
      } # if
    # Find the date added:
    my $sDate = '';
    if ($sParentHTML =~ m!>added\s+(.+?)<!)
      {
      $sDate = $1;
      DEBUG_HTML && print STDERR " DDD   date=$sDate=\n";
      } # if
    else
      {
      DEBUG_HTML && print STDERR " WWW   did not find text for date-added\n";
      }

    # Find the "author" of this item:
    my @aoTD;
    if ($iUK)
      {
      @aoTD = $oParent->look_down(_tag => 'td',
                                  class => 'small',
                                 );
      }
    else
      {
      @aoTD = $oParent->look_down(_tag => 'span',
                                  sub
                                    {
                                    my $sHtml = $_[0]->as_HTML;
                                    # DEBUG_HTML && print STDERR " DDD   try oTDauthor span==$sHtml==\n";
                                    my $s = $_[0]->attr('class') || q{};
                                    $s =~ m'BYLINE'i;
                                    },
                                 );
      } # else
    my $sAuthor = '';
 TD_TAG:
    foreach my $oTD (@aoTD)
      {
      next TD_TAG unless ref $oTD;
      my $s = $oTD->as_HTML;
      DEBUG_HTML && print STDERR " DDD   try oTDauthor==$s==\n";
      $s = $oTD->as_text;
      if ($s =~ s!\A\s*(by|~)\s+!!)
        {
        $sAuthor = $s;
        last TD_TAG;
        } # if
      } # foreach TD_TAG
    DEBUG_HTML && print STDERR " DDD   author=$sAuthor=\n";
    # Find the price of this item:
    my $sPrice = '';
    my $oTDprice = $oParent->look_down(_tag => 'span',
                                       sub
                                         {
                                         my $s = $_[0]->attr('class') || q{};
                                         $s =~ m'PRICE'i;
                                         },
                                      );
    if (! ref $oTDprice)
      {
      DEBUG_HTML && print STDERR " WWW did not find TD for price\n";
      # print STDERR $oParent->as_HTML;
      # exit 88;
      # next SPAN_TAG;
      } # if
    else
      {
      $sPrice = $oTDprice->as_text;
      if ($sPrice =~ m!Price:\s+(.+)\Z!)
        {
        $sPrice = $1;
        } # if
      DEBUG_HTML && print STDERR " DDD   price=$sPrice=\n";
      } # else
    # Add this item to the result set:
    my %hsItem = (
                  asin => $sASIN,
                  author => $sAuthor,
                  # image => $sImageURL,
                  price => $sPrice,
                  priority => $sPriority,
                  quantity => $iDesired,
                  title => $sTitle,
                  # type => $sType,
                 );
    DEBUG_HTML && print STDERR Dumper(\%hsItem);
    push @{$rh->{items}}, \%hsItem;
    # All done with this item:
    $oParent->detach;
    $oParent->delete;
    } # foreach SPAN_TAG
  # Look for the next-page link:
  my $oA = $oTree->look_down(_tag => 'a',
                             sub
                               {
                               my $s = $_[0]->as_text || q{};
                               $s =~ m/\A\s*Next/;
                               },
                            );
  if (ref $oA)
    {
    $rh->{next} = $oA->attr('href');
    DEBUG_HTML && print STDERR " DDD raw next URL is ==$rh->{next}==\n";
    } # if
  else
    {
    DEBUG_HTML && print STDERR " DDD did not find next URL\n";
    }
  return $rh;
  } # _extract

1;


__END__