The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

# $Id: Wishlist.pm,v 2.12 2014-11-28 15:26:36 Martin Exp $

package WWW::Amazon::Wishlist;

use strict;
use vars qw( @ISA @EXPORT @EXPORT_OK );

use Carp;
use Data::Dumper;
use HTML::TreeBuilder;
use LWP::UserAgent;

use constant COM => 0;
use constant UK  => 1;

use constant DEBUG => 0;
use constant DEBUG_HTML => 0;

require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(
);
@EXPORT_OK = qw(
        get_list
        UK
        COM
);

our
$VERSION = do { my @r = (q$Revision: 2.12 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r };

=pod

=head1 NAME

WWW::Amazon::Wishlist - grab all the details from your Amazon wishlist

=head1 SYNOPSIS

  use WWW::Amazon::Wishlist qw(get_list COM UK);

  my @wishlist;

  @wishlist = get_list($my_amazon_com_id);       # gets it from amazon.com
  @wishlist = get_list($my_amazon_com_id,  COM); # same, explicitly
  @wishlist = get_list($my_amazon_couk_id, UK);  # gets it from amazon.co.uk

  # Or, if you didn't import the COM and UK constants:
  @wishlist = get_list ($my_amazon_couk_id, WWW::Amazon::Wishlist::UK);

  # The elements of @wishlist are hashrefs that contain the following elements:
  foreach my $book (@wishlist)
    {
    print $book->{title}, # the, err, title
    $book->{author},      # and the author(s) 
    $book->{asin},        # the asin number, its unique id on Amazon
    $book->{price},       # how much it will set you back
    $book->{quantity},    # how many you said you want
    $book->{priority},    # how urgently you said you want it (1-5)
    $book->{type};        # Hardcover/Paperback/CD/DVD etc (not available in the US)
    } # foreach

=head1 DESCRIPTION

Goes to amazon.(com|co.uk), scrapes your wishlist, and returns it
in a array of hashrefs so that you can fiddle with it to your heart's
content.

=head1 GETTING YOUR AMAZON ID

The best way to do this is to search for your own wishlist in the search
tools.

Searching for mine (simon@twoshortplanks.com) on amazon.com takes me to
the URL something like

   http://www.amazon.com/exec/obidos/wishlist/2EAJG83WS7YZM/...

there's some more cruft after that last string of numbers and letters
but it's the

   2EAJG83WS7YZM

bit that's important.

Doing the same for amazon.co.uk is just as easy.

Apparently, some people have had problems getting to their wishlist right
after it gets set up.  You may have to wait a while for it to become
browseable.

=head1 SHOWING YOUR APPRECIATION

There was a thread on london.pm mailing list about working in a vacuum -
that it was a bit depressing to keep writing modules but never get any
feedback.  So, if you use and like this module then please send me an
email and make my day.

All it takes is a few little bytes.


=head1 BUGS

B<IMPORTANT>

C<WWW::Amazon::Wishlist> is a screen scraper and is there for
is vulnerable to any changes that Amazon make to their HTML.

If it starts returning no items then this is very likely the reason
and I will get around to fixing it as soon as possible.

You might want to look at the C<Net::Amazon> module instead.

It doesn't cope with anything apart from the UK and USA versions of Amazon.

I don't think it likes unavailable items - trying to work around this
breaks UK compatability.

The code has accumulated lots of cruft.

Lack of testing.  It works for the pages I've tried it for but that's
no guarantee.

=head1 LICENSE

Copyright (c) 2003 Simon Wistow

Distributed under the same terms as Perl itself.

This software is under no warranty and will probably destroy your wish
list, kill your friends, burn your house and bring about the apocalypse

=head1 AUTHOR

Simon Wistow <simon@thegestalt.org>
Currently maintained by Martin Thurn <mthurn@cpan.org>

=head1 SEE ALSO

L<perl>, L<LWP::UserAgent>, L<amazonwish>

=cut

my $USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

sub get_list
  {
  # Required arg = wishlist ID:
  my $id = shift || croak "No ID given to get_list() function\n";
  # Optional arg = whether we're accessing the UK site.  Default is "no":
  my $uk = shift || 0;
  # Optional arg = turn on debugging:
  my $test = shift || DEBUG;
  # Note to self ... should we UC the id? Nahhhh. Not yet.
  # fairly self explanatory
  my $domain = ($uk) ? "co.uk" : "com";
  # set up some variables
  my $iPage = 1;
  my @items;
  # and awaaaaaaaaaaaaay we go ....
 INFINITE:
  while (1)
    {
    my $url = $uk ? "https://www.amazon.co.uk/gp/registry/wishlist/ref=cm_wl_search_1?page=$iPage&cid=$id" :
    "http://www.amazon.com/gp/registry/wishlist/$id/?page=$iPage";
    # This is a typical complete .com URL as of 2008-12:
    # http://www.amazon.com/gp/registry/wishlist/2O4B95NPM1W3L
    DEBUG_HTML && warn " DDD fetching wishlist for $id, page $iPage...\n";
    # Don't overwhelm the server:
    sleep(3) if (1 < $iPage);
    my $content = _fetch_page($url, $domain);
    if (DEBUG_HTML == 88)
      {
      warn $content;
      exit 88;
      } # if
    # As of 2009-08, Amazon returns HTML with MISSING BRACKETS:
    $content =~ s/(<tbody\s[^>\r\n]+)(\s+<)/$1>\n$2/g;
    # There seems to be a bug in HTML::TreeBuilder that causes
    # abutting tags to be skpped!?!
    $content =~ s!><!> <!g;
    if (9 < $test)
      {
      eval "use File::Slurp";
      write_file(qq'PAGES/fetched-$domain.html', $content);
      exit 88;
      } # if
    my $iLen = length($content);
    # warn " DDD fetched $iLen bytes.\n";

    # UPDATED 2014-11.  Both USA and UK sites use the same page
    # format, therefore we always pass COM to the _extract() method:
    my $result = _extract(COM, $content, $test);
    # print Dumper($result);
    # exit 88;
    if (! defined $result)
      {
      DEBUG && warn " WWW _extract() returned nothing\n";
      last INFINITE;
      } # if
    if (! ref $result->{items})
      {
      # Probably an empty wish list
      DEBUG && warn " WWW _extract() returned no items\n";
      last INFINITE;
      } # if
 ITEM:
    foreach my $item (@{$result->{items}})
      {
      $item->{'author'} =~ s!\n!!g;
      $item->{'author'} =~ s!^\s*by\s+!!g;
      $item->{'author'} =~ s!</span></b><br />\n*!!s;
      $item->{'quantity'} = $1 if ($item->{'priority'} =~ m!Desired:\s*</b>\s*(\d+)!i);
      $item->{'priority'} = $1 if ($item->{'priority'} =~ m!Priority:\s*</b>\s*(\d)!i);
      if (
          $uk
          &&
          $item->{image}
          &&
          ($item->{image} !~ m!^http:!)
         )
        {
        $item->{image} = q"http://images-eu.amazon.com/images/P/". $item->{image};
        } # if
      push @items, $item;
      } # foreach ITEM
    my $sURLNext = $result->{next};
    my $iNext = 0;
    if (! defined $sURLNext)
      {
      # DEBUG && warn " DDD content===$content===\n";
      # exit 88;
      # Use brute force to find it:
      if ($content =~ m!([;&]page=\d+)">\s*(<[^>]+>)?Next!)
        {
        DEBUG && warn " DDD found next URL with brute force\n";
        $sURLNext = $1;
        $iNext = $2;
        } # if
      } # if
    # Paranoia:
    if (! defined $sURLNext)
      {
      DEBUG && warn " WWW did not find next url\n";
      last INFINITE;
      } # if
    if ($sURLNext !~ m/[;&]page=(\d+)/)
      {
      DEBUG && warn " WWW next url =$sURLNext= does not contain page#\n";
      last INFINITE;
      } # if
    $iNext = $1;
    # More paranoia:
    if ($iNext <= $iPage)
      {
      DEBUG && warn " WWW next url page=$iNext is not greater than current page=$iPage\n";
      last INFINITE;
      } # if
    # ...and update:
    $iPage = $iNext;
    } # while INFINITE
  return @items;
  } # get_list


sub _fetch_page
  {
    my ($url, $domain) = @_;
    if (0)
      {
      eval "use File::Slurp";
      # For debugging UK site:
      return read_file('Pages/uk-2008-12-page1.html');
      # For debugging USA site:
      return read_file('Pages/2008-12.html');
      } # if 0
    # Setting up the UA here is slower but makes the code easier to read
    # really, the slow bit will not be setting up the UA each time

    # set up the UA
    my $ua = new LWP::UserAgent( keep_alive => 1, timeout => 30, agent => $USER_AGENT, );
    # setting it in the 'new' seems not to work sometimes
    $ua->agent($USER_AGENT);
    # for some reason this makes stuff work
    $ua->max_redirect( 0 );
    # make a full set of headers
    my $h = new HTTP::Headers(
                              'Host'            => "www.amazon.$domain",
                              'Referer'         => $url,
                              'User-Agent'      => $USER_AGENT,
                              'Accept'          => 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,video/x-mng,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1',
                              'Accept-Language' => 'en-us,en;q=0.5',
                              'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                              #'Accept-Encoding' => 'gzip,deflate',
                              'Keep-Alive'      =>  '300',
                              'Connection'      =>  'keep-alive',
                             );
    $h->referer("$url");
    my $request  =  HTTP::Request->new ( 'GET', $url, $h );
    my $response;
    my $times = 0;
    # LWP should be able to do this but seemingly fails sometimes
    while ($times++<3)
      {
      $response =  $ua->request($request);
      last if $response->is_success;
      if ($response->is_redirect)
        {
        $url = $response->header("Location");
        #$h->header("Referer", $url); 
        $h->referer("$url");
        $request  =  HTTP::Request->new ( 'GET', $url, $h );
        } # if
      } # while
    if (!$response->is_success)
      {
      croak "Failed to retrieve $url";
      return undef;
      } # if
    my $s = $response->content;
    # Clean the CRAP off the page:
    $s =~ s!<script>.+?</script>!!gs;
    return $s;
    } # _fetch_page

# This is the HTML parsing version written by Martin Thurn:

sub _extract
  {
  # Required arg1 = whether we are parsing the UK site or not (Boolean):
  my $iUK = shift || 0;
  # Required arg2 = the HTML contents of the webpage:
  my $s = shift || '';
  # Optional arg = debugging level:
  my $iDebug = shift || 0;
  DEBUG_HTML && warn " DDD start _extract()\n";
  my $rh = {};
  my $oTree = new HTML::TreeBuilder;
  $oTree->parse($s);
  $oTree->eof;
  my @aoSPAN = $iUK ? $oTree->look_down(_tag => 'div',
                                        class => 'a-text-left a-fixed-left-grid-col a-col-right',
                                        # class => 'lineItemGroup',
                                       )
                    : $oTree->look_down(_tag => 'div',
                                        class => 'a-text-left a-fixed-left-grid-col a-col-right',
                                       );
 SPAN_TAG:
  foreach my $oSPAN (@aoSPAN)
    {
    next SPAN_TAG unless ref $oSPAN;
    DEBUG_HTML && warn " DDD found toplevel item tagset\n";
    if (9 < DEBUG_HTML)
      {
      my $s = $oSPAN->as_HTML;
      warn " DDD ==$s==\n";
      } # if
    my $sASIN = q{};
    my $sName = q{};
    my $sTitle = q{};
    my @aoA = $oSPAN->look_down(_tag => 'a');
    DEBUG_HTML && warn sprintf(" DDD   contains %d <A> tags\n", scalar(@aoA));
 A_TAG:
    foreach my $oA (@aoA)
      {
      next A_TAG if ! ref $oA;
      my $sA = $oA->as_HTML;
      DEBUG_HTML && warn " DDD   try A\n";
      if (9 < DEBUG_HTML)
        {
        warn " DDD ==$sA==\n";
        } # if
      $sTitle = $oA->attr('title') || $oA->as_text;
      # Strip leading whitespace:
      $sTitle =~ s!\A\s+!!;
      # Strip trailing whitespace:
      $sTitle =~ s!\s+\Z!!;
      # Ignore empty (image-only) tags:
      next A_TAG if ($sTitle !~ m/\S/);
      # Strip out zero-width spaces scattered about randomly in item titles
      $sTitle =~ s/\x{200b}//g;
      DEBUG_HTML && warn " DDD found item named '$sTitle'\n";
      next A_TAG if ($sTitle eq 'Universal Wish List Button');
      next A_TAG if ($sTitle eq 'Buying this gift elsewhere?');
      my $sURL = $oA->attr('href');
      DEBUG_HTML && warn " DDD   URL ==$sURL==\n";
      if (
          ($sURL =~ m!/detail(?:/offer-listing)?/-/(.+?)/ref!)
          ||
          ($sURL =~ m!/gp/product/(.+?)/ref!)
          ||
          ($sURL =~ m!/dp/(.+?)/ref!)
         )
        {
        # It's a match!
        $sASIN = $1;
        last A_TAG;
        } # if
      else
        {
        DEBUG_HTML && warn " EEE   url does not contain asin\n";
        }
      } # foreach A_TAG
    DEBUG_HTML && warn " DDD   ASIN ==$sASIN==\n";
    if ($sASIN eq q{})
      {
      next SPAN_TAG;
      } # if
    # Grab the smallest-containing ancestor of this item:
    my $oParent = $iUK
                ? $oSPAN->look_up(_tag => 'tbody',
                                  class => 'itemWrapper',
                                 )
                : $oSPAN;
    if (! ref $oParent)
      {
      DEBUG_HTML && warn " WWW did not find ancestor TBODY\n";
      next SPAN_TAG;
      } # if
    my $sParentHTML = $oParent->as_HTML;
    DEBUG_HTML && warn " DDD   parent HTML ==$sParentHTML==\n";
    my $sParent = $oParent->as_text;
    # Manual text clean-up:
    $sParent =~ s/(DESIRED|RECEIVED|PRIORITY)/;  $1: /g;
    DEBUG_HTML && warn " DDD   parent text ==$sParent==\n";
    my $iDesired = _match_desired($sParent);
    DEBUG_HTML && warn " DDD     desired set to =$iDesired=\n";
    my $sPriority = _match_priority($sParent);
    DEBUG_HTML && warn " DDD     priority set to =$sPriority=\n";
    my @aoTDtiny = $oParent->look_down(_tag => 'td',
                                       class => 'tiny',
                                      );
 QUANT_TAG:
    foreach my $oSPAN (@aoTDtiny)
      {
      next QUANT_TAG unless ref $oSPAN;
      my $sSpan = $oSPAN->as_text;
      DEBUG_HTML && warn " DDD   TDtiny=$sSpan=\n";
      $sPriority ||= _match_priority($sSpan);
      DEBUG_HTML && warn " DDD     priority set to =$sPriority=\n";
      $iDesired ||= _match_desired($sSpan);
      DEBUG_HTML && warn " DDD     desired set to =$iDesired=\n";
      } # foreach QUANT_TAG
    if (! $iDesired || ! $sPriority)
      {
      # See if they are encoded in a FORM:
      # Find the priority:
      if ($sParentHTML =~ m!<option selected="yes" value=([-0-9]+)>!)
        {
        $sPriority = $1;
        DEBUG_HTML && warn " DDD     priority set to =$sPriority=\n";
        } # if
      else
        {
        DEBUG_HTML && warn " WWW   did not find <option> for priority\n";
        }
      # Find the quantity desired:
      if ($sParentHTML =~ m!<input class="tiny" name="requestedQty.+?" size=\d+ type="text" value=(\d+)>!)
        {
        $iDesired = $1;
        DEBUG_HTML && warn " DDD     desired set to =$iDesired=\n";
        } # if
      else
        {
        DEBUG_HTML && warn " WWW   did not find <input> for desired-quantity\n";
        }
      } # if
    # Put in default values if we never found them:
    $sPriority ||= 'medium';
    DEBUG_HTML && warn " DDD     priority set to =$sPriority=\n";
    $iDesired ||= 1;
    # Find the date added:
    my $sDate = '';
    if ($sParentHTML =~ m!>added\s+(.+?)<!)
      {
      $sDate = $1;
      DEBUG_HTML && warn " DDD   date=$sDate=\n";
      } # if
    else
      {
      DEBUG_HTML && warn " WWW   did not find text for date-added\n";
      }

    # Find the "author" of this item:
    my @aoTDauthor;
    if ($iUK)
      {
      @aoTDauthor = $oParent->look_down(_tag => 'td',
                                        class => 'small',
                                       );
      }
    else
      {
      @aoTDauthor = $oParent->look_down(_tag => 'span',
                                        sub
                                          {
                                          my $sHtml = $_[0]->as_HTML;
                                          # DEBUG_HTML && warn " DDD   try oTDauthor span==$sHtml==\n";
                                          my $s = $_[0]->attr('class') || q{};
                                          $s =~ m'BYLINE'i;
                                          },
                                       );
      } # else
    my $sAuthor = '';
 AUTHOR_TAG:
    foreach my $oTD (@aoTDauthor)
      {
      next AUTHOR_TAG unless ref $oTD;
      my $s = $oTD->as_HTML;
      DEBUG_HTML && warn " DDD   try oTDauthor==$s==\n";
      $s = $oTD->as_text;
      if ($s =~ s!\A\s*(by|~)\s+!!)
        {
        $sAuthor = $s;
        last AUTHOR_TAG;
        } # if
      } # foreach AUTHOR_TAG
    DEBUG_HTML && warn " DDD   author=$sAuthor=\n";
    # Find the price of this item:
    my $sPrice = '';
    my $oTDprice = $oParent->look_down(_tag => 'span',
                                       sub
                                         {
                                         my $s = $_[0]->attr('class') || q{};
                                         $s =~ m'PRICE'i;
                                         },
                                      );
    if (! ref $oTDprice)
      {
      DEBUG_HTML && warn " WWW did not find TD for price\n";
      # warn $oParent->as_HTML;
      # exit 88;
      # next SPAN_TAG;
      } # if
    else
      {
      $sPrice = $oTDprice->as_text;
      if ($sPrice =~ m!Price:\s+(.+)\Z!)
        {
        $sPrice = $1;
        } # if
      $sPrice =~ s!\A\s+!!;
      $sPrice =~ s!\s+\Z!!;
      DEBUG_HTML && warn " DDD   price=$sPrice=\n";
      } # else
    # Add this item to the result set:
    my %hsItem = (
                  asin => $sASIN,
                  author => $sAuthor,
                  # image => $sImageURL,
                  price => $sPrice,
                  priority => $sPriority,
                  quantity => $iDesired,
                  title => $sTitle,
                  # type => $sType,
                 );
    DEBUG_HTML && warn Dumper(\%hsItem);
    push @{$rh->{items}}, \%hsItem;
    # All done with this item:
    $oParent->detach;
    $oParent->delete;
    } # foreach SPAN_TAG
  # Look for the next-page link:
  my $oA = $oTree->look_down(_tag => 'a',
                             sub
                               {
                               my $s = $_[0]->as_text || q{};
                               DEBUG_HTML && warn " DDD   try next <A> ==$s==\n";
                               $s =~ m/\A\s*NEXT/i;
                               },
                            );
  if (ref $oA)
    {
    $rh->{next} = $oA->attr('href');
    DEBUG_HTML && warn " DDD raw next URL is ==$rh->{next}==\n";
    } # if
  else
    {
    DEBUG_HTML && warn " DDD did not find next URL\n";
    }
  return $rh;
  } # _extract

sub _match_priority
    {
    my $s = shift || return;
    if ($s =~ m'.+PRIORITY:?\s*(\w+?)(\s|\z)'i)
      {
      return lc $1;
      } # if
    return;
    } # _match_priority

sub _match_desired
      {
      my $s = shift || return;
      if ($s =~ m'(?:DESIRED|WANTS):?\s*(\d+)'i)
        {
        return lc $1;
        } # if
      return;
      } # _match_desired

1;


__END__