The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# $Id: Images.pm,v 1.19 2005/05/23 16:24:27 rousse Exp $
package WWW::Google::Images;

=head1 NAME

WWW::Google::Images - Google Images Agent

=head1 VERSION

Version 0.6.2

=head1 DESCRIPTION

This module may be used search images on Google. Its interface is
heavily inspired from L<WWW::Google::Groups>.

=head1 SYNOPSIS

    use WWW::Google::Images;

    $agent = WWW::Google::Images->new(
	server => 'images.google.com',
	proxy  => 'my.proxy.server:port',
    );

    $result = $agent->search('flowers', limit => 10);

    while ($image = $result->next()) {
	$count++;
	print $image->content_url();
	print $image->context_url();
	print $image->save_content(base => 'image' . $count);
	print $image->save_context(base => 'page' . $count);
    }

=cut

use WWW::Mechanize;
use WWW::Google::Images::SearchResult;
use HTML::Parser;
use strict;
our $VERSION = '0.6.2';

=head1 Constructor

=head2 new(I<%args>)

Creates and returns a new C<WWW::Google::Images> object.

Optional parameters:

=over

=item server => I<$server>

use I<$server> as server.

=item proxy => I<$proxy>:I<$port>

use I<$proxy> as proxy on port I<$port>.

=back

=cut

sub new {
    my ($class, $query, %arg) = @_;

    foreach my $key (qw(server proxy)){
	next unless $arg{$key};
	$arg{$key} = 'http://'.$arg{$key} if $arg{$key} !~ m,^\w+?://,o;
    }

    my $a = WWW::Mechanize->new(onwarn => undef, onerror => undef);
    $a->proxy(['http'], $arg{proxy}) if $arg{proxy};

    my $self = bless {
	_server => ($arg{server} || 'http://images.google.com/'),
	_proxy  => $arg{proxy},
	_agent  => $a,
    }, $class;

    return $self;
}

=head2 $agent->search(I<$query>, I<%args>);

Perform a search for I<$query>, and return a C<WWW::Google::Images::SearchResult> object.

Optional parameters:

=over

=item limit => I<$limit>

limit the maximum number of result returned to $limit.

=item min_width => I<$width>

limit the minimum width of result returned to $width pixels.

=item min_height => I<$height>

limit the minimum width of result returned to $height pixels.

=item min_size => I<$size>

limit the minimum size of result returned to $size ko.

=item max_width => I<$width>

limit the maximum width of result returned to $width pixels.

=item max_height => I<$height>

limit the maximum width of result returned to $height pixels.

=item max_size => I<$size>

limit the maximum size of result returned to $size ko.

=item regex => I<$regex>

limit the result returned to those whose filename matches case-sensitive
$regex regular expression.

=item iregex => I<$regex>

limit the result returned to those whose filename matches case-insensitive
$regex regular expression.

=back

=cut

sub search {
    my ($self, $query, %arg) = @_;

    warn "No query given, aborting" and return unless $query;

    $arg{limit} = 10 unless defined $arg{limit};

    $self->{_agent}->get($self->{_server});

    $self->{_agent}->submit_form(
	 form_number => 1,
	 fields      => {
	     q => $query
	 }
    );

    my @images;
    my $page = 1;

    LOOP: {
	do {
	    push(@images, $self->_extract_images(($arg{limit} ? $arg{limit} - @images : 0), %arg));
	    last if $arg{limit} && @images == $arg{limit};
	} while ($self->_next_page(++$page));
    }

    return WWW::Google::Images::SearchResult->new($self->{_agent}, @images);
}

sub _next_page {
    my ($self, $page) = @_;

    return $self->{_agent}->follow_link(text => $page)
}

sub _extract_images {
    my ($self, $limit, %arg) = @_;

    my @images;
    my @data;

    my @links = $self->{_agent}->find_all_links( url_regex => qr/imgurl/ );

    if (
	$arg{min_size}   ||
	$arg{max_size}   ||
	$arg{min_width}  || 
	$arg{max_width}  ||
	$arg{min_height} ||
	$arg{max_height}
    ) {
	my $parser = HTML::Parser->new();
	my $callback = sub {
	    my ($text) = @_;
	    if ($text =~ /^(\d+) x (\d+) pixels - (\d+)k$/) {
		push(@data, { width => $1, height => $2, size => $3 });
	    }
	};
	$parser->handler(text => $callback, 'text');
	$parser->parse($self->{_agent}->content());
    }

    for my $i (0 .. $#links) {
	next if $arg{min_size} && $data[$i]->{size} < $arg{min_size};
	next if $arg{max_size} && $data[$i]->{size} > $arg{max_size};
	next if $arg{min_width} && $data[$i]->{width} < $arg{min_width};
	next if $arg{max_width} && $data[$i]->{width} > $arg{max_width};
	next if $arg{min_height} && $data[$i]->{height} < $arg{min_height};
	next if $arg{max_height} && $data[$i]->{height} > $arg{max_height};
	$links[$i]->url() =~ /imgurl=([^&]+)&imgrefurl=([^&]+)/;
	my $content = $1;
	my $context = $2;
	next if $arg{regex} && $content !~ /$arg{regex}/;
	next if $arg{iregex} && $content !~ /$arg{iregex}/i;
	push(@images, { content => $content, context => $context});
	last if $limit && @images == $limit;
    }

    return @images;
}

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004-2005, INRIA.

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 AUTHOR

Guillaume Rousse <grousse@cpan.org>

=cut

1;