lib/WWW/Arbeitsagentur/Search.pm

package WWW::Arbeitsagentur::Search;

=head1 NAME

WWW::Arbeitsagentur::Search - Search for Jobs & Applicants via Arbeitsagentur.de

=head1 SYNOPSIS

 package MySearch;
 use WWW::Arbeitsagentur::Search;

 my $search = MySearch->new('job'  => 'Perl Job',
			    'user' => 'username@arbeitsagentur'
			    'pw'   => 'mypassword@arbeitsagentur',
			    'mech' => WWW::Mechanize->new(),
			   );
 $search->search_my_way;

 sub search_my_way{
     my $self = shift;
     $self->connect();
     # do other stuff
     $self->collect_result_pages;
     return $self->count_results;
 }

=head1 DESCRIPTION

This module is the base class for all search classes. It inherits from WWW::Arbeitsagentur and provides methods to collect search results and save them to disk.

=head1 METHODS

=head2 $search->save_results()

Saves the result pages in the directory determined by $search->path.
Returns the number of errors(!).

 Usage:
	# After successful search:
	$search->collect_result_pages();
	$search->path( 'download/' );	# where to save the files.
	$search->save_results();

=head2 $search->save_page( $page_id )

Save a page from the result list to $search->path.

Parameter: index number of the page to be saved from array $self->results.

Return 0 on failure, 1 on success.

 Usage:
	# primitive filtering:
	my $page = $search->result(5);       
	if ($page =~ m/Perl Coder/){
		$search->save_page(5);
	}

=head2 $search->collect_result_pages()

If our search was successful, this method collects all jobs / applicants found on the result pages.
Data is stored in the array $search->results.

Returns the number of pages found.

=head2 $search->select_job()

Selects a job description in a search form on http://www.arbeitsargentur.de.

 Usage:
	# After navigating the mech to the search form:
	$search->beruf('Perl Coder');
	$search->select_job();

Returns 1 if a matching job-description was found.
Returns 0 on failure.

=head1 SEE ALSO

http://arbeitssuche.sourceforge.net

=head1 AUTHOR

Ingo Wiarda dewarim@users.sourceforce.net

=head1 ACKNOWLEDGMENTS

This module is based upon http://arbeitssuche.sourceforge.net,
written by Ingo Wiarda and Stefan Rother.

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Ingo Wiarda

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=cut

our $VERSION = "0.0.1";

@ISA = qw/WWW::Arbeitsagentur/;

use Digest::SHA qw(sha256_hex);
use WWW::Arbeitsagentur::Arbeitnehmer;
use WWW::Arbeitsagentur::Applicant;
use WWW::Arbeitsagentur::Search::FastSearchForWork;
use WWW::Arbeitsagentur;

use strict;
use warnings;

sub save_results{
    my ($self) = @_;
    my $errors = 0;
    
    for ( my $x = 0; $x < $self->results_count() ;$x++){
	my $result = $self->save_page( $x );

	if ($result == 0){
	    $errors++;
	    warn("save_page failed. $!\n") unless $result;
	}
    }
    return $errors;
}

sub collect_result_pages{
    my ($self) = @_;
    my $mech = $self->mech();
    $mech->dump_content();

    my @results = $mech->find_all_links('url_regex'
					=> qr!anzeigeDetails\?redirect_id=\d+!);

    my $result_url = 0;
    my $pages_found = 0;

    $result_url = $results[0]->url() if $results[0];
    return 0 unless $result_url;
    
    while($result_url){

	$mech->get($result_url);
	
	$mech->dump_content('job.html');

	if (! $mech->success()){
	    warn( "Could not fetch ". $result_url.".\n" );
	    return $pages_found;
	    # Todo: better error handling - fetch next, retry etc.?
	}

	$self->results_push( $mech->content );
	$pages_found++;

	# find the URL of the next applicant / job:
	@results = $mech->find_all_links("text_regex" => qr/(?:chstes Stellenangebot|chster Bewerber)/);
	$result_url = @results ? $results[0]->url() : 0;
	warn "next URL: ".$result_url."\n";
	$mech->dump_content();
	
	last if $self->_module_test;
    }
    return $pages_found;
}

sub select_job {
    my ($self) = @_;
    my $mech = $self->mech();
    my ($berufsbezeichnung) = $self->beruf();

    # The method should be more talkative why it returns 0...
    return 0 unless $berufsbezeichnung;
    return 0 unless $self->job_typ();

    $mech->dump_content();
    warn "Sende Formular Berufswahl.\n";
    $mech->submit_form('form_number' => 3,
		       'fields' =>{
			   'art' => $self->job_typ(),
			   'berufsbezeichnung' => $berufsbezeichnung,
		       },
		       'button'	=> 'cmd#starteBerufswahl##true'
		       );
    $mech->dump_content('job_type_selected.html');

    if ($mech->content() =~ m/Ergebnis der Suche nach Berufen/){
	# es gibt mehrere Berufe dieses Namens -> nehmen wir den, der am ehesten passt:
	warn "Job-Title $berufsbezeichnung is not correct. Will try anaway.\n";

	# The following should be refactored into a "Test job_list"-Type Funktion.
	#if ( $self->module_test == 0){
	#    open(JOBS, ">>oddjobs.txt");
	#    print JOBS $berufsbezeichnung."\n";
	#    close(JOBS);
	#}

	eval{ $mech->follow_link('text_regex' => qr/$berufsbezeichnung/i)};
	if ( $@ ) {
	    warn "Could find nothing that matches $berufsbezeichnung. $@\n";
	    $mech->dump_content('job_not_found.html');
	    return 0;
	}
	else{
	    $mech->dump_content('job_found.html');
	    return 1;
	}
    }
    
    # correct job was found:
    if ($mech->content() =~ m/name="beruf" value="[.\d]+"/){
	return 1;
    }
    else{
	# we found nothing:
	return 0;
    }
}

sub save_page{

    my ($self, $page_id) = @_;

    my $data = $self->results_index( $page_id );

    # Do not save zero-length files:
    if ( length($data) == 0 ){
      warn("Page is 0 Bytes long - may be a download problem.\n");
      return 0;
    }
    
    my ($ref) = WWW::Arbeitsagentur::extract_refnumber(\$data);
    if ($ref){
	warn "save_page: found reference number: $ref.\n";
    }
    else{
	warn "Could not find a reference number. Will use hash-value as filename.\n";
	$ref = sha256_hex($data)."_sha";
    }

    my $path = $self->path();
    # CSS-Links anpassen:
    $data =~ s!<link type="text/css" href="/vam/css/!<link type="text/css" href="!;

    # Seite anpassen:
    # Damit bei Stellenangeboten die gewünschten Fähigkeiten nicht
    # in einem großen Block stehen, werden Zeilenumbrüche eingefügt.
    $data =~ s/\);/\);<br \/>/g;
    $data =~ s/<body.*?Seiteninhalt<\/h2/<body><h2>Seiteninhalt<\/h2/sm;

    open(HTML, ">$path$ref.html") or die("Could not open file for writing!\n$!\n");
    print HTML $data;
    close(HTML) or die("Could not close file\n$!\n");
    return 1;
}

1;

__END__
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)