The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl
use Data::Printer;
use Parallel::Iterator qw( iterate );
use Env qw/PWD/;
use JSON::XS;
use HTML::Strip;
use Digest::MD5 qw(md5_hex);
use AI::MicroStructure::WordBlacklist;
use AI::MicroStructure::Util;
use JSON::XS;
use WWW::Wikipedia;
use Storable::CouchDB;
use LWP::UserAgent;




# Write UTF-8 on both standard streams.
for my $stream (\*STDOUT, \*STDERR) {
    binmode $stream, ':utf8';
}

# The first command-line argument must be present and true.
$ARGV[0] or die("require a argument");

# Load the shared configuration and start with an empty buffer entry.
my $state  = AI::MicroStructure::Util::load_config();
my @CWD    = ($state->{cwd});
my $config = $state->{cfg};
$config->{buffer} = "";


#exit(0) unless (!checkIsThere($ARGV[0]));


# File-level state; several of these names are read or written by the subs
# defined further down, so they keep their original names.
our $doc      = {};
our @links;
our $linkdata = {};
our $result;
my  $url      = $ARGV[0];
our $done;
our $doneit   = {};
my  @inx;
my  @test;
my  $search   = "";
my  $TOP      = "";

# Storage handle built from the couchdb / db entries of the configuration.
our $x = Storable::CouchDB->new(
    uri => $config->{couchdb},    # default
    db  => $config->{db},
);

my $carry = { count => 0, max => 0 };



  # list_iter(@values)
  #
  # Build an iterator over a private copy of @values. Every call to the
  # returned code reference yields the next ( index, value ) pair; once the
  # copy is exhausted the iterator returns nothing.
  sub list_iter {
      my @items  = @_;
      my $cursor = 0;

      return sub {
          return unless $cursor < @items;

          # The pair is ( position, element ); it is returned as an array.
          my @pair = ( $cursor, $items[$cursor] );
          $cursor += 1;
          return @pair;
      };
  }


  # checkIsThere($key)
  #
  # Ask the CouchDB server configured in $config->{couchdb} whether the
  # database $config->{db} has an _all_docs row for $key.
  #
  # Parameters:
  #   $key - document key to look up (an article title in this script).
  #
  # Returns 1 when the query reports at least one row and 0 otherwise. A
  # failed request or a reply that is not a JSON object is reported with a
  # warning and treated as "not there" (0) instead of aborting the script.
  sub checkIsThere {
      my ($key) = @_;

      require LWP::UserAgent;
      require URI::Escape;

      my ($server, $db) = ($config->{couchdb}, $config->{db});

      # The "key" query parameter is a JSON string: escape embedded quotes
      # and backslashes so the literal stays well-formed.
      my $quoted = defined $key ? $key : '';
      $quoted =~ s/(["\\])/\\$1/g;
      my $json_key = qq{"$quoted"};

      # Percent-escape the whole literal so spaces, '&', '#', '%' and
      # similar characters in a title cannot truncate or corrupt the query
      # string. Character strings are escaped as UTF-8, byte strings as-is.
      my $escaped_key = utf8::is_utf8($json_key)
          ? URI::Escape::uri_escape_utf8($json_key)
          : URI::Escape::uri_escape($json_key);

      my $res = LWP::UserAgent->new->get(
          sprintf('%s/%s/_all_docs?key=%s&limit=1', $server, $db, $escaped_key)
      );

      unless ($res->is_success) {
          warn sprintf("checkIsThere: lookup of '%s' failed: %s\n",
                       $quoted, $res->status_line);
          return 0;
      }

      # decode_json dies on malformed input; trap that instead of letting a
      # bad reply terminate the caller.
      my $reply = eval { JSON::XS::decode_json($res->content) };
      unless (ref $reply eq 'HASH') {
          warn sprintf("checkIsThere: unreadable reply for '%s'\n", $quoted);
          return 0;
      }

      my $rows = $reply->{rows};
      return (ref $rows eq 'ARRAY' && @{$rows}) ? 1 : 0;
  }

# URLDecode($text)
#
# Decode a URL-encoded string: "+" becomes a space, each %XX escape becomes
# the character with that hexadecimal code, and HTML comments are removed.
#
# Returns the decoded copy; the caller's argument is not modified.
sub URLDecode {
    my ($decoded) = @_;

    $decoded =~ tr/+/ /;
    $decoded =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;

    # Non-greedy match with /s: each comment is removed on its own (newlines
    # included). A greedy match would also delete the text lying between the
    # first "<!--" and the last "-->".
    $decoded =~ s/<!--.*?-->//gs;

    return $decoded;
}

# URLEncode($text)
#
# Return a copy of $text in which every character matching \W is replaced by
# "%" followed by its character code in upper-case hexadecimal.
#
# NOTE(review): the code is printed with a minimum of two hex digits, so a
# character above 0xFF produces a longer escape - confirm callers only pass
# byte-range input.
sub URLEncode {
    my ($encoded) = @_;

    # Formats one character as its percent escape.
    my $escape = sub {
        return "%" . uc(sprintf("%2.2x", ord($_[0])));
    };

    $encoded =~ s/([\W])/$escape->($1)/eg;
    return $encoded;
}

# smartdecode($text)
#
# Percent-decode $text and, when the decoded bytes form valid UTF-8, return
# them decoded to characters. Otherwise return the percent-decoded value
# unchanged.
sub smartdecode {
  use URI::Escape qw( uri_unescape );
  use utf8;

  my $raw     = uri_unescape($_[0]);
  my $decoded = $raw;

  # utf8::decode works in place and reports whether the decode succeeded.
  return utf8::decode($decoded) ? $decoded : $raw;
}

# imgTranslate($url)
#
# Fetch $url and collect the IMG links found in the returned HTML.
# Prints "." to STDOUT after a successful fetch and "E" after a failed one.
#
# Parameters:
#   $url - address of the page to fetch; a false value skips the request.
#
# Returns an array reference holding the IMG links on success, and nothing
# (undef in scalar context) when $url is false or the request fails.
sub imgTranslate {
    my ($url) = @_;
    return unless $url;

    # Load what this sub uses itself instead of relying on another code path
    # having loaded it first.
    require LWP::UserAgent;
    require HTML::SimpleLinkExtor;

    my $response = LWP::UserAgent->new->get($url);

    unless ($response->is_success) {
        print "E";
        #print STDERR $response->status_line, "\n";
        return;
    }

    print ".";

    my $extor = HTML::SimpleLinkExtor->new($response->base);
    $extor->parse($response->decoded_content);

    # img() hands back a list, so gather it with an anonymous array. Writing
    # \@{ $extor->img } would call img() in scalar context and dereference
    # that scalar instead of the link list.
    return [ $extor->img ];
}



# call($idx, $url)
#
# Fetch the article named $url, derive link and tag metadata from it, and
# store the resulting document through the file-level $x handle under the
# key $url.
#
# Parameters:
#   $idx - position of $url in the work list; index 0 is always processed,
#          even when checkIsThere($url) reports an existing document.
#   $url - article title; it is appended (after ucfirst) to
#          $config->{wikipedia} for the page fetch and is also passed to
#          WWW::Wikipedia->search.
#
# Returns the document hash reference when the search yields text, and an
# empty list when the article is skipped. When the search yields no text
# the sub falls off the end without an explicit return.
#
# Side effects: overwrites the file-level @links, pushes onto the file-level
# @test, emits warnings, and writes to the store.
sub call  {

  my ($idx,$url) = @_;

  # Skip titles that are already stored, unless this is the first item or
  # $config->{out} matches /json/.
  return () unless (!checkIsThere($url) || $idx==0 || $config->{out}=~/json/);

  warn "doing $url";
#

#  return unless($url=~/Planet/);

  my $ua = LWP::UserAgent->new;



  # NOTE(review): $content and @book are declared but not used again in
  # this sub.
  my $content ;
  my $response ="";
 # @book = ();

  my @book = ();
    # Fetch the article page itself.
    $response  = $ua->get(sprintf("%s%s",$config->{wikipedia}, ucfirst($url)));





#      if ($response->base =~ /m{$TOP}/ ) {
          # These lexicals shadow the file-level "our" $doc and $linkdata.
          my $doc={};
          my $linkdata={};

          require HTML::SimpleLinkExtor;
          no warnings 'utf8';
          my $e = HTML::SimpleLinkExtor->new($response->base);
          $e->parse($response->decoded_content);
          # Assigns to the file-level @links (there is no "my" here). The
          # inner grep drops links matching the listed namespace/help/talk
          # patterns; the outer grep keeps links matching $TOP or one of the
          # media-extension patterns.
          # NOTE(review): $TOP is initialised to "" at file level and is not
          # assigned anywhere else in the visible code; an empty alternative
          # matches every string, so the outer grep may filter nothing -
          # confirm.
          @links = grep {/$TOP|.pdf|.ogg|.mp3|.mpg|.avi/}grep{!/Disambig|Help:|Wikipedia:|Special:|:Contents|:Featured_content|Main_Pag|_talk:|Talk:|#|[Aa]rticle[s|_]|All_*_*/}$e->links;

          # $#links is the last index of @links, i.e. the link count minus one.
          $doc->{linknr}=$#links;

#          @links = keys %{$linkdata->{base}};
          # Bucket the links by file extension.
          $linkdata->{pdf}=[grep{/^http.*.[\.](pdf|PDF)$/}@links];

          $linkdata->{audio}=[grep{/^http.*.[\.](mp3|wave|ogg|OGG|WAVE|MP3)$/}@links];

          #@links = grep{/^http:\/\/$config->{wikipedia}\/wiki/}@links;
          $linkdata->{image}=[grep{/^http.*.[\.](JPG|GIF|PNG|svg|jpg|png|gif)$/}@links];


       # NOTE(review): $doc was created empty above and nothing has set
       # $doc->{audio} yet, so this map/grep pair appears to run over an
       # empty list; $doc->{audio} is assigned from $linkdata->{audio}
       # further down - confirm intent.
       @{$doc->{audio}} = map{$_=shift @{(imgTranslate($_)  || [])} } @{$doc->{audio}};

        @{$doc->{audio}} = grep{ defined }@{$doc->{audio}};




          #

            warn $#links."\n";



# Look the title up through WWW::Wikipedia; HTML::Strip is used to strip
# markup from the returned text. This lexical $result shadows the file-level
# "our $result".
my $wiki = WWW::Wikipedia->new();
my $hs = HTML::Strip->new();

      my $result = $wiki->search(ucfirst $url);
      if (defined($result) && $result->text() ) {

      my $clean_text = $hs->parse($result->text() );
      $hs->eof;

  #            print "\n"x10,$clean_text;


               $doc->{url}=$url;
               # Tags come from related() entries that contain a
               # parenthesised part: each is lower-cased, has ")" removed and
               # spaces turned into "_", then is split on "_(" into an array
               # reference.
               $doc->{tags}=[map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} $result->related()];
               $doc->{instances}= {};
               $doc->{members}={};

               # The same tag extraction is applied to the page links, after
               # removing the hard-coded en.wikipedia.org prefix.
               my $ltxt = join("\n",@links);
                  $ltxt =~ s/http:\/\/en.wikipedia.org\/wiki\///g; # FIXME replace with $config->{wikipedia}

               push @{$doc->{tags}},map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} split("\n",$ltxt);
               # For every tag with two true parts, record
               # instances{first} = second and push the first part onto the
               # file-level @test.
               foreach(@{$doc->{tags}}){
                if($_->[0] && $_->[1]){
                $doc->{instances}->{lc $_->[0]}=lc $_->[1];
                push @test,lc $_->[0];
                }
  #              push @test,$_->[1];


               }

               # NOTE(review): @instances receives a single array reference
               # and is not read again in this sub.
               my @instances = [grep{!/\W/}values %{$doc->{instances}}];

                 # members = keys, instances = values of the mapping built
                 # above, each list restricted to entries without \W
                 # characters.
                 $doc->{members}=[grep{!/\W/}keys %{$doc->{instances}}];
                 $doc->{instances}=[grep{!/\W/}values %{$doc->{instances}}];
#                 push @{$doc->{instances}},lc($url);

                 $doc->{article}=$clean_text;
                 $doc->{links}=[grep{!/:Wikipedia/}@links];
                 $doc->{audio}=$linkdata->{audio};
                 # FIXME replace with $config->{wikipedia}
                 # Classify the links by their en.wikipedia.org URL prefix.
                 $doc->{book}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Book:/}@{$doc->{links}}];
                 $doc->{portal}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Portal:/}@{$doc->{links}}];
                 $doc->{cat}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Category:/}@{$doc->{links}}];
                 $doc->{list}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/List_of/}@{$doc->{links}}];
                 $doc->{tmpl}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Template:/}@{$doc->{links}}];
                 $doc->{pdf}=$linkdata->{pdf};
                 $doc->{recorded}=time;





        # NOTE(review): $doc->{image} has not been assigned before this
        # point; $linkdata->{image} is computed above but is not copied into
        # $doc - confirm whether the image links were meant to be used here.
        @{$doc->{image}} = map{$_=shift @{(imgTranslate($_) || [])} } @{$doc->{image}};
        @{$doc->{image}} = grep{ defined }@{$doc->{image}};


        #


        # Copy the listed fields from the search result through smartdecode;
        # the keys matching /src|full/ (src, fulltext) go through
        # HTML::Strip first.
        # NOTE(review): some of these fields are later treated as lists
        # (e.g. related below), while smartdecode handles a single value -
        # verify what ends up stored for them.
        $doc->{$_} = $_=~/src|full/? smartdecode($hs->parse($result->{$_})) : smartdecode($result->{$_}) for qw(src
                                              fulltext
                                              cursor
                                              related
                                              categories
                                              headings
                                              currentlang);

                # Normalise related titles: spaces become "_", the first
                # letter is upper-cased, and entries containing "#" are
                # dropped.
                $doc->{related} = [grep{!/#/}map{my $a = $_; $a =~ s/ /_/g; $_=ucfirst $a;}@{$doc->{related}}];
                # NOTE(review): sprintf($doc) uses the hash reference itself
                # as the format, so this measures its stringified form, not
                # the document content - confirm.
                $doc->{size} = length sprintf($doc);


       # Persist the document under the article title.
       $x->store("$url" ,$doc) ;

  #      return $url;
        return $doc;
  }


}

# Process every title given on the command line: fetch it with index 0, then
# run call() over its related titles through Parallel::Iterator, printing
# each index that comes back.
foreach my $urlx (@ARGV) {
  #if($urlx) {
  $result->{$urlx} = call(0,$urlx);
  #}

  my $entry = $result->{$urlx};

  # Nothing more to do unless the fetch produced a document with related titles.
  next unless defined($entry) && @{ $entry->{related} };

  my $related_iter = list_iter( @{ $entry->{related} } );
  my $page_iter    = iterate( \&call, $related_iter );

  while ( my ( $index, $value ) = $page_iter->() ) {
    printf("\n%s", $index);
  }

  # Disabled aggregation of the per-page results, kept for reference:
  #
  # our $out = {abs=>{pdf=>{},related=>{},image=>{},ogg=>{}}};
  #
  # while ( my ( $index, $value ) = $page_iter->() ) {
  #     $out->{$index} = $value;
  #
  #     foreach(@{$value->{related}}) {
  #         $out->{abs}->{related}->{$_} = defined($out->{abs}->{related}->{$_})? $out->{abs}->{related}->{$_}++ : 1;
  #     }
  #
  #     foreach(@{$value->{pdf}}) {
  #         $out->{abs}->{pdf}->{$_} = defined($out->{abs}->{pdf}->{$_})? $out->{abs}->{pdf}->{$_}++ : 1;
  #     }
  #     foreach(@{$value->{image}}) {
  #         $out->{abs}->{image}->{$_} = defined($out->{abs}->{image}->{$_})? $out->{abs}->{image}->{$_}++ : 1;
  #     }
  #     foreach(@{$value->{audio}}) {
  #         $out->{abs}->{ogg}->{$_} = defined($out->{abs}->{ogg}->{$_})? $out->{abs}->{ogg}->{$_}++ : 1;
  #     }
  # }
  # p $out;
  # exit(0);
  #
  #}
}


# BEGIN block with no executable statements: the only content is a
# commented-out backtick command that would have saved and reset the shell
# IFS variable.
BEGIN{
    #`IFS_BAK=\$IFS;
     #  IFS=\$'\n';`;


}
# END block with no executable statements. Everything inside is commented
# out: a backtick command restoring the shell IFS variable, and code that
# would append a pretty-printed JSON dump of the query and results to the
# file named by $config->{jsonout} when $config->{out} matches /json/.
END{
    #`IFS=\$IFS_BAK;`;

     #if($config->{out}=~/json/) {


   # my $json = JSON::XS->new->pretty(1)->encode({ "query" => [@ARGV],
    #                                              "responce" =>[$conf->{abs} ? $out->{abs}:$out]});


#      open (MYFILE, '>>'.$config->{jsonout});
 #       print MYFILE $json;
  #      close (MYFILE);


    #}

     #exit(0);

}



# Final true value of the file.
1;