#!/usr/bin/perl -X 
use utf8;
use File::Spec;

binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

use HTML::Strip;
use Data::Dumper;
use Digest::MD5 qw(md5_hex);
use AI::MicroStructure::WordBlacklist;
use JSON::XS;
use WWW::Wikipedia;
use Storable::CouchDB;
require LWP::UserAgent;
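
# Crawl one or more Wikipedia topics given on the command line (default:
# "Space"): fetch each article, extract and classify its links and media,
# strip the text, add a WordNet overview, and store the result as a document
# in a local CouchDB database.  Related articles are crawled one level deep.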

$ARGV[0] = "Space" unless($ARGV[0]);

#exit(0) unless (!checkIsThere($ARGV[0]));


our $doc ={};
our @links;
our $linkdata = {};
our $result;
my $url = $ARGV[0];
our $done;
our $doneit={};
my @inx;
my @test;
my $search="";
my $TOP="";

    # English stop-word list, used to filter common words out of the article text.
    my $s = AI::MicroStructure::WordBlacklist::getStopWords('en');
    our @s = keys %$s;
    #print join('|',@s);





    # Handle to the CouchDB database where the crawled articles are stored.
    our $x = Storable::CouchDB->new(
        uri => "http://user:password\@localhost:5984/",   #default
        db  => "table",
    );


    my $carry = {count=>0,max=>0};





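# Return true when a document with this key already exists in CouchDB,
# so articles that were crawled before can be skipped.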
sub checkIsThere {
    my $key = shift;

    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new;
    my ($server, $db) = (sprintf("http://%s:5984", "localhost"), "table");

    my $res  = $ua->get(sprintf('%s/%s/_all_docs?key="%s"', $server, $db, $key));
    my $r    = JSON::XS::decode_json($res->content);
    my @rows = @{ $r->{rows} };

    return @rows ? 1 : 0;
}

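# Minimal percent-decoding/encoding helpers; URLDecode also strips any HTML
# comments left in the decoded string.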
sub URLDecode {
    my $theURL = $_[0];
    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;
    $theURL =~ s/<!--(.|\n)*-->//g;
    return $theURL;
}

sub URLEncode {
    my $theURL = $_[0];
    $theURL =~ s/(\W)/"%" . uc(sprintf("%2.2x", ord($1)))/eg;
    return $theURL;
}

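# Percent-decode a string and, when the result is valid UTF-8, return it as a
# decoded character string; otherwise return the raw bytes unchanged.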
sub smartdecode {
    use URI::Escape qw( uri_unescape );
    use utf8;
    my $x = my $y = uri_unescape($_[0]);
    return $x if utf8::decode($x);
    return $y;
}

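# Ask a local helper endpoint (concept2.php, assumed here to resolve a wiki
# media link into direct file URLs) about the given URL.  Returns the "result"
# array reference from its JSON response, or nothing on failure.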
sub imgTranslate {
    require HTTP::Request::Common;
    my ($url) = @_;

    return unless $url;

    my $request = HTTP::Request::Common::GET(
        "http://localhost/tiny/concept2.php?img=" . $url);
    $request->content_type('application/x-www-form-urlencoded');

    my $ua       = LWP::UserAgent->new;
    my $response = $ua->request($request);

    if ($response->is_success) {
        my $data = JSON::XS::decode_json($response->decoded_content);
        return $data->{result};
    }

    #print STDERR $response->status_line, "\n";
    return;
}




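# Fetch one Wikipedia article: extract and classify its links (images, audio,
# PDFs, books, portals, categories, lists, templates), strip the page text,
# attach a WordNet overview, store the document in CouchDB keyed by the
# article title, and return it.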
sub call {

  my $url = shift;

  # Skip articles that have already been stored.
  return () if checkIsThere($url);

  warn "doing $url";

  my $ua = LWP::UserAgent->new;



  my $content;
  my $response = "";
  my @book     = ();

  # Fetch the rendered article page; its links are extracted below.
  $response = $ua->get(sprintf("http://en.wikipedia.org/wiki/%s", ucfirst($url)));





#      if ($response->base =~ /m{$TOP}/ ) {
          my $doc      = {};
          my $linkdata = {};
          require HTML::SimpleLinkExtor;
          no warnings 'utf8';

          # Collect every link on the page, dropping navigation and meta pages.
          my $e = HTML::SimpleLinkExtor->new($response->base);
          $e->parse($response->decoded_content);
          @links = grep { /$TOP|\.pdf|\.ogg|\.mp3|\.mpg|\.avi/ }
                   grep { !/Disambig|Help:|Wikipedia:|Special:|:Contents|:Featured_content|Main_Pag|_talk:|Talk:|#|[Aa]rticle[s|_]|All_*_*/ }
                   $e->links;

          # De-duplicate the links.
          $linkdata->{base}->{$_} = 1 for @links;
          @links = keys %{ $linkdata->{base} };

          # Classify document and media links, then restrict to article links.
          $linkdata->{pdf}   = [ grep { /^http.*\.(pdf)$/i }          @links ];
          $linkdata->{audio} = [ grep { /^http.*\.(mp3|wave|ogg)$/i } @links ];

          @links = grep { m{^http://en\.wikipedia\.org/wiki} } @links;
          $linkdata->{image} = [ grep { /^http.*\.(jpg|gif|png|svg)$/i } @links ];



          # Resolve audio links through the local translator service; entries
          # that cannot be resolved are dropped.
          @{ $linkdata->{audio} } = grep { defined }
                                    map  { my $r = imgTranslate($_); $r ? shift @$r : undef }
                                    @{ $linkdata->{audio} };

          warn scalar(@links) . " links\n";



my $wiki = WWW::Wikipedia->new();
my $hs = HTML::Strip->new();

      my $result = $wiki->search(ucfirst $url);
      if (defined($result) && $result->text() ) {

      my $clean_text = $hs->parse($result->text() );
      $hs->eof;

  #            print "\n"x10,$clean_text;

               $doc={};
               $doc->{linknr}=$#links;
               $doc->{url}=$url;
               # Tags are derived from related titles of the form "Name (type)".
               $doc->{tags}=[map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} $result->related()];
               $doc->{instances}= {};
               $doc->{members}={};

               my $ltxt = join("\n",@links);
                  $ltxt =~ s/http:\/\/en.wikipedia.org\/wiki\///g;

               push @{$doc->{tags}},map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} split("\n",$ltxt);
               foreach(@{$doc->{tags}}){
                if($_->[0] && $_->[1]){
                $doc->{instances}->{lc $_->[0]}=lc $_->[1];
                push @test,lc $_->[0];
                }
  #              push @test,$_->[1];


               }

                 # Split the collected instances into member names (keys) and
                 # their types (values), keeping only single-word entries.
                 $doc->{members}=[grep{!/\W/}keys %{$doc->{instances}}];
                 $doc->{instances}=[grep{!/\W/}values %{$doc->{instances}}];
#                 push @{$doc->{instances}},lc($url);

                 $doc->{article}=$clean_text;
                 $doc->{links}=[grep{!/:Wikipedia/}@links];
                 $doc->{image}=$linkdata->{image};
                 $doc->{audio}=$linkdata->{audio};
                 $doc->{book}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Book:/}@{$doc->{links}}];
                 $doc->{portal}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Portal:/}@{$doc->{links}}];
                 $doc->{cat}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Category:/}@{$doc->{links}}];
                 $doc->{list}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/List_of/}@{$doc->{links}}];
                 $doc->{tmpl}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Template:/}@{$doc->{links}}];
                 $doc->{pdf}=$linkdata->{pdf};
                 $doc->{recorded}=time;




        # Resolve image links through the local translator service; entries
        # that cannot be resolved are dropped.
        @{$doc->{image}} = grep { defined }
                           map  { my $r = imgTranslate($_); $r ? shift @$r : undef } @{$doc->{image}};


        #


        # Copy selected WWW::Wikipedia::Entry fields into the document.
        # Array-valued fields (related, categories, headings) are kept as
        # references; string fields are HTML-stripped and/or URI-decoded.
        for my $field (qw(src fulltext cursor related categories headings currentlang)) {
            my $v = $result->{$field};
            $doc->{$field} = ref $v               ? $v
                           : $field =~ /src|full/ ? smartdecode($hs->parse($v))
                           :                        smartdecode($v);
        }

                $doc->{related} = [grep{!/#/}map{my $a = $_; $a =~ s/ /_/g; ucfirst $a}@{$doc->{related} || []}];
                # Approximate size of the stored document (bytes of its JSON encoding).
                $doc->{size} = length JSON::XS::encode_json($doc);

                 # Word list of the full text with stop words removed
                 # (currently informational only, not stored in the document).
                 my %stop = map { $_ => 1 } @s;
                 my $tmp  = $doc->{fulltext};
                    $tmp =~ s/( |'|,|")/\n/g;
                 my @words = grep { length && !$stop{lc $_} } split("\n", $tmp);

      # WordNet overview for the topic (requires the WordNet `wn` command).
      $doc->{wn} = `wn \Q$url\E -over`;


   #   warn "doing $doc->{wn}";



    #       my @L  = [sort {$a cmp $b}grep{/(wiki|book|Category|List_of_)/i} @links];

          # Persist the document in CouchDB, keyed by the article title.
          $x->store("$url", $doc);

 #       print Dumper $doc;

        return $doc;

  }

  # Nothing stored for this topic (page missing or empty).
  return;
}

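# Crawl every topic named on the command line, then follow each topic's
# related articles one level deep.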
foreach my $urlx (@ARGV) {
    next unless $urlx;

    $result->{$urlx} = call($urlx);

    if (defined($result->{$urlx}) && @{ $result->{$urlx}->{related} || [] }) {
        foreach (@{ $result->{$urlx}->{related} }) {
            $result->{$_} = call($_);
        }
    }
}


# Backtick commands each run in their own shell, so IFS has to be passed via
# the environment to reach child processes such as the `wn` call above.
BEGIN {
    $ENV{IFS_BAK} = $ENV{IFS} if defined $ENV{IFS};
    $ENV{IFS}     = "\n";
}

END {
    $ENV{IFS} = $ENV{IFS_BAK} if defined $ENV{IFS_BAK};
}



1;