#!/usr/bin/perl -X
use utf8;
use File::Spec;
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';
use HTML::Strip;
use Data::Dumper;
use Digest::MD5 qw(md5_hex);
use AI::MicroStructure::WordBlacklist;
use JSON::XS;
use WWW::Wikipedia;
use Storable::CouchDB;
require LWP::UserAgent;
# ---------------------------------------------------------------------------
# Script-wide state.
# ---------------------------------------------------------------------------

# Fall back to the topic "Space" when no (true) first argument was given.
# An up-front exit for already-stored topics is currently disabled; call()
# performs that check for every topic it is asked to fetch.
$ARGV[0] ||= "Space";

# Package variables (visible to every sub in this file via `our`).
our ($doc, $linkdata, $doneit) = ({}, {}, {});
our (@links, $result, $done);

# File-scoped lexicals.
my $url = $ARGV[0];
my (@inx, @test);
my ($search, $TOP) = ("", "");

# English stop words; only the keys of the returned hash reference are kept.
my $s = AI::MicroStructure::WordBlacklist::getStopWords('en');
our @s = keys %{$s};

# Handle on the CouchDB-backed document store.
our $x = Storable::CouchDB->new(
    uri => "http://user:password\@localhost:5984/",    # default
    db  => "table",
);

my $carry = { count => 0, max => 0 };
# checkIsThere($key)
#
# Ask the local CouchDB (database "table" on localhost:5984) whether a
# document with id $key exists, using the `_all_docs?key=...` view.
#
# Returns 1 when at least one row comes back, 0 otherwise (including the
# case where the reply is valid JSON without a `rows` array).
# Dies with a descriptive message when the reply is not a JSON object,
# e.g. because the server could not be reached.
sub checkIsThere {
    my $key = shift;
    require LWP::UserAgent;
    require URI::Escape;

    my ($server, $db) = (sprintf("http://%s:5984", "localhost"), "table");

    # The view expects the key as a JSON string.  Quote it with a real JSON
    # encoder (so embedded quotes/backslashes are escaped) and percent-encode
    # the result so characters such as '&', '#' or spaces cannot break the
    # query string.  `ascii` keeps the JSON text 7-bit clean, which makes
    # plain uri_escape() safe for any input.
    my $chars = defined $key ? "$key" : "";
    utf8::decode($chars) unless utf8::is_utf8($chars);
    my $json_key = JSON::XS->new->allow_nonref->ascii->encode($chars);
    my $query    = URI::Escape::uri_escape($json_key);

    my $ua  = LWP::UserAgent->new;
    my $res = $ua->get(sprintf('%s/%s/_all_docs?key=%s', $server, $db, $query));

    # CouchDB answers with JSON even for errors; anything else means the
    # request never reached it, so report that instead of a parser error.
    my $r = eval { JSON::XS::decode_json($res->content) };
    die sprintf("checkIsThere: unexpected response for key '%s' from %s/%s: %s\n",
                $chars, $server, $db, $res->status_line)
        unless ref $r eq 'HASH';

    my $rows = $r->{rows};
    return (ref $rows eq 'ARRAY' && @{$rows}) ? 1 : 0;
}
# URLDecode($string)
#
# Decode a form/URL-encoded string and return the result:
#   1. every '+' becomes a space,
#   2. every %XX hex escape becomes the character with that code,
#   3. everything from the first "<!--" up to the LAST "-->" is removed
#      (the match is greedy and spans newlines).
# The argument itself is not modified.
sub URLDecode {
    my ($encoded) = @_;

    (my $decoded = $encoded) =~ tr/+/ /;
    $decoded =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

    # Applied after unescaping, so comment markers that arrived
    # percent-encoded are stripped as well.
    $decoded =~ s/<!--.*-->//gs;

    return $decoded;
}
# URLEncode($string)
#
# Percent-encode $string and return the result.  Every octet except ASCII
# letters, digits and '_' is written as %XX with upper-case hex digits
# (so '-', '.' and '~' are escaped too, as before).
#
# Character strings are converted to UTF-8 octets first, so non-ASCII
# letters are no longer passed through unencoded and characters above 0xFF
# no longer produce invalid escapes such as "%263A".  Byte strings are
# escaped octet by octet, exactly as before.  undef is returned unchanged.
sub URLEncode {
    my $theURL = $_[0];
    return $theURL unless defined $theURL;

    # Work on octets: one %XX escape always stands for exactly one byte.
    utf8::encode($theURL) if utf8::is_utf8($theURL);

    $theURL =~ s/([^A-Za-z0-9_])/sprintf("%%%02X", ord($1))/eg;
    return $theURL;
}
# smartdecode($string)
#
# Undo percent-escapes in $string, then try to interpret the resulting
# octets as UTF-8.  Returns the decoded character string when that works,
# otherwise the unescaped octets as they are.
sub smartdecode {
    use URI::Escape qw( uri_unescape );
    use utf8;

    my $octets = uri_unescape($_[0]);

    # utf8::decode() works in place, so try it on a copy and keep the
    # unescaped octets around for the fallback.
    my $characters = $octets;
    return utf8::decode($characters) ? $characters : $octets;
}
# imgTranslate($url)
#
# Send $url as the `img` query parameter to the local helper service at
# http://localhost/tiny/concept2.php and return the `result` member of its
# JSON reply as an array reference.
#
# Returns:
#   - the `result` array reference on success ([] when the reply is a JSON
#     object without an array-valued `result`),
#   - nothing (undef in scalar context) when $url is false, the request
#     fails, or the reply is not a JSON object.
sub imgTranslate {
    my ($url) = @_;
    return unless $url;

    require HTTP::Request::Common;
    require LWP::UserAgent;
    require URI::Escape;

    # Escape the value so '&', '#', '+' or '%' inside the link cannot
    # truncate or alter the parameter the service receives.
    my $img = utf8::is_utf8($url)
        ? URI::Escape::uri_escape_utf8($url)
        : URI::Escape::uri_escape($url);

    my $request = HTTP::Request::Common::GET("http://localhost/tiny/concept2.php?img=" . $img);
    $request->content_type('application/x-www-form-urlencoded');

    my $response = LWP::UserAgent->new->request($request);
    return unless $response->is_success;

    # decode_json() wants UTF-8 octets, so undo any Content-Encoding but
    # skip the charset decoding; a malformed body counts as "no result"
    # instead of killing the whole run.
    my $data = eval {
        JSON::XS::decode_json($response->decoded_content(charset => 'none'));
    };
    return unless ref $data eq 'HASH';

    my $result = $data->{result};
    return ref $result eq 'ARRAY' ? $result : [];
}
# Turn parenthesised titles such as "Mercury (planet)" or "Mercury_(planet)"
# into [ "mercury", "planet" ] pairs; titles without "(...)" are skipped.
sub _call_tag_pairs {
    my @pairs;
    for my $title (grep { /[(].+?[)]/ } @_) {
        my $tag = lc $title;
        $tag =~ s/\)//g;
        $tag =~ s/ /_/g;
        push @pairs, [ split("_\\(", $tag) ];
    }
    return @pairs;
}

# Run `wn WORD -over` without going through a shell and return its output
# ('' when the program cannot be started or prints nothing).
sub _call_wordnet_overview {
    my ($word) = @_;
    my $output = '';
    if (open(my $wn, '-|', 'wn', $word, '-over')) {
        local $/;
        $output = <$wn>;
        close $wn;
    }
    return defined $output ? $output : '';
}

# call($topic)
#
# Build and store the document for one Wikipedia topic:
#   1. return an empty list immediately if checkIsThere() reports the topic
#      as already stored;
#   2. fetch the article page and collect its links (pdf / audio / image
#      links and links to other en.wikipedia.org pages);
#   3. look the topic up through WWW::Wikipedia and, if it has text, assemble
#      the document (article text, tags, members/instances, link groups,
#      selected entry fields, the `wn` overview);
#   4. store it under the topic name via the global $x and return it.
#
# Returns the document hash reference, or nothing when the topic is already
# stored or the lookup produced no text.
#
# Side effects: overwrites the global @links, appends to @test, writes to
# the store behind $x, performs HTTP requests and runs the `wn` program.
sub call {
    my $url = shift;

    return () if checkIsThere($url);
    warn "doing $url";

    require HTML::SimpleLinkExtor;
    no warnings 'utf8';

    # Accept both schemes so pages served over https are recognised as well.
    my $wiki_page = qr{https?://en\.wikipedia\.org/wiki};

    my $ua       = LWP::UserAgent->new;
    my $response = $ua->get(sprintf("http://en.wikipedia.org/wiki/%s", ucfirst($url)));

    my $e = HTML::SimpleLinkExtor->new($response->base);
    $e->parse($response->decoded_content);

    # Drop maintenance/navigation links, then de-duplicate in first-seen
    # order.  While $TOP is the empty string the second filter matches every
    # link, so only the exclusion list has an effect.
    my %seen;
    @links = grep { !$seen{$_}++ }
             map  { "$_" }
             grep { /$TOP|.pdf|.ogg|.mp3|.mpg|.avi/ }
             grep { !/Disambig|Help:|Wikipedia:|Special:|:Contents|:Featured_content|Main_Pag|_talk:|Talk:|#|[Aa]rticle[s|_]|All_*_*/ }
             $e->links;

    # pdf/audio are taken from all links; images only from the links that
    # survive the Wikipedia filter (same order of operations as before).
    my @pdf   = grep { /^http.*.[\.](pdf|PDF)$/ } @links;
    my @audio = grep { /^http.*.[\.](mp3|wave|ogg|OGG|WAVE|MP3)$/ } @links;
    @links    = grep { /^$wiki_page/ } @links;
    my @image = grep { /^http.*.[\.](JPG|GIF|PNG|svg|jpg|png|gif)$/ } @links;

    warn $#links . "\n";

    my $wiki   = WWW::Wikipedia->new();
    my $hs     = HTML::Strip->new();
    my $result = $wiki->search(ucfirst $url);
    my $text   = defined $result ? $result->text() : undef;
    return unless $text;

    my $clean_text = $hs->parse($text);
    $hs->eof;

    my $doc = {};
    # NOTE(review): $#links is the last index, i.e. one less than the number
    # of links -- kept as is; confirm whether a count was intended.
    $doc->{linknr} = $#links;
    $doc->{url}    = $url;

    # Tags come from the entry's related titles followed by the page links.
    my @titles = map { (my $title = $_) =~ s{$wiki_page/}{}g; $title } @links;
    my @tags   = (_call_tag_pairs($result->related()), _call_tag_pairs(@titles));
    $doc->{tags} = \@tags;

    my %instance_of;
    for my $tag (@tags) {
        next unless $tag->[0] && $tag->[1];
        $instance_of{ lc $tag->[0] } = lc $tag->[1];
        push @test, lc $tag->[0];
    }
    $doc->{members}   = [ grep { !/\W/ } keys %instance_of ];
    $doc->{instances} = [ grep { !/\W/ } values %instance_of ];

    $doc->{article} = $clean_text;
    $doc->{links}   = [ grep { !/:Wikipedia/ } @links ];
    $doc->{audio}   = \@audio;
    $doc->{pdf}     = \@pdf;
    $doc->{book}    = [ grep { m{^$wiki_page/Book:} }     @{ $doc->{links} } ];
    $doc->{portal}  = [ grep { m{^$wiki_page/Portal:} }   @{ $doc->{links} } ];
    $doc->{cat}     = [ grep { m{^$wiki_page/Category:} } @{ $doc->{links} } ];
    $doc->{list}    = [ grep { m{^$wiki_page/List_of} }   @{ $doc->{links} } ];
    $doc->{tmpl}    = [ grep { m{^$wiki_page/Template:} } @{ $doc->{links} } ];
    $doc->{recorded} = time;

    # Keep the first translation of every image link; links the helper
    # service cannot translate are dropped.  (The former `shift imgTranslate(...)`
    # relied on the removed "shift on scalar" feature and no longer compiles.)
    $doc->{image} = [
        grep { defined }
        map  { my $found = imgTranslate($_); ref $found eq 'ARRAY' ? $found->[0] : undef }
        @image
    ];

    # Copy selected fields straight out of the entry object.  Array-valued
    # fields are decoded element by element; passing the reference itself to
    # smartdecode() would only yield its "ARRAY(0x...)" string form.
    for my $field (qw(src fulltext cursor related categories headings currentlang)) {
        my $value = $result->{$field};
        if ($field =~ /src|full/) {
            $value = $hs->parse($value);
            $hs->eof;    # reset the stripper between documents
        }
        $doc->{$field} = ref $value eq 'ARRAY'
            ? [ map { smartdecode($_) } @{$value} ]
            : smartdecode($value);
    }

    my @related = ref $doc->{related} eq 'ARRAY' ? @{ $doc->{related} } : ();
    $doc->{related} = [
        grep { !/#/ }
        map  { (my $title = $_) =~ s/ /_/g; ucfirst $title }
        @related
    ];

    # NOTE(review): this is the length of the reference's string form
    # ("HASH(0x...)"), not of the document's content -- value kept as is;
    # confirm what "size" is meant to measure.
    $doc->{size} = length "$doc";

    # The topic may come from the command line or from page content, so it
    # must never be interpolated into a shell command line.
    $doc->{wn} = _call_wordnet_overview($url);

    $x->store("$url", $doc);
    return $doc;
}
# Fetch every topic named on the command line and then, one level deep, every
# topic listed in its `related` field.  Results are collected in $result,
# keyed by topic name.
foreach my $urlx (@ARGV) {
    $result->{$urlx} = call($urlx) if $urlx;

    my $entry = $result->{$urlx};
    next unless ref $entry eq 'HASH' && ref $entry->{related} eq 'ARRAY';

    # Iterate over a copy with a named loop variable: call() may overwrite
    # $result->{$urlx} itself (a topic can list itself as related), which
    # would free the array while it is being walked, and the hash key must
    # not depend on $_ surviving everything call() runs.
    my @related = @{ $entry->{related} };
    foreach my $title (@related) {
        $result->{$title} = call($title);
    }
}
# The BEGIN/END blocks that used to live here ran `IFS=...` assignments in
# backticks.  Every backtick command runs in a shell of its own, so those
# assignments never reached any later command.  The one in END also
# overwrote $?, which perl uses as the exit status at that point, so the
# script reported success even after a fatal error.  Both were removed.
1;