#!/usr/bin/perl
use Data::Printer;
use Parallel::Iterator qw( iterate );
use Env qw/PWD/;
use JSON::XS;
use HTML::Strip;
use Digest::MD5 qw(md5_hex);
use AI::MicroStructure::WordBlacklist;
use AI::MicroStructure::Util;
use JSON::XS;
use WWW::Wikipedia;
use Storable::CouchDB;
use LWP::UserAgent;
# ---------------------------------------------------------------------------
# Script-level setup: output encoding, command-line handling, shared state.
# ---------------------------------------------------------------------------

# Emit UTF-8 on both standard streams.
binmode( STDOUT, ':utf8' );
binmode( STDERR, ':utf8' );

# The first argument (the page to crawl) is mandatory.
$ARGV[0] or die("require a argument");

# Optional second argument; the driver loop compares it against 1 to decide
# whether related pages are filtered by the search term.
use constant STRICT => $ARGV[1] || 0;

#$config->{buffer} = "";

# Configuration comes from AI::MicroStructure::Util; {cfg} supplies the
# couchdb / db / wikipedia / out settings read further down.
my $state   = AI::MicroStructure::Util::config();
my @CWD     = ( $state->{cwd} );
our $config = $state->{cfg};

#exit(0) unless (!checkIsThere($ARGV[0]));

# Package globals shared between call() and the driver loop.
our $n        = 0;     # running total of $#links, accumulated in call()
our $doc      = {};
our $linkdata = {};
our $doneit   = {};    # crawled pages kept by the driver loop
our ( @links, $result, $done );

# File-scoped lexicals; @test is appended to by call().
my $url = $ARGV[0];
my ( @inx, @test );
my ( $search, $TOP ) = ( "", "" );

# CouchDB handle built from the loaded configuration.
our $x = Storable::CouchDB->new(
    uri => $config->{couchdb},    #default
    db  => $config->{db},
);

my $carry = { count => 0, max => 0 };
# Build a one-shot iterator over the given list.
#
# Each call to the returned closure yields the pair ( index, value ) for the
# next element. Once the list is exhausted the closure yields an empty list,
# which ends a `while ( my ( $i, $v ) = $it->() )` loop.
sub list_iter {
    my @items  = @_;
    my $cursor = 0;

    return sub {
        return unless $cursor < @items;

        # Capture the pair before advancing so the index matches the value.
        my @pair = ( $cursor, $items[$cursor] );
        $cursor += 1;
        return @pair;
    };
}
# Report whether a document with the given key already exists in CouchDB.
#
# Queries <couchdb>/<db>/_all_docs?key="<key>"&limit=1, taking the server and
# database names from the global $config.
#
# Returns 1 when the response contains at least one row, 0 otherwise. A
# failed request or an unparsable response body is reported on STDERR and
# treated as "not there" instead of aborting the whole crawl.
sub checkIsThere {
    my $key = shift;
    return 0 unless defined $key && length $key;

    require LWP::UserAgent;
    require URI::Escape;

    my ( $server, $db ) = ( $config->{couchdb}, $config->{db} );

    # Percent-encode the key so reserved characters (&, #, +, =, ...) cannot
    # truncate or alter the query string. Character strings need the UTF-8
    # aware variant; plain uri_escape rejects code points above 255.
    my $escaped =
        utf8::is_utf8($key)
        ? URI::Escape::uri_escape_utf8($key)
        : URI::Escape::uri_escape($key);

    my $ua  = LWP::UserAgent->new;
    my $res = $ua->get(
        sprintf( '%s/%s/_all_docs?key="%s"&limit=1', $server, $db, $escaped )
    );

    unless ( $res->is_success ) {
        warn sprintf( "checkIsThere: lookup of '%s' failed: %s\n",
            $key, $res->status_line );
        return 0;
    }

    # decode_json croaks on malformed input, so trap it.
    my $r = eval { JSON::XS::decode_json( $res->content ) };
    if ( $@ || ref $r ne 'HASH' ) {
        warn "checkIsThere: cannot parse CouchDB response for '$key'\n";
        return 0;
    }

    my $rows = $r->{rows};
    return ( ref $rows eq 'ARRAY' && @$rows ) ? 1 : 0;
}
# Decode a URL-encoded string and strip any HTML comments from the result.
#
# '+' becomes a space, every %XX escape becomes the corresponding character,
# and each <!-- ... --> comment (which may span several lines) is removed.
# Returns the decoded copy; the argument itself is not modified.
sub URLDecode {
    my $theURL = $_[0];
    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;

    # Non-greedy with /s: remove each comment on its own. A greedy match
    # would also swallow the real text between the first and last comment.
    $theURL =~ s/<!--.*?-->//gs;
    return $theURL;
}
# Percent-encode a string: every character that is not a word character
# (\W) is replaced by '%' followed by its code point in upper-case hex,
# zero-padded to at least two digits. Returns the encoded copy.
sub URLEncode {
    my ($text) = @_;
    $text =~ s/(\W)/sprintf( "%%%02X", ord $1 )/eg;
    return $text;
}
# Percent-decode a string and, when the resulting bytes are valid UTF-8,
# decode them into characters.
#
# Returns the character string when UTF-8 decoding succeeds; otherwise
# returns the percent-decoded string unchanged (e.g. Latin-1 input).
sub smartdecode {
    use URI::Escape qw( uri_unescape );
    use utf8;
    my $raw     = uri_unescape( $_[0] );
    my $decoded = $raw;
    return $decoded if utf8::decode($decoded);

    # Not valid UTF-8: hand back the unescaped string as-is rather than
    # dereferencing it as a hash, which only ever produced undef.
    return $raw;
}
# Fetch the page at $url and collect the image links found on it.
#
# On success returns a reference to an array holding the IMG links that
# HTML::SimpleLinkExtor extracted, resolved against the response base.
# Returns nothing (undef in scalar context, an empty list in list context)
# when no URL is given or the request fails.
#
# Progress is written to STDOUT: "." for each fetched page, "E" for each
# failed request.
sub imgTranslate {
    my ($url) = @_;
    return unless $url;

    # Load on demand so this sub does not depend on some other code path
    # having pulled these modules in first.
    require HTTP::Request::Common;
    require LWP::UserAgent;
    require HTML::SimpleLinkExtor;

    my $request  = HTTP::Request::Common::GET($url);
    my $ua       = LWP::UserAgent->new;
    my $response = $ua->request($request);

    unless ( $response->is_success ) {
        print "E";

        #print STDERR $response->status_line, "\n";
        return;
    }

    print ".";
    my $e = HTML::SimpleLinkExtor->new( $response->base );
    $e->parse( $response->decoded_content );

    # Call img in list context and wrap the list. Inside @{ ... } the call
    # is evaluated in scalar context, where the extractor reports a count
    # instead of the links themselves.
    return [ $e->img ];
}
# Crawl one Wikipedia page and build a document hash describing it.
#
# Arguments:
#   $idx - position of the page in the work list (0 for the seed page)
#   $url - page name; it is appended, ucfirst'ed, to $config->{wikipedia}
#
# The page is fetched twice: once with LWP to extract its links, and once
# through WWW::Wikipedia to obtain the article text and related titles.
#
# Returns the document hash reference when WWW::Wikipedia returns an entry
# with text. Returns an empty list when the page is skipped by the guard on
# the first line. When no entry/text is found the sub falls off the end
# without an explicit return.
#
# Side effects: overwrites the file-level @links, adds to the file-level $n
# and appends to the file-level @test.
sub call {
my ($idx,$url) = @_;
# Skip pages already present in CouchDB, except for the seed page (index 0)
# or when $config->{out} matches /json/.
return () unless (!checkIsThere($url) || $idx==0 || $config->{out}=~/json/);
#
# return unless($url=~/Planet/);
my $ua = LWP::UserAgent->new;
my $content ;
my $response ="";
# @book = ();
my @book = ();
$response = $ua->get(sprintf("%s%s",$config->{wikipedia}, ucfirst($url)));
# if ($response->base =~ /m{$TOP}/ ) {
# These lexicals shadow the file-level $doc and $linkdata for the rest of
# the sub.
my $doc={};
my $linkdata={};
require HTML::SimpleLinkExtor;
no warnings 'utf8';
# NOTE(review): the response is parsed without checking is_success.
my $e = HTML::SimpleLinkExtor->new($response->base);
$e->parse($response->decoded_content);
@links = $e->links;
# $#links is the last index, i.e. one less than the number of links.
$doc->{linknr}=$#links;
# @links = keys %{$linkdata->{base}};
# Classify the extracted links by file extension.
$linkdata->{pdf}=[grep{/^http.*.[\.](pdf|PDF)$/}@links];
$linkdata->{audio}=[grep{/^http.*.[\.](mp3|wave|ogg|OGG|WAVE|MP3)$/}@links];
#@links = grep{/^http:\/\/$config->{wikipedia}\/wiki/}@links;
$linkdata->{image}=[grep{/^http.*.[\.](JPG|GIF|PNG|svg|jpg|png|gif)$/}@links];
# NOTE(review): $doc->{audio} has not been filled yet at this point, so these
# two statements iterate over an empty list; the key is overwritten from
# $linkdata->{audio} further down.
@{$doc->{audio}} = map{$_=shift @{(imgTranslate($_) || [])} } @{$doc->{audio}};
@{$doc->{audio}} = grep{ defined }@{$doc->{audio}};
#
$n += $#links;
# warn $n,"\t", $#links."\n";
my $wiki = WWW::Wikipedia->new();
my $hs = HTML::Strip->new();
# This lexical $result shadows the file-level one.
my $result = $wiki->search(ucfirst $url);
if (defined($result) && $result->text() ) {
my $clean_text = $hs->parse($result->text() );
$hs->eof;
# print "\n"x10,$clean_text;
$doc->{url}=$url;
# Tags: related titles that carry a parenthesised qualifier, lower-cased,
# with ")" removed and spaces turned into "_", then split on "_(" into
# [ name, qualifier ] pairs.
$doc->{tags}=[map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} $result->related()];
$doc->{instances}= {};
$doc->{members}={};
my $ltxt = join("\n",@links);
$ltxt =~ s/http:\/\/en.wikipedia.org\/wiki\///g; # FIXME replace with $config->{wikipedia}
# The same tag extraction is applied to the page links once the hard-coded
# wiki prefix has been stripped.
push @{$doc->{tags}},map{$_=lc($_); $_=~s/\)//g; $_=~s/ /_/g; $_=[split("_\\(",$_)] }grep {/[(].+?[)]/} split("\n",$ltxt);
# Map each tag name to its qualifier and record the name in @test.
foreach(@{$doc->{tags}}){
if($_->[0] && $_->[1]){
$doc->{instances}->{lc $_->[0]}=lc $_->[1];
push @test,lc $_->[0];
}
# push @test,$_->[1];
}
# NOTE(review): @instances is not read again in this sub.
my @instances = [grep{!/\W/}values %{$doc->{instances}}];
# members: the names; instances: the qualifiers. Entries containing a
# non-word character are dropped from both. members must be computed first
# because the next line replaces the hash it reads.
$doc->{members}=[grep{!/\W/}keys %{$doc->{instances}}];
$doc->{instances}=[grep{!/\W/}values %{$doc->{instances}}];
# push @{$doc->{instances}},lc($url);
$doc->{article}=$clean_text;
$doc->{links}=[grep{!/:Wikipedia/}@links];
$doc->{audio}=$linkdata->{audio};
# FIXME replace with $config->{wikipedia}
# Bucket the links by Wikipedia namespace prefix.
$doc->{book}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Book:/}@{$doc->{links}}];
$doc->{portal}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Portal:/}@{$doc->{links}}];
$doc->{cat}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Category:/}@{$doc->{links}}];
$doc->{list}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/List_of/}@{$doc->{links}}];
$doc->{tmpl}=[grep{/^http:\/\/en.wikipedia.org\/wiki\/Template:/}@{$doc->{links}}];
$doc->{pdf}=$linkdata->{pdf};
$doc->{recorded}=time;
# NOTE(review): $doc->{image} is never assigned in this sub ($linkdata->{image}
# is not copied over), so these two statements leave it as an empty list.
# Confirm whether the image links were meant to be translated here.
@{$doc->{image}} = map{$_=shift @{(imgTranslate($_) || [])} } @{$doc->{image}};
@{$doc->{image}} = grep{ defined }@{$doc->{image}};
#
# Copy selected fields straight out of the entry's hash; the fields matching
# /src|full/ (src, fulltext) are passed through HTML::Strip first.
$doc->{$_} = $_=~/src|full/? smartdecode($hs->parse($result->{$_})) : smartdecode($result->{$_}) for qw(src
fulltext
cursor
related
categories
headings
currentlang);
# Normalise related titles: spaces to "_", first letter upper-cased, and
# titles containing "#" dropped.
$doc->{related} = [grep{!/#/}map{my $a = $_; $a =~ s/ /_/g; $_=ucfirst $a;}@{$doc->{related}}];
# NOTE(review): this measures the stringified reference used as a format
# string, not the document content - confirm the intended meaning of size.
$doc->{size} = length sprintf($doc);
# $doneit->{$url} = $doc;
# $x->store("$url" ,$doc) ;
# p $json;
# return $url;
return $doc;
}
}
# Driver loop: crawl the page named on the command line, then crawl every
# page it lists as related and summarise what was found.
#
# The body ends in exit(0), so only the first argument is ever crawled; the
# second argument is consumed at compile time as the STRICT flag.
foreach my $urlx (@ARGV) {
    $result->{$urlx} = call( 0, $urlx );

    # Per-page results keyed by iterator index, plus occurrence counts for
    # every related title, PDF, image and audio link under {abs}.
    our $out = { abs => { pdf => {}, related => {}, image => {}, ogg => {} } };

    # Which {abs} bucket each document field is tallied into.
    my %bucket_for = (
        related => 'related',
        pdf     => 'pdf',
        image   => 'image',
        audio   => 'ogg',
    );

    my $seed    = $result->{$urlx};
    my $related = ref $seed eq 'HASH' ? $seed->{related} : undef;

    if ( ref $related eq 'ARRAY' && @$related ) {

        # In STRICT mode only related titles containing the search term are
        # followed. \Q...\E matches the term literally, so a title such as
        # "C++" cannot break or change the pattern.
        my @wanted =
            STRICT == 1
            ? ( grep { /\Q$ARGV[0]\E/i } @$related )
            : @$related;
        my %unique = map { $_ => 1 } @wanted;

        my $url_iter  = list_iter( sort keys %unique );
        my $page_iter = iterate( \&call, $url_iter );

        # The result iterator can be walked only once, so the filtered copy
        # ($doneit) and the tallies ($out) are filled in the same pass.
        while ( my ( $index, $value ) = $page_iter->() ) {
            my $page     = ref $value eq 'HASH' ? $value : {};
            my $page_url = defined $page->{url} ? $page->{url} : '';

            $doneit->{$index} = $value if $page_url =~ /\Q$ARGV[0]\E/;
            warn "doing " . $page_url;
            ##printf("\n%s", $index);

            $out->{$index} = $value;
            for my $field ( keys %bucket_for ) {
                my $items = $page->{$field};
                next unless ref $items eq 'ARRAY';

                # Plain increment: counts start at 1 and grow with every
                # further occurrence.
                $out->{abs}->{ $bucket_for{$field} }->{$_}++ for @$items;
            }
        }

        our $json =
            JSON::XS->new->pretty(0)->allow_blessed(1)->convert_blessed(1)
            ->encode( { return => $doneit } );

        #p $doneit;
    }

    # p $out;
    exit(0);
}
# Compile-time hook. Currently a no-op: its only content is a commented-out
# shell snippet that saved IFS and set it to a newline.
BEGIN{
#`IFS_BAK=\$IFS;
# IFS=\$'\n';`;
}
# Exit-time hook. Currently a no-op: the commented-out body restored IFS and,
# when $config->{out} matched /json/, appended the query and the collected
# results as pretty-printed JSON to the file named by $config->{jsonout}.
END{
#`IFS=\$IFS_BAK;`;
#if($config->{out}=~/json/) {
# my $json = JSON::XS->new->pretty(1)->encode({ "query" => [@ARGV],
# "responce" =>[$conf->{abs} ? $out->{abs}:$out]});
# open (MYFILE, '>>'.$config->{jsonout});
# print MYFILE $json;
# close (MYFILE);
#}
#exit(0);
}
1;