# Copyright (C) 2004 Jörg Tiedemann <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package Uplug::Web::Corpus;
use strict;
use IO::File;
use POSIX qw(tmpnam);
use File::Copy;
use ExtUtils::Command;
use File::Basename;
use Uplug::Web;
use Uplug::Web::Config;
use Uplug::Web::Process;
use Uplug::Web::Process::Stack;
use Uplug::Web::User;
use Uplug::Config;
our $INDEXER=$ENV{UPLUGHOME}.'/web/bin/uplug-indexer.pl';
our $RECODE=$ENV{RECODE};
our $CorpusDir=$ENV{UPLUGDATA};
my $MAXFLOCKWAIT=3;
my $CorpusIndexFile=$ENV{UPLUGDATA}.'/.index';
my $CorpusIndex=Uplug::Web::Process::Stack->new($CorpusIndexFile);
sub GetIndexedCorpora{
my $data=shift;
if (ref($data) ne 'HASH'){return $CorpusIndex->read();}
my @corpora=$CorpusIndex->read();
foreach (@corpora){
my ($user,$name,$lang,$alg,$enc)=split(/\:/,$_);
$$data{$user}{$name}{$lang}{encoding}=$enc;
if ($alg){
push (@{$$data{$user}{$name}{$lang}{align}},$alg);
}
}
return keys %{$data};
}
sub IndexCorpus{
my $owner=shift;
my $corpus=shift;
my $CorpusDir=&GetCorpusDir($owner,$corpus);
my $CWBREG="$ENV{UPLUGCWB}/reg/$owner/$corpus";
my $CWBDAT="$ENV{UPLUGCWB}/dat/$owner/$corpus";
if (not -d $ENV{UPLUGCWB}){mkdir $ENV{UPLUGCWB};}
if (not -d "$ENV{UPLUGCWB}/reg"){mkdir "$ENV{UPLUGCWB}/reg";}
if (not -d "$ENV{UPLUGCWB}/dat"){mkdir "$ENV{UPLUGCWB}/dat";}
if (not -d "$ENV{UPLUGCWB}/reg/$owner"){mkdir "$ENV{UPLUGCWB}/reg/$owner";}
if (not -d "$ENV{UPLUGCWB}/dat/$owner"){mkdir "$ENV{UPLUGCWB}/dat/$owner";}
if (not -d "$ENV{UPLUGCWB}/reg/$owner/$corpus"){
mkdir "$ENV{UPLUGCWB}/reg/$owner/$corpus";
system "chmod g+w $ENV{UPLUGCWB}/reg/$owner/$corpus";
}
if (not -d "$ENV{UPLUGCWB}/dat/$owner/$corpus"){
mkdir "$ENV{UPLUGCWB}/dat/$owner/$corpus";
system "chmod g+w $ENV{UPLUGCWB}/dat/$owner/$corpus";
}
my $process=time().'_'.$$;
my $command="$INDEXER $CWBREG $CWBDAT $CorpusDir";
&Uplug::Web::Process::AddProcess('todo',$owner,$process,'$bash',$command);
# print "$command<hr>";
}
sub AddCorpusToIndex{
my $user=shift;
my $corpus=shift;
my $srcenc=shift;
my $trgenc=shift;
my $alg=shift;
my $info=&GetCorpusInfo($user,$corpus);
if ($$info{format}=~/align/){
my ($src,$trg)=split(/\-/,$$info{language});
&AddCorpusToIndex($user,
&GetCorpusName($$info{corpus},$src),
$srcenc,$trgenc,
$trg);
&AddCorpusToIndex($user,
&GetCorpusName($$info{corpus},$trg),
$trgenc,$srcenc,
$src);
}
else{
$CorpusIndex->remove($user,$$info{corpus},$$info{language},$alg);
$CorpusIndex->push($user,$$info{corpus},$$info{language},$alg,$srcenc);
}
}
sub GetCorpusDataFileOld{
my $user=shift;
return "$CorpusDir/$user/ini/uplugUserStreams.ini";
}
sub GetCorpusDataFile{
my $user=shift;
my $corpus=shift;
return "$CorpusDir/$user/$corpus/.documents";
}
sub GetCorpusDir{
my $user=shift;
my $corpus=shift;
my $lang=shift;
if (not defined $user){return $CorpusDir;}
if (not defined $corpus){return "$CorpusDir/$user";}
if (not -d "$CorpusDir/$user/$corpus"){mkdir "$CorpusDir/$user/$corpus";}
if (not defined $lang){return "$CorpusDir/$user/$corpus";}
if (not -d "$CorpusDir/$user/$corpus/$lang"){
mkdir "$CorpusDir/$user/$corpus/$lang";
}
return "$CorpusDir/$user/$corpus/$lang";
}
sub GetRecycleDir{
my $user=shift;
my $corpus=shift;
my $lang=shift;
if (not -d "$CorpusDir/.recycled"){
mkdir "$CorpusDir/.recycled",0755;
}
if (not defined $user){return "$CorpusDir/.recycled";}
if (not -d "$CorpusDir/.recycled/$user"){
mkdir "$CorpusDir/.recycled/$user",0755;
}
if (not defined $corpus){return "$CorpusDir/.recycled/$user";}
if (not -d "$CorpusDir/.recycled/$user/$corpus"){
mkdir "$CorpusDir/.recycled/$user/$corpus",0755;
}
if (defined $lang){
if (not -d "$CorpusDir/.recycled/$user/$corpus/$lang"){
mkdir "$CorpusDir/.recycled/$user/$corpus/$lang",0755;
}
return "$CorpusDir/.recycled/$user/$corpus/$lang";
}
return "$CorpusDir/.recycled/$user/$corpus";
}
sub GetCorpusStreams{
my $user=shift;
my %para=@_;
my %CorpusData=();
&GetCorpusData(\%CorpusData,$user);
my @streams=();
foreach my $c (keys %CorpusData){
my $match=1;
foreach (keys %para){
if ($CorpusData{$c}{$_}!~/$para{$_}/){$match=0;last;}
}
if ($match){push (@streams,$c);}
}
return @streams;
}
#------------------------------------------------------------------
# MatchingDocuments
# find all documents within a corpus with matching attributes
# (%para=attribute-value pairs to be matched)
sub MatchingDocuments{
my $user=shift;
my $corpus=shift;
my %para=@_;
my $docs=&CorpusDocuments($user,$corpus);
my @ok=();
foreach my $c (keys %{$docs}){
my $match=1;
foreach (keys %para){
if ($$docs{$c}{$_}!~/$para{$_}/){$match=0;last;}
}
if ($match){push (@ok,$c);}
}
return @ok;
}
#------------------------------------------------------------------
sub GetCorpusData{
my $CorpusData=shift;
my $user=shift;
my $CorpusInfoFile=&GetCorpusDataFile($user);
if (ref($CorpusData) ne 'HASH'){return 0;}
if (not -e $CorpusInfoFile){return 0;}
&LoadIniData($CorpusData,$CorpusInfoFile);
return keys %{$CorpusData};
}
sub RestoreDocument{
my ($owner,$corpus,$doc)=@_;
$CorpusDir.='/.recycled'; # set recycle-dir
my $ConfigFile=&CorporaConfigFile($owner);
my $corpora=Uplug::Web::Config->new($ConfigFile); # user corpora
my $ConfigFile=&DocumentConfigFile($owner,$corpus); # corpus configfile
my $documents=Uplug::Web::Config->new($ConfigFile); # corpus documents
my $config=$documents->read();
$CorpusDir=$ENV{UPLUGDATA}; # restore data-dir
if (defined $$config{$doc}){
my $lang=$$config{$doc}{language};
my $file=$$config{$doc}{file};
my $RecycleDir=&GetRecycleDir($owner,$corpus,$lang);
my $RemovedFile=$RecycleDir.'/'.&basename($file);
if (-e $RemovedFile){
move ($RemovedFile,$file);
}
my $ConfigFile=&DocumentConfigFile($owner,$corpus); # add the restored
my $ResDoc=Uplug::Web::Config->new($ConfigFile); # document to the
my $ResConfig=$ResDoc->read(); # corpus config
$$ResConfig{$doc}=$$config{$doc}; # file
$ResDoc->write($ResConfig); # write configfile
delete $$config{$doc}; # delete doc-data
$documents->write($config); # write configfile
if (not keys %{$config}){ # if no more removed documents
my $corpconf=$corpora->read(); # in this corpus: read the
delete $$corpconf{$corpus}; # config file and delete the
$corpora->write($corpconf); # corpus and save
}
}
$documents->close();
}
sub RemoveDocument{
my ($owner,$corpus,$doc)=@_;
my $ConfigFile=&DocumentConfigFile($owner,$corpus);
my $documents=Uplug::Web::Config->new($ConfigFile);
my $config=$documents->read();
if (defined $$config{$doc}){
my $lang=$$config{$doc}{language};
my $file=$$config{$doc}{file};
# my $corpus=$$config{$doc}{corpus};
my $RecycleDir=&GetRecycleDir($owner,$corpus,$lang);
if (-e $file){
move ($file,"$RecycleDir/");
}
$CorpusDir.='/.recycled'; # set recycle-dir
my $ConfigFile=&DocumentConfigFile($owner,$corpus); # config-filename
my $RemDoc=Uplug::Web::Config->new($ConfigFile); # open config-file
my $RemConfig=$RemDoc->read(); # read it
$$RemConfig{$doc}=$$config{$doc}; # save doc-data
$RemDoc->write($RemConfig); # write configfile
my $ConfigFile=&CorporaConfigFile($owner); # config-filename
my $RemCorpora=Uplug::Web::Config->new($ConfigFile); # open config-file
my $RemConfig=$RemCorpora->read(); # read it
$$RemConfig{$corpus}=1; # set corpus
$RemCorpora->write($RemConfig); # write configfile
delete $$config{$doc}; # delete doc-data
$documents->write($config); # write configfile
$CorpusDir=$ENV{UPLUGDATA}; # restore data-dir
}
$documents->close();
}
sub RemoveCorpus{
my ($owner,$corpus)=@_;
my $ConfigFile=&CorporaConfigFile($owner);
my $corpora=Uplug::Web::Config->new($ConfigFile);
my $config=$corpora->read();
if (defined $$config{$corpus}){
my $RecycleDir=&GetRecycleDir($owner);
my $DataDir=&GetCorpusDir($owner,$corpus);
if (-d "$RecycleDir/$corpus"){ # quite a hack ...
system "rm -fr $RecycleDir/$corpus"; # and maybe dangerous!!
}
if (-e $DataDir){
system "mv $DataDir $RecycleDir/"; # requires UNIX!!
}
$CorpusDir.='/.recycled'; # set recycle-dir
my $ConfigFile=&CorporaConfigFile($owner); # config-filename
my $RemCorpora=Uplug::Web::Config->new($ConfigFile); # open config-file
my $RemConfig=$RemCorpora->read(); # read it
$$RemConfig{$corpus}=$$config{$corpus}; # set corpus data
$RemCorpora->write($RemConfig); # write configfile
delete $$config{$corpus}; # delete corpus
$corpora->write($config); # from configfile
$CorpusDir=$ENV{UPLUGDATA}; # restore data-dir
}
$corpora->close();
my $ConfigFile=&CorporaConfigFile('pub'); # delete from public
my $corpora=Uplug::Web::Config->new($ConfigFile); # corpora list
my $config=$corpora->read();
if (defined $$config{"../$owner/$corpus"}){
delete $$config{"../$owner/$corpus"};
$corpora->write($config);
}
$corpora->close();
}
sub RemoveCorpusOld{
my ($user,$owner,$name)=@_;
if ($owner ne $user){print "Cannot remove corpus $name!";return 0;}
my $CorpusInfoFile=&GetCorpusDataFile($owner,$name);
my %CorpusData;
&LoadIniData(\%CorpusData,$CorpusInfoFile);
if (defined $CorpusData{$name}){
my $lang=$CorpusData{$name}{language};
my $file=$CorpusData{$name}{file};
my $corpus=$CorpusData{$name}{corpus};
my $RecycleDir=&GetRecycleDir($owner,$corpus);
if (-e $file){
move ($file,"$RecycleDir/");
}
delete $CorpusData{$name};
&WriteIniFile($CorpusInfoFile,\%CorpusData);
}
}
sub GetCorpusName{
my ($name,$lang)=@_;
return "$name ($lang)";
}
sub SplitCorpusName{
my ($name)=@_;
if ($name=~/^(.*)\s\((.*)\)/){
return ($1,$2);
}
return undef;
}
sub GetCorpusInfo{
my $user=shift;
my $corpus=shift;
my $doc=shift;
my $documents=&CorpusDocuments($user,$corpus);
if (ref($$documents{$doc}) eq 'HASH'){return $$documents{$doc};}
return {};
}
sub GetCorpusInfoOld{
my $user=shift;
my $CorpusName=shift;
my $CorpusInfoFile=&GetCorpusDataFile($user,$CorpusName);
my %CorpusData;
&LoadIniData(\%CorpusData,$CorpusInfoFile);
if (ref($CorpusData{$CorpusName}) eq 'HASH'){
return %{$CorpusData{$CorpusName}}
}
return undef;
}
#sub ReadCorpus{
# my $user=shift;
# my $name=shift;
# my $start=shift;
# my $nr=shift;
#
# my %stream=&Uplug::Web::Corpus::GetCorpusInfo($user,$name);
# if (not keys %stream){
# print "Cannot find corpus data for $name\n";
# }
# my $corpus=new Uplug::IO::Any(\%stream);
# if (not $corpus->open('read',\%stream)){
# print "Cannot open $name\n";
# }
# my $html;
# my @rows;
# my $data=Uplug::Data::DOM->new();
# my $count;
# my $skipped;
# while ($corpus->read($data)){
# if ($skipped<$start){$skipped++;next;}
# $count++;
# if ($count>$nr){last;}
# push(@rows,$data->toHtml());
# }
# $corpus->close();
# return @rows;
#}
#
sub SendCorpus{
my $to=shift;
my $owner=shift;
my $corpus=shift;
my $doc=shift;
my $data=&GetCorpusInfo($owner,$corpus,$doc);
if (defined $$data{file}){
&Uplug::Web::User::SendFile($to,'UplugWeb - '.$corpus,$$data{file});
return 1;
}
return 0;
}
sub CorpusIsPrivate{
my $owner=shift;
my $corpus=shift;
my $CorpusConfig=Uplug::Web::Config->new("$CorpusDir/$owner/.corpora");
my $corpora=$CorpusConfig->read();
return $$CorpusConfig{$corpus};
}
sub CorpusIsPublic{
return not &CorpusIsPrivate(@_);
}
sub CorporaConfigFile{
my $owner=shift;
if (not -d "$CorpusDir/$owner"){mkdir "$CorpusDir/$owner";}
return "$CorpusDir/$owner/.corpora";
}
sub DocumentConfigFile{
my $owner=shift;
my $corpus=shift;
if (not -d "$CorpusDir/$owner/$corpus"){mkdir "$CorpusDir/$owner/$corpus";}
return "$CorpusDir/$owner/$corpus/.documents";
}
sub Corpora{
my $owner=shift;
my $ConfigFile=&CorporaConfigFile($owner);
my $CorpusConfig=Uplug::Web::Config->new($ConfigFile);
return $CorpusConfig->read();
}
sub CorpusDocuments{
my $owner=shift;
my $corpus=shift;
my $ConfigFile=&DocumentConfigFile($owner,$corpus);
my $documents=Uplug::Web::Config->new($ConfigFile);
return $documents->read();
}
sub AddCorpus{
my $user=shift;
my $corpus=shift;
my $priv=shift; # =1 --> private corpus (don't store in public)
if ((defined $corpus) and ($corpus!~/^[a-zA-Z\_0-9]{1,10}$/)){
return (0,"Corpus name $corpus is not valid!");
}
my $UserCorpusFile=&CorporaConfigFile($user);
# "$CorpusDir/$user/.corpora";
my $UserCorpora=Uplug::Web::Config->new($UserCorpusFile);
my $corpora=$UserCorpora->read();
if (defined $$corpora{$corpus}){
return (0,"A corpus with the name '$corpus' exists already!");
}
$$corpora{$corpus}=1;
if (not $UserCorpora->write($corpora)){
return (0,"Could not add corpus info to $UserCorpusFile!");
}
$UserCorpora->close();
if (not mkdir "$CorpusDir/$user/$corpus"){
return (0,"Could not create corpus directory for '$corpus'!");
}
if (not $priv){
my $PublicCorpusFile=&CorporaConfigFile('pub');
my $PublicCorpora=Uplug::Web::Config->new($PublicCorpusFile);
my $public=$PublicCorpora->read();
$$public{"../$user/$corpus"}=1;
if (not $PublicCorpora->write($public)){
return (0,"Could not add corpus info to $PublicCorpusFile!");
}
$PublicCorpora->close();
}
return (1,"Corpus '$corpus' sucessfully added!");
}
sub AddDocument{
my ($user,$corpus,$name,$fh,$lang,$enc)=@_;
if ((defined $name) and ($name!~/^[a-zA-Z\_\.0-9]{1,15}$/)){
return (0,"Invalid document name '$name'! (use: [a-zA-Z_.]{1,15})");
}
my $documents=&CorpusDocuments($user,$corpus);
my $CorpusName=&GetCorpusName($corpus,$lang);
if (defined $$documents{$CorpusName}){
return (0,"A document with the name '$CorpusName' exists already!");
}
my $dir="$CorpusDir/$user/$corpus/$lang";
if (not -e $dir){
if (not mkdir $dir){
return (0,"Could not create $lang language directory for '$corpus'!");
}
}
my $file="$dir/$name";
# my $tmpfile=&GetTempFileName;
# open OUT, '>:encoding(utf8)',$tmpfile;
open OUT, '>:encoding(utf8)',$file;
binmode($fh);require Encode;
#----------------------------------
# read data and save them in tempfile
#
while (<$fh>){
eval {$_=&Encode::decode($enc,$_,1); };
if ($@){print $@;return undef;}
print OUT $_;
}
close OUT;
# move($tmpfile,$file); # create the corpus file
# my $lckfile="$file.lock";
# open F,">$lckfile";close F; # create a lock file
chmod 0664,$file;
# chmod 0664,$lckfile;
# unlink $tmpfile;
&AddCorpusInfo($user,$corpus,$name,$lang,'text',
{file => $file,format => 'text'});
return (1,"Document $fh successfully added to corpus $corpus!");
}
sub AddCorpusInfo{
my $owner=shift;
my $corpus=shift;
my $name=shift;
my $lang=shift;
my $status=shift;
my $para=shift;
my $CorpusFile="$CorpusDir/$owner/$corpus/.documents";
my $UserCorpora=Uplug::Web::Config->new($CorpusFile);
my $corpora=$UserCorpora->read();
my $CorpusName=&GetCorpusName($name,$lang);
%{$$corpora{$CorpusName}}=('language' => $lang,
'corpus' => $name,
'status' => $status);
if (ref($para) eq 'HASH'){
foreach (keys %{$para}){
$$corpora{$CorpusName}{$_}=$$para{$_};
}
}
if (not $UserCorpora->write($corpora)){
return (0,"Could not add corpus info to $CorpusFile!");
}
$UserCorpora->close();
}
sub ChangeCorpusInfo{
my $owner=shift;
my $corpus=shift;
my $doc=shift; # either EXISTING doc-name or doc-base-name without lang!
my $para=shift;
my $CorpusFile="$CorpusDir/$owner/$corpus/.documents";
my $UserCorpora=Uplug::Web::Config->new($CorpusFile);
my $corpora=$UserCorpora->read();
if (not defined $$corpora{$doc}){
if ((ref($para) eq 'HASH') and (defined $$para{language})){
$doc=&GetCorpusName($doc,$$para{language});
}
}
if (ref($para) eq 'HASH'){
foreach (keys %{$para}){
$$corpora{$doc}{$_}=$$para{$_};
}
}
if (not $UserCorpora->write($corpora)){
return (0,"Could not add corpus info to $CorpusFile!");
}
$UserCorpora->close();
}
sub ChangeCorpusStatus{
my $owner=shift;
my $corpus=shift;
my $doc=shift;
my $status=shift;
my $CorpusFile="$CorpusDir/$owner/$corpus/.documents";
my $UserCorpora=Uplug::Web::Config->new($CorpusFile);
my $corpora=$UserCorpora->read();
if (not defined $$corpora{$doc}){return undef;}
my $old=$$corpora{$doc}{status};
$$corpora{$doc}{status}=$status;
$UserCorpora->close();
return $old;
}
sub ChangeCorpusInfoOld{
my $user=shift;
my $CorpusName=shift;
my $para=shift;
my $CorpusInfoFile=&GetCorpusDataFile($user,$CorpusName);
my %CorpusData;
&LoadIniData(\%CorpusData,$CorpusInfoFile);
if (not defined $CorpusData{$CorpusName}){
if ((ref($para) eq 'HASH') and (defined $$para{language})){
$CorpusName=&GetCorpusName($CorpusName,$$para{language});
}
}
if (ref($para) eq 'HASH'){
foreach (keys %{$para}){
$CorpusData{$CorpusName}{$_}=$$para{$_};
}
}
&WriteIniFile($CorpusInfoFile,\%CorpusData);
}
sub ChangeCorpusStatusOld{
my $user=shift;
my $CorpusName=shift;
my $status=shift;
my $CorpusInfoFile=&GetCorpusDataFile($user,$CorpusName);
my %CorpusData;
&LoadIniData(\%CorpusData,$CorpusInfoFile);
if (not defined $CorpusData{$CorpusName}){return undef;}
my $old=$CorpusData{$CorpusName}{status};
$CorpusData{$CorpusName}{status}=$status;
&WriteIniFile($CorpusInfoFile,\%CorpusData);
return $old;
}
sub GetTempFileName{
my $fh;
my $file;
do {$file=tmpnam();}
until ($fh=IO::File->new($file,O_RDWR|O_CREAT|O_EXCL));
$fh->close;
return $file;
}
sub ChangeWordLinks{
my $file=shift;
my $links=shift;
my $params=shift;
my $sentLink=$params->{seg};
print "change links is not implemented yet!<br>";
print join '+',@{$links};
print '<hr>';
# if (not -e $file){return 0;}
# if ($file=~/\.gz$/){open F,"$GUNZIP < $file |";}
# else{open F,"< $file";}
# my $sec=0;
# while (not flock(F,2)){
# $sec++;sleep(1);
# if ($sec>$MAXFLOCKWAIT){
# close F;
# return 0;
# }
# }
# local $/='<link ';
# my @align=<F>;
# print join '<hr>',@align;
#
# close F;
}