The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# UMLS::Interface::ICFinder
# (Last Updated $Id: ICFinder.pm,v 1.37 2014/06/27 13:23:47 btmcinnes Exp $)
#
# Perl module that provides a perl interface to the
# Unified Medical Language System (UMLS)
#
# Copyright (c) 2004-2010,
#
# Bridget T. McInnes, University of Minnesota Twin Cities
# bthomson at cs.umn.edu
#
# Siddharth Patwardhan, University of Utah, Salt Lake City
# sidd at cs.utah.edu
# 
# Serguei Pakhomov, University of Minnesota Twin Cities
# pakh0002 at umn.edu
#
# Ted Pedersen, University of Minnesota, Duluth
# tpederse at d.umn.edu
#
# Ying Liu, University of Minnesota
# liux0935 at umn.edu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to 
#
# The Free Software Foundation, Inc., 
# 59 Temple Place - Suite 330, 
# Boston, MA  02111-1307, USA.

package UMLS::Interface::ICFinder;

use Fcntl;
use strict;
use warnings;
use DBI;
use bytes;

my $pkg = "UMLS::Interface::ICFinder";

my $root = "";

my $debug = 0;

my %propagationFreq  = ();
my %propagationHash  = ();

my $propagationFile  = "";
my $frequencyFile    = "";

my %frequencyHash = ();

my $option_realtime      = undef;
my $option_icpropagation = undef;
my $option_icfrequency   = undef;
my $option_t             = undef;
my $smooth               = 0;
my $configN              = 0;

my $errorhandler = "";
my $cuifinder    = "";
my $pathfinder   = ""; 

my $max_leaves = 0; 

# UMLS-specific stuff ends ----------

# -------------------- Class methods start here --------------------

#  method to create a new UMLS::Interface::PathFinder object
sub new {
    my $self = {};
    my $className = shift;
    my $params    = shift;
    my $chandler  = shift;
    my $phandler  = shift; 

    # initialize error handler
    $errorhandler = UMLS::Interface::ErrorHandler->new();
    if(! defined $errorhandler) {
	print STDERR "The error handler did not get passed properly.\n";
	exit;
    }
    
    #  initialize the cuifinder
    $cuifinder = $chandler;
    if(! (defined $chandler)) { 
	$errorhandler->_error($pkg, 
			      "new", 
			      "The CuiFinder handler did not get passed properly", 
			      8);
    }

    #  initialize the pathfinder
    $pathfinder = $phandler;
    if(! (defined $phandler)) { 
	$errorhandler->_error($pkg, 
			      "new", 
			      "The PathFinder handler did not get passed properly", 
			      8);
    }

    # bless the object.
    bless($self, $className);

    return $self;
}

# Method to initialize the UMLS::Interface::ICFinder object.
sub _setPropagationParameters
{
    my $self      = shift;
    my $params    = shift;

    #  set function name
    my $function = "_setPropagationParameters";
    &_debug($function);
    
    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
        
    #  get the umlsinterfaceindex database from CuiFinder
    my $sdb = $cuifinder->_getIndexDB();
    if(!$sdb) { $errorhandler->_error($pkg, $function, "Error with sdb.", 3); }
    $self->{'sdb'} = $sdb;
    
    #  get the root
    $root = $cuifinder->_root();

    #  set up the options
    $self->_setOptions($params);

    #  load the propagation hash if the option is specified
    if($option_icpropagation) { 
	$self->_loadPropagationHashFromFile();
    }

    #  load the frequency hash if hte option is specified
    if($option_icfrequency) { 
	$self->_loadFrequencyHash();
    }
}


#  print out the function name to standard error
#  input : $function <- string containing function name
#  output: 
sub _debug {
    my $function = shift;
    if($debug) { print STDERR "In UMLS::Interface::ICFinder::$function\n"; }
}

#  method to set the global parameter options
#  input : $params <- reference to a hash
#  output: 
sub _setOptions 
{
    my $self = shift;
    my $params = shift;
 
    my $function = "_setOptions";
    
    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
 
    #  get all the parameters
    my $debugoption   = $params->{'debug'};
    my $t             = $params->{'t'};
    my $icpropagation = $params->{'icpropagation'};
    my $icfrequency   = $params->{'icfrequency'};
    my $icsmooth      = $params->{'smooth'};
    my $realtime      = $params->{'realtime'};

    my $output = "";

    #  check if options have been defined
    if(defined $icpropagation || defined $icfrequency || defined $realtime || 
       defined $debugoption   || defined $icsmooth) { 
	$output .= "\nICFinder User Options:\n";
    }

    #  check if the debug option has been been defined
    if(defined $debugoption) {
	$debug = 1; 
	$output .= "   --debug option set\n";
    }
    
    if(defined $icsmooth) {
	$smooth = $icsmooth;
	$output .= "  --smooth\n";
    }

    #  check if the propagation option has been identified
    if(defined $icpropagation) {
	$option_icpropagation = 1;
	$propagationFile    = $icpropagation;
	$output .= "  --icpropagation $icpropagation\n";
    }

    #  check if the frequency option has been identified
    if(defined $icfrequency) { 
	$option_icfrequency = 1;
	$frequencyFile    = $icfrequency;
	$output .= "  --icfrequency $icfrequency\n";
    }

    #  check if the realtime option has been identified
    if(defined $realtime) { 
	$option_realtime = 1;
	$output .= "  --realtime option set\n"; 
    }

    &_debug($function);
      
    if(defined $t) {
	$option_t = 1;
    }
    else {
	print STDERR "$output\n";
    }    
}


#  method to set the realtime global parameter options               
#  input : bool <- 1 (turn on) 0 (turn off)                     
#  output:
sub _setRealtimeOption {

    my $self = shift;
    my $option = shift;

    my $function = "_setRealtimeOption";
    &_debug($function);

    #  check self                                                       
    if(!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    if($option == 1) {
        $option_realtime = 1;
    }
    else {
        $option_realtime = 0;
    }
}

#  method returns the configN - the total number of CUIs
#  input: 
#  output: int 
sub _getN
{
    my $self = shift;
    
    my $function = "_getN";
    &_debug($function);

    if($configN == 0) { 
	my $hash = $cuifinder->_getCuiList(); 
	$configN = keys %{$hash}; 
    }
    return $configN;
}

 
#  returns the intrinsic information content (IC) of a cui
#  input : $concept <- string containing a cui
#  output: $double  <- double containing its IC
sub _getSecoIntrinsicIC
{
    my $self     = shift;
    my $concept  = shift;

    my $function = "_getSecoInrinsicIC";
    &_debug($function);

     #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
     
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }    
       
    my $children = $cuifinder->_getChildren($concept);

    my $n = _getN(); 

    my $children_num = ($#{$children}) + 2; 
    my $ic = 1 - ( (log($children_num)/log(10)) / (log($n)/log(10)) );
		   
    return $ic;
}


sub _getDecendents
{
    my $concept   = shift;
    my $array     = shift;
    
    if($concept=~/^\s*$/) { return; }

    #  if concept is one of the following just return
    #C1274012|Ambiguous concept (inactive concept)
    if($concept=~/C1274012/) { return; }
    #C1274013|Duplicate concept (inactive concept)
    if($concept=~/C1274013/) { return; }
    #C1276325|Reason not stated concept (inactive concept)
    if($concept=~/C1276325/) { return; }
    #C1274014|Outdated concept (inactive concept)
    if($concept=~/C1274014/) { return; }
    #C1274015|Erroneous concept (inactive concept)
    if($concept=~/C1274015/) { return; }
    #C1274021|Moved elsewhere (inactive concept)
    if($concept=~/C1274021/) { return; }
    #C2733115|limited status concept
    if($concept=~/C2733115/) { return; }
    #C1443286|
    if($concept=~/C1443286/) { return; }
    
    #  set the new path
    my @path = @{$array};
    push @path, $concept;
    
    my $series = join " ", @path;

    #  get all the children
    my $children = $cuifinder->_getChildren($concept);
    
    my %subsumers = (); my %leaves = ();

    #  search through the children
    foreach my $child (@{$children}) {	
	
	#  check if child cui has already in the path
	my $flag = 0;
	foreach my $cui (@path) {
	    if($cui eq $child) { 
		$flag = 1; 
	    }
	}
	
	#  if it isn't continue on with the depth first search
	if($flag == 0) {
	    my ($s, $l) = &_getDecendents($child, \@path); 
	    %subsumers = (%subsumers, %{$s}); %leaves = (%leaves, %{$l});  
	}
    }
    
    if($#{$children} < 0) { $leaves{$concept}++; } $subsumers{$concept}++;
    
    return (\%subsumers, \%leaves); 
}

#  returns the intrinsic information content (IC) of a cui
#  input : $concept <- string containing a cui
#  output: $double  <- double containing its IC
sub _getSanchezIntrinsicIC
{
    my $self     = shift;
    my $concept  = shift;

    my $function = "_getSanchezIntrinsicIC";
    &_debug($function);

     #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
     
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }   
    
    #  get the leaves
    my $leaves = 0; my $maxleaves = 0; 
    if($option_realtime) { 
	my @path = (); 
	my ($d, $l) = _getDecendents($concept, \@path);
	$leaves = keys %{$l}; 
	$maxleaves = _getMaxLeaves(); 
    }
    else {
	#  get the umlsinterfaceindex database from CuiFinder
	my $sdb = $cuifinder->_getIndexDB();
	if(!$sdb) { $errorhandler->_error($pkg, $function, "Error with sdb.", 3); }
	$self->{'sdb'} = $sdb;
	
	#  get the intrinsic table name
	my $intrinsicTableName = $cuifinder->_getIntrinsicTableName();

	#  check that it exists in the index
	my $arrRefCheck = $sdb->selectcol_arrayref("select count(*) from tableindex where HEX=\'$intrinsicTableName\'"); 
	$errorhandler->_checkDbError($pkg, $function, $sdb);
	my $check = shift @{$arrRefCheck};
	
	if($check != 1) { 
	    print STDERR "The index does not contain the intrinsic table for the\n"; 
	    print STDERR "sources/relations in the configration file. It must have\n";
	    print STDERR "been created with an earlier version of UMLS-Interface.\n";
	    print STDERR "Please either recreate the index by removing it using the\n";
	    print STDERR "removeConfigData.pl or run with the --realtime option.\n\n";
	    exit; 
	}

	#  get subsumers and leaves from the intrinsic table
	my $arrRefLeaves = $sdb->selectcol_arrayref("select LEAVES from $intrinsicTableName where CUI=\'$concept\'");
	$errorhandler->_checkDbError($pkg, $function, $sdb);

	my $arrRefMaxLeaves = $sdb->selectcol_arrayref("select LEAVES from $intrinsicTableName where CUI=\'$root\'");
	$errorhandler->_checkDbError($pkg, $function, $sdb);
	
	$maxleaves = shift @{$arrRefMaxLeaves}; 
	$leaves    = shift @{$arrRefLeaves};
    }
    
    #  get the subsumers
    my $paths = $pathfinder->_pathsToRoot($concept);
    my %subhash = (); 
    foreach my $path (@{$paths}) {
	my @array = split/\s+/, $path;
	foreach my $element (@array) { $subhash{$element}++; }
    }
    my $subsumers = keys %subhash; 

    my $a = 0; 
    if(defined $leaves) { 
	if($leaves != 0 && $subsumers != 0) { 
	    $a = $leaves/$subsumers; 
	}
    }$a++; 

    my $b = $maxleaves; $b++; 
        
    my $ic = -1 * ( (log( $a/$b )/log(2)) ); 
    
    return $ic;
}

sub _getMaxLeaves { 
    
    if($max_leaves == 0) {

	my @path = (); 
	my ($s, $l) = _getDecendents($cuifinder->_root(), \@path); 
	
	$max_leaves = keys %{$l}; 
    }
    
    return $max_leaves;
}
    
#  returns the information content (IC) of a cui
#  input : $concept <- string containing a cui
#  output: $double  <- double containing its IC
sub _getIC
{
    my $self     = shift;
    my $concept  = shift;

    my $function = "_getIC";
    &_debug($function);

     #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
     
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }    
    
    #  if option frequency then the propagation hash 
    #  hash has not been loaded and we should determine
    #  the information content of the concept using the
    #  frequency information in the file in realtime
    if($option_icfrequency) { 
	
  	#  initialize the propagation hash
	$self->_initializePropagationHash();
	
	#  load the propagation frequency hash
	$self->_loadPropagationFreq(\%frequencyHash);
	
	#  propogate the counts
	&_debug("_propagation");
	my @array = ();
	$self->_propagation($concept, \@array);
	
	#  tally up the propagation counts
	$self->_tallyCounts();
    }
    
    my $prob = $propagationHash{$concept};

    if(!defined $prob) { return 0; }

    my $ic = 0;
    if($prob > 0 and $prob < 1) { $ic = -1 * (log($prob) / log(10)); }    

    return $ic;
}

 
#  returns the probability
#  input : $concept <- string containing a cui
#  output: $double  <- double containing its probability
sub _getProbability
{
    my $self     = shift;
    my $concept  = shift;

    my $function = "_getProbability";
    &_debug($function);

     #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
     
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }    
       
    #  if option frequency then the propagation hash 
    #  hash has not been loaded and we should determine
    #  the information content of the concept using the
    #  frequency information in the file in realtime
    if($option_icfrequency) { 
	
  	#  initialize the propagation hash
	$self->_initializePropagationHash();
	
	#  load the propagation frequency hash
	$self->_loadPropagationFreq(\%frequencyHash);
	
	#  propogate the counts
	&_debug("_propagation");
	my @array = ();
	$self->_propagation($concept, \@array);
	
	#  tally up the propagation counts
	$self->_tallyCounts();
    }
    
    my $prob = $propagationHash{$concept};

	
    if(!defined $prob) { return 0; }

    return $prob;
}

#  returns the propagation count (frequency)  of a cui
#  input : $concept <- string containing a cui
#  output: $double  <- frequency
sub _getFrequency
{
    my $self     = shift;
    my $concept  = shift;

    my $function = "_getFrequency";
    &_debug($function);

     #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
     
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }    
       
    #  if option frequency then the propagation hash 
    #  hash has not been loaded and we should determine
    #  the information content of the concept using the
    #  frequency information in the file in realtime
    if($option_icfrequency) { 
	
  	#  initialize the propagation hash
	$self->_initializePropagationHash();
	
	#  load the propagation frequency hash
	$self->_loadPropagationFreq(\%frequencyHash);
	
	#  propogate the counts
	&_debug("_propagation");
	my @array = ();
	$self->_propagation($concept, \@array);
	
	#  tally up the propagation counts
	$self->_tallyCounts();
    }

    my $freq = int($propagationHash{$concept} * $configN);

    return $freq;
}

#  this method obtains the CUIs in the sources which 
#  are going to be propagated
#  input :
#  output: $hash <- reference to hash containing the cuis
sub _loadFrequencyHash {

    my $self = shift;
    
    my $function = "_loadFrequencyHash";
    &_debug($function);

    
    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  open the frequency file
    open(FILE, $frequencyFile) || die "Could not open file $frequencyFile\n";

    #  get the source and relations associated with the propagation file
    my $sab  = <FILE>; chomp $sab;
    my $rel  = <FILE>; chomp $rel;

    #  get the rela realtions associated with the propagation file if one exists
    my $rela = <FILE>; chomp $rela;

    #  if it does exist in then get N otherwise we got N already.
    my $ninfo = $rela;
    if($rela=~/RELA/) { 
	$ninfo = <FILE>; chomp $ninfo; 
    }
    else { 
	$rela = "";
    }

    $ninfo=~/N\s*\:\:\s*([0-9]+)/;
    $configN = $1;

    #  get the source and relations from config file or the defaults
    my $configsab = $cuifinder->_getSabString();
    my $configrel = $cuifinder->_getRelString();
    
    #  check the source information is correct
    if(! ($self->_checkParameters($configsab, $sab)) ) {
	my $str = "SAB information ($sab) does not match the config file ($configsab).";
	$errorhandler->_error($pkg, $function, $str, 5);	
       }
    
    #  check that that the relation information is correct
    if(! ($self->_checkParameters($configrel, $rel)) ) { 
	my $str = "REL information ($rel) does not match the config file ($configrel).";
	$errorhandler->_error($pkg, $function, $str, 5);
    }
    
    #  check if rela information was used
    if($rela ne "") { 
	if(!($self->_checkParameters($_, $cuifinder->_getRelaString()))) {
	    my $str = "RELA information does not match the config file ($_).";
	    $errorhandler->_error($pkg, $function, $str, 5);
	}
    }
    #  check that the relations used are acceptable for propagation
    #  the only acceptable relations are RB/RN and PAR/CHD
    if(! ($self->_checkHierarchicalRelations ($configrel)) ) { 
	my $str = "REL information ($rel) contains relations other than RB/RN and PAR/CHD.";
	$errorhandler->_error($pkg, $function, $str, 11);
    }

    #  obtain the frequency counts storing them in the frequency hash table
    while(<FILE>) { 
	chomp;
	
	#  if blank line more on
	if($_=~/^\s*$/) { next; }

	#  get the cui and its frequency count
	my($cui, $freq) = split/<>/;
	
	#  make certain that it is a cui and a frequency
	#  and if it is load it into the frequency hash
	if( ($cui=~/C[0-9]/) && ($freq=~/[0-9]+/) ) { 
	    if(exists $frequencyHash{$cui}) { 
		$frequencyHash{$cui} += $freq;
	    }
	    else {
		$frequencyHash{$cui} = $freq;
	    }
	}
    }
    
    close(FILE);
}

#  this method obtains the CUIs in the sources which 
#  are going to be propagated
#  input :
#  output: $hash <- reference to hash containing the cuis
sub _getPropagationCuis {

    my $self = shift;
    
    my $function = "_getPropagationCuis";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  return the reference to a hash
    return $cuifinder->_getCuiList();
}

#  initialize the propgation hash
#  input :
#  output:
sub _initializePropagationHash {

    my $self = shift;

    my $function = "_initializePropagationHash";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
    
    #  clear out the hash just in case
    my $hash = $self->_getPropagationCuis();
       
    #  add the cuis to the propagation hash
    foreach my $cui (sort keys %{$hash}) { 
	$propagationHash{$cui} = "";
	$propagationFreq{$cui} = $smooth;
    }
}

#  load the propagation frequency has with the frequency counts
#  input : $hash <- reference to hash containing frequency counts
#  output:
sub _loadPropagationFreq {

    my $self = shift;
    my $fhash = shift;
    
    my $function = "_loadPropagationFreq";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  loop through and set the frequency count
    my $N = 0;    
    foreach my $cui (sort keys %{$fhash}) {
	if($cui=~/^\s*$/) { next; }
	
	my $freq = ${$fhash}{$cui};
	if(exists $propagationFreq{$cui}) {
	    $propagationFreq{$cui} += $freq;
	}
	$N = $N + $freq;
    }
    
    #  check if something has been set
    if($smooth == 1) { 
	my $pkeys = keys %propagationFreq;
	$N += $pkeys;
    }
    
    #  set N for the config file
    $configN = $N;

    #  loop through again and set the probability
    foreach my $cui (sort keys %propagationFreq) { 
	$propagationFreq{$cui} = ($propagationFreq{$cui}) / $N;
	
    }
}

#  check that the parameters in config file match
#  input : $string1 <- string containing parameter
#          $string2 <- string containing configuratation parameter
#  output: 0|1      <- true or false
sub _checkParameters {
    my $self = shift;
    my $string1 = shift;
    my $string2 = shift;

    my $function = "_checkParameters";
    &_debug($function);

    if( !(defined $string1) && !(defined $string2) ) { return 1; }
    if( ($string1=~/^\s*$/) && ($string2=~/^\s*$/) ) { return 1; }
    if( ($string1=~/^\s*$/) && !($string2=~/^\s*$/) ) { return 0; }
    if( !($string1=~/^\s*$/) && ($string2=~/^\s*$/) ) { return 0; }

    if(!($string1=~/([A-Z]+) :: (include|exclude) (.*?)$/)) { return 0; }
    if(!($string2=~/([A-Z]+) :: (include|exclude) (.*?)$/)) { return 0; }

    $string1=~/([A-Z]+) :: (include|exclude) (.*?)$/;
    my $option1 = $1;
    my $type1   = $2;
    my $param1  = $3;
    
    $string2=~/([A-Z]+) :: (include|exclude) (.*?)$/;
    my $option2 = $1;
    my $type2   = $2;
    my $param2  = $3;

    if($option1 ne $option2) { return 0; }
    if($type1 ne $type2)     { return 0; }

    my @array1 = split/\,/, $param1;
    my @array2 = split/\,/, $param2;
    

    my %hash = ();

    foreach my $element (@array1) { $element=~s/\s+//g; $hash{$element}++; }
    foreach my $element (@array2) { $element=~s/\s+//g; $hash{$element}++; }
    
    foreach my $element (sort keys %hash) { 
	if($hash{$element} != 2) { return 0; }
    }    

    return 1;
}

#  check that the relations used are only RB/RN and/or PAR/CHD relations
#  input : string  <- contain the relation line from config file
#  output: 1|0     <- indicating if the string contains relations other 
#                     than RB/RN or PAR/CHD relations
sub _checkHierarchicalRelations {
    
    my $self = shift;
    my $string  = shift;

    my $function = "_checkHierarchicalRelations";
    &_debug($function);

    $string=~/([A-Z]+) :: (include|exclude) (.*?)$/;
    my $option = $1;
    my $type   = $2;
    my $param  = $3;
    
    $param=~s/\s+//g;

    my @rels = split/\s*\,\s*/, $param; 
    
    foreach my $rel (@rels) { 
	if( !($rel=~/(PAR|CHD|RB|RN)/) ) { 
	    return 0;
	}
    }
    
    return 1;
}

#  load the propagation hash
#  input :
#  output: 
sub _loadPropagationHashFromFile {

    my $self = shift;
        
    my $function = "_loadPropagationHashFromFile";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  open the propagation file
    open(FILE, $propagationFile) || die "Could not open file $propagationFile\n";
    #  check if smoothing was set
    my $psmooth = <FILE>;
    
    #  get the source and relations associated with the propagation file
    my $sab  = <FILE>; chomp $sab;
    my $rel  = <FILE>; chomp $rel;

    #  get the rela realtions associated with the propagation file if one exists
    my $rela = <FILE>; chomp $rela;

    #  if it does exist in then get N otherwise we got N already.
    my $ninfo = $rela;
    if($rela=~/RELA/) { 
	$ninfo = <FILE>; chomp $ninfo; 
    }
    else { 
	$rela = "";
    }

    $ninfo=~/N\s*\:\:\s*([0-9]+)/;
    $configN = $1;

    #  get the source and relations from config file or the defaults
    my $configsab = $cuifinder->_getSabString();
    my $configrel = $cuifinder->_getRelString();
    
    #  check the source information is correct
    if(! ($self->_checkParameters($configsab, $sab)) ) {
	my $str = "SAB information ($sab) does not match the config file ($configsab).";
	$errorhandler->_error($pkg, $function, $str, 5);	
       }
    
    #  check that that the relation information is correct
    if(! ($self->_checkParameters($configrel, $rel)) ) { 
	my $str = "REL information ($rel) does not match the config file ($configrel).";
	$errorhandler->_error($pkg, $function, $str, 5);
    }
    
    #  check if rela information was used
    if($rela ne "") { 
	if(!($self->_checkParameters($_, $cuifinder->_getRelaString()))) {
	    my $str = "RELA information does not match the config file ($_).";
	    $errorhandler->_error($pkg, $function, $str, 5);
	}
    }
    #  check that the relations used are acceptable for propagation
    #  the only acceptable relations are RB/RN and PAR/CHD
    if(! ($self->_checkHierarchicalRelations ($configrel)) ) { 
	my $str = "REL information ($rel) contains relations other than RB/RN and PAR/CHD.";
	$errorhandler->_error($pkg, $function, $str, 11);
    }
    
    while(<FILE>) {
	chomp;
	
	#  if blank line move on
	if($_=~/^\s*$/) { next; }
	
	#  get the cui and its frequency count
	my ($cui, $freq) = split/<>/;

	#  load it into the propagation hash
	$propagationHash{$cui} = $freq;
    }
}

#  DEBUNKED FUNCTION? CHECK
#  get the propagation count for a given cui
#  input : $concept   <- string containing the cui
#  output: $double|-1 <- the propagation count otherwise
#                        a -1 if none existed for that cui
sub _getPropagationCount {

    my $self = shift;
    my $concept = shift;

    my $function = "_getPropagationCount";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }

    #  propagate the counts
    $self->_propagateCounts();

    #  if the concept exists in the propagation hash 
    #  return the probability otherwise return a -1
    if(exists $propagationHash{$concept}) {
	return $propagationHash{$concept};
    }
    else {
	return -1;
    }

}

#  method which actually propagates the counts
#  input : $hash <- reference to the hash containing 
#                   the frequency counts
#  output: 
sub _propagateCounts {

    my $self = shift;    
    my $fhash = shift;
    
    my $function = "_propagateCounts";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }

    #  check the parameters
    if(!defined $fhash) { 
	$errorhandler->_error($pkg, $function, "Input variable \%fhash  not defined.", 4);
    }

    #  initialize the propagation hash
    $self->_initializePropagationHash();
    
    #  load the propagation frequency hash
    $self->_loadPropagationFreq($fhash);
    
    #  propagate the counts
    my @array = ();
    $self->_propagation($root, \@array);
    
    #  tally up the propagation counts
    $self->_tallyCounts();

    my $k = keys %propagationHash;
    
    #  return the propagation counts
    return \%propagationHash;
}

#  method that tallys up the probability counts of the
#  cui and its decendants and then calculates the ic
#  input :
#  output: 
sub _tallyCounts {

    my $self = shift;
    
    my $function = "_tallyCounts";
    &_debug($function);

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
    
    foreach my $cui (sort keys %propagationHash) {
	my $set    = $propagationHash{$cui};
	my $pcount = $propagationFreq{$cui};
	
	if(defined $set) { 
	    my %hash = ();
	    while($set=~/(C[0-9][0-9][0-9][0-9][0-9][0-9][0-9])/g) {
		my $c = $1;
		if(! (exists $hash{$c}) ) {
		    $pcount += $propagationFreq{$c};
		    $hash{$c}++;
		}
	    }	
	}
	$propagationHash{$cui} = $pcount;
    }
}

#  recursive method that acuatlly performs the propagation
#  input : $concept <- string containing the cui
#          $array   <- reference to the array containing
#                      the cui's decendants
#  output: $concept <- string containing the cui
#          $array   <- reference to the array containing
#                      the cui's decendants
sub _propagation {

    my $self    = shift;
    my $concept = shift;
    my $array   = shift;

    my $function = "_propagation";

    #  check self
    if(!defined $self || !ref $self) {
	$errorhandler->_error($pkg, $function, "", 2);
    }
    
    #  check concept was obtained
    if(!$concept) { 
	$errorhandler->_error($pkg, $function, "Error with input variable \$concept.", 4);
    }
    
    #  check if valid concept
    if(! ($errorhandler->_validCui($concept)) ) {
	$errorhandler->_error($pkg, $function, "Concept ($concept) in not valid.", 6);
    }
    
    #  if the concept is inactive
    if($cuifinder->_forbiddenConcept($concept)) { return; }

    #  set up the new path
    my @intermediate = @{$array};
    push @intermediate, $concept;
    my $series = join " ", @intermediate;

    #  initialize the set
    my $set = $propagationHash{$concept};

    #  if the propagation hash already contains a list of CUIs it
    #  is from its decendants so it has been here before so all we 
    #  have to do is return the list of ancestors with it added
    if(defined $set) { 
	if(! ($set=~/^\s*$/)) { 
	    $set .= " $concept";
	    return $set; 
	}
    }

    #  get all the children
    my $children = $cuifinder->_getChildren($concept);

    #  search through the children   
    foreach my $child (@{$children}) {

	my $flag = 0;
	
	#  check that the concept is not one of the forbidden concepts
	if($cuifinder->_forbiddenConcept($child)) { $flag = 1; }
	
	#  check if child cui has already in the path
	foreach my $cui (@intermediate) {
	    if($cui eq $child) { $flag = 1; }
	}
	
	#  if it isn't continue on with the depth first search
	if($flag == 0) {  
	    $set .= " ";
	    $set .= $self->_propagation($child, \@intermediate);    
	}
    }
    
    #  remove duplicates from the set
    my $rset;
    if(defined $set) { 
	$rset = _breduce($set); 
    }
    #  store the set in the propagation hash
    $propagationHash{$concept} = $rset;
    
    #  add the concept to the set
    $rset .= " $concept";
    
    #  return the set
    return $rset;
}

#  removes duplicates in an array
#  input : $array <- reference to an array
#  output: 
sub _breduce {
    
    local($_)= @_;
    my (@words)= split;
    my (%newwords);
    for (@words) { $newwords{$_}=1 }
    join ' ', keys(%newwords);
}


1;

__END__

=head1 NAME

UMLS::Interface::ICFinder  - provides the information content 
of the CUIs in the UMLS for the modules in the 
UMLS::Interface package.

=head1 DESCRIPTION

This package provides the information content of the CUIs in the 
UMLS for the modules in the UMLS::Interface package.

For more information please see the UMLS::Interface.pm 
documentation. 

=head1 SYNOPSIS

 use UMLS::Interface::CuiFinder;
 use UMLS::Interface::ICFinder;
 use UMLS::Interface::ErrorHandler;

 %params = ();

 $params{"realtime"} = 1;

 $cuifinder = UMLS::Interface::CuiFinder->new(\%params); 
 die "Unable to create UMLS::Interface::CuiFinder object.\n" if(!$cuifinder);

 $icfinder = UMLS::Interface::ICFinder->new(\%params, $cuifinder); 
 die "Unable to create UMLS::Interface::ICFinder object.\n" if(!$icfinder);

 $concept = "C0037303";

 $ic = $icfinder->_getIC($concept);
 print "The IC of $concept is $ic\n\n";

 print "Note: This probably returned zero because the information\n";
 print "content file is not specified - this is difficult to do in\n"; 
 print "the synopsis. You need to create an icpropagation file and\n";
 print "then pass it as one of the parameters. See:\n\n";
 print "            create-icpropagation.pl\n\n";
 print "to create it and then add the following line above:\n\n";
 print "           \$params{\"icpropgation\"} = <icpropagation file>;\n\n";
 print "\n";

=head1 INSTALL

To install the module, run the following magic commands:

  perl Makefile.PL
  make
  make test
  make install

This will install the module in the standard location. You will, most
probably, require root privileges to install in standard system
directories. To install in a non-standard directory, specify a prefix
during the 'perl Makefile.PL' stage as:

  perl Makefile.PL PREFIX=/home/sid

It is possible to modify other parameters during installation. The
details of these can be found in the ExtUtils::MakeMaker
documentation. However, it is highly recommended not messing around
with other parameters, unless you know what you're doing.

=head2 PROPAGATION

The Information Content (IC) is  defined as the negative log 
of the probability of a concept. The probability of a concept, 
c, is determine by summing the probability of the concept 
(P(c)) ocurring in some text plus the probability its decendants 
(P(d)) occuring in some text:

P(c*) = P(c) + \sum_{d\exists decendant(c)} P(d)

The initial probability of a concept (P(c)) and its decendants 
(P(d)) is obtained by dividing the number of times a concept is 
seen in the corpus (freq(d)) by the total number of concepts (N):

P(d) = freq(d) / N

Not all of the concepts in the taxonomy will be seen in the corpus. 
We have the option to use Laplace smoothing, where the frequency 
count of each of the concepts in the taxonomy is incremented by one. 
The advantage of doing this is that it avoides having a concept that 
has a probability of zero. The disadvantage is that it can shift the 
overall probability mass of the concepts from what is actually seen 
in the corpus. 

=head2 PROPAGATION IN REALTIME

The algorithm to determine the information content of a CUI in real 
time is as follows:

        1. get all of its decendants using the DFS

        2. looping through the frequency file computing the 
           probability of the decendants and storing this 
           information

        3. sum the probability of the decedants and the CUI 

        4. calculate the IC which is defined as the negative log of 
           the probabilty of the concept (which is the sum from
           step 3)


In order to run the propagation in real time, we require the frequency 
counts of a list of CUIs to be given to us. 

=head1 SEE ALSO

http://tech.groups.yahoo.com/group/umls-similarity/

http://search.cpan.org/dist/UMLS-Similarity/

=head1 AUTHOR

Bridget T McInnes <bthomson@cs.umn.edu>
Ted Pedersen <tpederse@d.umn.edu>

=head1 COPYRIGHT

 Copyright (c) 2007-2009
 Bridget T. McInnes, University of Minnesota
 bthomson at cs.umn.edu

 Ted Pedersen, University of Minnesota Duluth
 tpederse at d.umn.edu

 Siddharth Patwardhan, University of Utah, Salt Lake City
 sidd at cs.utah.edu

 Serguei Pakhomov, University of Minnesota Twin Cities
 pakh0002 at umn.edu

 Ying Liu, University of Minnesota Twin Cities
 liux0395 at umn.edu

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to 

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.