lib/GnaData/Topics.pm - metacpan.org

#!/var/local/gna/bin/perl5


=pod
These are various procedures to guess topics

Use refine_topics1
=cut

my(@exclude_words) = 
    ("the", "and", "to", "of", "a", "an","for", "what", "are", "in", "our");

my($remap_table) = {
    "masters"=>"master",
    "bachelors"=>"bachelor"
    };

my($exclude_table) = {
    "master"=>["science", "arts"],
    "bachelor"=>["science", "arts"],
    "associate"=>["science", "arts"]
};

require 'edit-catalog.pl';
use strict;

my($table) = $::catalog_file;
my($table1) = $::tabledir . "/wordtable1.rdb";
my($topic_list) = $::tabledir . "/topics.rdb";

sub use_bayes {
    my($title, $restrict) = @_;
    my(%prob) = ();
    my($word, $exclude_word, $topic, $score);
    
  topicloop:
    foreach $word (split(/\s+/, $title)) {
	$word =~ tr/A-Z/a-z/;
#print "Looking for $word\n";
	foreach $exclude_word (@exclude_words) {
	    if ($word eq $exclude_word) {
#		print "  Ignoring\n";
		next topicloop;
	    }
	}
	my(%transfer_prob) = ();
	open (DATA, "echo '$word' | search -mi conting.rdb word |");
	$_ = <DATA>;
	$_ = <DATA>;
	while (<DATA>) {
	    ($topic, $word, $score) = split(/\t/, $_);
	    $transfer_prob{$topic} = $score;
	}
	close(DATA);
	
	my(%new_prob) = ();
	foreach $word (keys %transfer_prob) {
	    if ($prob{$word} ne "") {
		$new_prob{$word} = $prob{$word} * $transfer_prob{$word};
	    } else {
		$new_prob{$word} = 0.01 * $transfer_prob{$word};
	    }
	}

	foreach $word (keys %prob) {
	    if ($new_prob{$word} eq "") {
		$new_prob{$word} = 0.01 * $prob{$word};
	    }
	}

	if ($restrict ne "") {
	    foreach $word (keys %prob) {
		if ($word !~ /^$restrict/) {
		    delete $new_prob{$word};
		}
	    }
	}

# Normalize
	my($norm) =0.0;
	foreach $word (keys %new_prob) {
	    $norm += $new_prob{$word};
	}
	
	foreach $word (keys %new_prob) {
	    $new_prob{$word} /= $norm;
	}
	
	%prob = %new_prob;

#print "Probablity vector table\n---------------------\n";
#foreach $word (keys %new_prob) {
#print "$word\t$prob{$word}\n";
#}
}
    return %prob;
}

sub find_top {
    my ($probref, $number) = @_;

    my(@topic_list);
    my($I, $J);
    if ($number eq "") {
	$number = 1;
    }

    my($word);
  wordloop:
    foreach $word (keys (%{$probref})) {
        my(@list) = split(/;/, $word);
        my($words) = $#list;
	my($i);
	for ($i=0; $i < $number; $i++) {
	    my(@list_count) = split(/;/, $topic_list[$i]);
	    my($topic_count) = $#list_count;
	    my($j);

	    if ($topic_list[$i] eq "" ||
		$probref->{$word} > $probref->{$topic_list[$i]} ||
($probref->{$word} == $probref->{$topic_list[$i]}
&&
$words > $topic_count)) {
		for ($j=$number-1; $j > $i; $j--) {
		    $topic_list[$j] = $topic_list[$j-1];
		}
		$topic_list[$i] = $word;
		next wordloop;
	    }
	}
    }
    return @topic_list;
}

sub refine_topic {
    my($title, $listref) = @_;
    local($_, *FILE);
    my(%array) = ();
    my(@title_words) = split(/\s+/, $title);
    my($check_topic);

    foreach $check_topic (@{$listref}) {
        open(FILE, "echo '$check_topic' | search -mi -x $table topic | column title topic | headoff |");
        while (<FILE>) {
	    my($test, $topic) = split(/[\t\n]/);
	    $test =~ y/A-Z/a-z/;
	    my(@test_words) = split(/\s+/, $test);

            my(%bucket) = ();
            my(%title_words) = ();
	    my(%test_words) = ();
            my($match_count) = 0;
            my($total_word_count) = 0;
	    my($word);
            foreach $word (@title_words) {
                $bucket{$word} = 1;
                $title_words{$word} = 1;
            }
            foreach $word (@test_words) {
		$bucket{$word} = 1;
		$test_words{$word} = 1;
	    }
	    my(%exclude_words);
            foreach $word (keys %bucket) {
                if (! $exclude_words{$word}) {
                    if ($title_words{$word} == 1
			&& $test_words{$word} == 1) {
                        $match_count ++;
                    }
                    $total_word_count++;
                }
	    }
            my($scale_factor) = $match_count / $total_word_count;
            if ($scale_factor > $array{$topic}) {
                $array{$topic} = $scale_factor;
            }
        }
        close(FILE);
    }

    return (%array);
}

sub refine_topic1 {
    my($title, $listref) = @_;
    local($_, *FILE);
    my(%array) = ();
    my($cache_file) = "/tmp/topic.cache.$$";

    my($title_words) = &tokenize_title($title, $remap_table);
    my($exclude_words) = 
	&get_exclude_list($title_words,
			  \@exclude_words,
			  $exclude_table);

     my($check_topic, $word_score);

    foreach $check_topic (@{$listref}) {
	if ($check_topic eq "") {
	    if (!-e $cache_file) {
		`column title topic < $table > $cache_file`;
	    }
	    open(FILE, $cache_file);
	} else {
	    open(FILE, "echo '$check_topic' | search -mi -x $table topic | column title topic | headoff |");
	}
        while (<FILE>) {
	    my($test, $topic) = split(/[\t\n]/);
	    $test =~ y/A-Z/a-z/;
	    my(@test_words) = split(/\s+/, $test);
	    my($has_word) = 0;
            my(%bucket) = ();
            my(%title_words) = ();
	    my(%test_words) = ();
            my($match_count) = 0;
            my($total_word_count) = 0;
	    my($word);
            foreach $word (@test_words) {
                if (! $exclude_words->{$word}) {
		    $bucket{$word} = 1;
		    $test_words{$word} = 1;
		}
	    }

            foreach $word (@$title_words) {
                if (! $exclude_words->{$word}) {
		    if ($bucket{$word} == 1) {
			$has_word = 1;
		    }
		    $bucket{$word} = 1;
		    $title_words{$word} = 1;
		}
            }
	    if ($has_word) {
		foreach $word (keys %bucket) {
		    if (! $exclude_words->{$word}) {
			if ($title_words{$word} > 0 &&
			    $test_words{$word} > 0) {
			    $word_score = &get_word_score($topic,$word);
			    $match_count += $word_score;
#			    print "$word -> $word_score\n";
			    if ($word_score > 0) {
				$total_word_count += 
				    &get_word_score($topic,$word);
			    } else {
				$total_word_count += 1;
			    }
			} else {
			    $total_word_count += 1;
			}
		    }
		}
		my($scale_factor) = $match_count / $total_word_count;
		
		if ($scale_factor > $array{$topic}) {
#		    print "-->$topic $test $scale_factor\n";
		    $array{$topic} = $scale_factor;
		}
	    }
        }			# 
        close(FILE);
    }

    return (%array);
}

my(%cache) = ();
my(%loaded) = ();
sub get_word_score {
    my($topic, $word) = @_;
    my($foo, $foo1, $foo2, $score);
    if (!defined($loaded{$word})) {
#	print "Loading $word\n";
	open(FILE1, "echo '$word' | search -mb $table1 word | ");
	$foo = <FILE1>;
	$foo = <FILE1>;
	while ($foo = <FILE1>) {
	    ($foo1, $foo2, $score) = split(/[\t\n]+/, $foo);
	    $cache{$foo2 . "//" . $foo1} = $score;
	}
	close(FILE1);
	$loaded{$word}  = 1;
	if ($score eq "") {$score = 0;}
#	print "$topic - $word - $score\n";
	return  $cache{$word . "//" . $topic};
    } else {
	return $cache{$word . "//" . $topic};
    }
}

use strict;

sub tokenize_title {
    my ($title, $remap) = @_;
    my (@word_list) = ();
    my ($word);
    foreach $word (split(/[\s\-\:]+/, $title)) {
	$word =~ tr/A-Z/a-z/;
	$word =~ s/[:\(\)\"\'\,\?\-\;\*\&\#\.\/]//g;
	$word =~ s/<.*?>//g;
	if ($remap->{$word} ne "") {
	    $word = $remap->{$word};
#	    print "Mapping $word to $remap->{$word}\n";
	}
	push(@word_list, $word);
    }
    return \@word_list;
}

sub get_exclude_list {
    my ($title_word_list, $global_excludes, $local_exclude) = @_;
    my ($exclude_list) = {};
    my ($word, $word1);
    foreach $word (@$global_excludes) {
	$exclude_list->{$word} = 1;
    }
    foreach $word (@$title_word_list) {
	if (defined($local_exclude->{$word})) {
	    foreach $word1 (@{$local_exclude->{$word}}) {
		$exclude_list->{$word1} = 1;
	    print "Excluding $word -> $word1\n";
	    }
	}
    }
    return $exclude_list;
}

sub guess_topic {
    my($fref, $assignh) = @_;

    my($title) = $fref->{'title'} ne "" ? 
	$fref->{'title'} : $fref->{'name'};
    my($id) = $fref->{'classid'} ne "" ? 
	$fref->{'classid'} : $fref->{'progid'};
    my($topic) = $fref->{'topic'};

    my($topich) = IO::File->new($topic_list);
    my($logstring) = "";
    my(%in, %guess );
    @in{"topic", "title", "classid"} = ($topic, $title, $id);

    $in{"title"} =~ y/A-Z/a-z/;
    $in{"classid"} =~ y/A-Z/a-z/;

# Try to see if topics is already assigned
    if ($in{"classid"} ne "") {
      $topic = `echo "$in{'classid'}" | search -mi $::tabledir/catalog.rdb-c classid | column topic | headoff | head -1`;
	chop $topic;
     if ($topic ne "") {
        $in{"topic"} = $topic;
     }
}

# Ignore headers
    $topich->getline();
    $topich->getline();

    while ($_ = $topich->getline()) {
	/^\s*$/ && next;
	/^#/ && next;
	chop;
	@guess{"oldtopic", "title", "classid", "newtopic"} = 
	    split(/\|/, $_);
	my($match) = 1;
	foreach (keys %guess) {
	    if ($guess{$_} eq "" || $_ eq "newtopic") {
		next;
	    }
	    if ($_ eq "oldtopic") {
		if ($guess{"oldtopic"} eq "*") {
		    next;
		} elsif ($guess{"oldtopic"} eq "" && $in{"topic"} ne "") {
		    $match = 0; last;
		} elsif ($in{"topic"} !~ m!^$guess{"oldtopic"}!) {
		$match = 0; last;
	    }
	    } elsif ($in{$_} !~ /$guess{$_}/) {
		$match = 0; last;
	    }
	}
	
	if ($match) {

	    if ($guess{"newtopic"} =~ /^\+/) {
	        $guess{"newtopic"} =~ s/\+//;
		if ($in{"topic"} ne "" && $in{"topic"} !~ /$guess{"newtopic"}/
&& $in{'topic'} !~ /Countries/ && $in{'topic'} !~ /Regions/) {
		    $in{"topic"} .= ";" . $guess{"newtopic"};
		}
	    } else {
		($in{"topic"} = $guess{"newtopic"});
	    
	    }
	}
    }
    $topich->close();
    if ($in{"topic"} eq "") {
	$logstring .= 
	    join(":", "NOTOPIC" ,@in{"classid", "title"}) . "\n";
    } else {
	$logstring .= 
	    join(":", @in{"topic", "classid", "title"}) . "\n";
    }
    my($check_topic) = 
	&gna_catalog_edit_check_topic($::catalog_file, $in{'topic'}); 

	
    if ($check_topic ne "") {
	$logstring .= "   ****$check_topic \n";	
    }
    if ($check_topic =~ /too many/i || $in{'topic'} eq "") {
	$logstring .= "   *** Refining ";
	my(%prob_vec) = &refine_topic1($in{'title'}, [$in{'topic'}]);
	my($new_topic) = &find_top(\%prob_vec, 1);
	if ($new_topic ne "") {
	    $logstring .= "to $new_topic";
	    $in{'topic'} = $new_topic;
	}
	$logstring .= "\n";
    }
    if (defined($assignh)) {
	$assignh->print($logstring);
	$assignh->flush();
    }
    return $in{"topic"};
}



1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)