lib/Alvis/TermTagger.pm

package Alvis::TermTagger;

our $VERSION = '0.7';

#######################################################################
#
# Last Update: 21/12/2010 (mm/dd/yyyy date format)
# 
# Copyright (C) 2006 Thierry Hamon
#
# Written by thierry.hamon@univ-paris13.fr
#
# Author : Thierry Hamon
# Email : thierry.hamon@univ-paris13.fr
# URL : http://www-limbio.smbh.univ-paris13.fr/membres/hamon/
#
########################################################################


use strict;
use warnings;

# TODO : write functions for term tagginga, term selection with and
# without offset in the corpus

sub termtagging {

    my ($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename) = @_;

    my @term_list;
    my @regex_term_list;
    my @regex_lemmawordterm_list;
    my %corpus;
    my %lc_corpus;
    my %lemmatised_corpus;
    my %lc_lemmatised_corpus;
    my %corpus_index;
    my %lemmatised_corpus_index;
    my %idtrm_select;
    my %idlemtrm_select;

    &load_TermList($term_list_filename,\@term_list);
    &get_Regex_TermList(\@term_list, \@regex_term_list, \@regex_lemmawordterm_list);

    &load_Corpus($corpus_filename,\%corpus, \%lc_corpus);
    if (defined $lemmatised_corpus_filename) {
	&load_Corpus($lemmatised_corpus_filename,\%lemmatised_corpus, \%lc_lemmatised_corpus);
    }
    &corpus_Indexing(\%lc_corpus, \%corpus_index);
    if (defined $lemmatised_corpus_filename) {
	&corpus_Indexing(\%lc_lemmatised_corpus, \%lemmatised_corpus_index);
    }
    &term_Selection(\%corpus_index, \@term_list, \%idtrm_select);
    if (defined $lemmatised_corpus_filename) {
	&term_Selection(\%lemmatised_corpus_index, \@term_list, \%idlemtrm_select);
    }
    &term_tagging_offset(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename);
    if (defined $lemmatised_corpus_filename) {
	&term_tagging_offset(\@term_list, \@regex_lemmawordterm_list, \%idlemtrm_select, \%lemmatised_corpus, $output_filename);
    }
    return(0);
}


sub load_TermList {
    my ($termlist_name, $ref_termlist) = @_;

    my $line;
    my $line1;
    my $term;        # not use yet 
    my $suppl_info;  # not use yet 
    my @tab;

    warn "Loading the terminological resource\n";

    open DESC_TERMLIST, $termlist_name or die "$0: $termlist_name: No such file\n";

    binmode(DESC_TERMLIST, ":utf8");

    while($line1 = <DESC_TERMLIST>) {
	chomp $line1;
	utf8::decode($line1);
	$line=$line1;

	# Blank and comment lines are throw away
	if (($line !~ /^\s*\#/o)&&($line !~ /^\s*\/\//o)&&($line !~ /^\s*$/o)) {
	    # Term is split from the other information
	    my @tab = split / ?[\|:] ?/, $line;
	     if ($tab[0] !~ /^\s*$/) {
		 # TODO better
		 $tab[0] =~ s/ +/ /go;
		 $tab[0] =~ s/ $//go;
		 $tab[0] =~ s/^ //go;
		 push @$ref_termlist, \@tab;
	     }
 	 }
    }
    close DESC_TERMLIST;
    print STDERR "\n\tTerm list size : " . scalar(@$ref_termlist) . "\n\n";
}

sub get_Regex_TermList {

    my ($ref_termlist, $ref_regex_termlist, $ref_regex_lemmaWordtermlist) = @_;
    my $term_counter;

    warn "Generating the regular expression from the terms\n";

    for($term_counter  = 0;$term_counter < scalar @$ref_termlist;$term_counter++) {
	$ref_regex_termlist->[$term_counter] = $ref_termlist->[$term_counter]->[0];
	if (defined $ref_regex_lemmaWordtermlist) {
	    $ref_regex_lemmaWordtermlist->[$term_counter] = $ref_termlist->[$term_counter]->[3];
	}
 	$ref_regex_termlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}])/\\$1/og;
	$ref_regex_termlist->[$term_counter] =~ s/ /[\- \n]/og;
	if (defined $ref_regex_lemmaWordtermlist) {
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/([()\',\[\]\?\!:;\/.\+\-\*\#\{\}])/\\$1/og;
	    $ref_regex_lemmaWordtermlist->[$term_counter] =~ s/ /[\- \n]/og;
	}
    }
    print STDERR "\n\tTerm/regex list size : " . scalar(@$ref_regex_termlist);
    if (defined $ref_regex_lemmaWordtermlist) {
	print STDERR" / " . scalar(@$ref_regex_lemmaWordtermlist);
    }
    print STDERR "\n\n";
}

sub load_Corpus {

    my ($corpus_filename,$ref_tabh_Corpus, $ref_tabh_Corpus_lc) = @_;
    my $line;
    my $sent_id = 1;

    warn "Loading the corpus\n";

    open CORPUS, $corpus_filename or die "File $corpus_filename not found\n";
 
    binmode(CORPUS, ":utf8");
    
    while($line=<CORPUS>){
	chomp $line;
	$ref_tabh_Corpus->{$sent_id} = $line;
	$ref_tabh_Corpus_lc->{$sent_id} = lc $line;	
	$sent_id++;
    }
    close CORPUS;
    print STDERR "\n\tCorpus size : " . scalar(keys %$ref_tabh_Corpus) . "\n\n";
}


sub corpus_Indexing {
    my ($ref_corpus_lc, $ref_corpus_index) = @_;

    my $word;
    my @tab_words;
    my $sent_id;

    warn "Indexing the corpus\n";

    foreach $sent_id (keys %$ref_corpus_lc) { # \-\.,\n;\/
	@tab_words = split /[ ()\',\[\]\?\!:;\/\.\+\-\*\#\{\}\n]/, $ref_corpus_lc->{$sent_id};
	foreach $word (@tab_words) {
	    if ($word ne "") {
		if (!exists $ref_corpus_index->{$word}) {
		    my @tabtmp;
		    $ref_corpus_index->{$word} = \@tabtmp;
		}
		push @{$ref_corpus_index->{$word}}, $sent_id;
	    }
	}
    }
    print STDERR "\n\tSize of the first selected term list: " . scalar(keys %$ref_corpus_index) . "\n\n";
}

sub print_corpus_index {
    my ($ref_corpus_index) = @_;

    my $word;

    foreach $word (sort keys %$ref_corpus_index) {
	print STDERR "$word\t";
	print STDERR join(", ", @{$ref_corpus_index->{$word}});
	print STDERR "\n";
    }
}

sub _term_Selection2 {
    my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select) = @_;
    my $counter;
    my $term;
    my @tab_termlex;
    my $i;
    my $word;
    my $sent_id;
    my $word_found = 0;

    warn "Selecting the terms potentialy appearing in the corpus\n";

    my %tabh_numtrm_select;
  
    for($counter  = 0;$counter < scalar @$ref_termlist;$counter++) {
	$term = lc $ref_termlist->[$counter]->[0];
	@tab_termlex = split /[ \-]+/, $term;
	$word_found = 0;
	$i=0; 
	do {
	    $word = $tab_termlex[$i];
	    if (($word ne "") && ((length($word) > 2) || (scalar(@tab_termlex)==1)) && 
		((exists $ref_corpus_index->{$word}) || (exists $ref_corpus_index->{$word . "s"}))) {
		$word_found = 1;
		if (!exists $ref_tabh_idtrm_select->{$counter}) {
		    my %tabhtmp2;
		    $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
		}
		foreach $sent_id (@{$ref_corpus_index->{$word}}) {
		    ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
		}
	    }
	    $i++;
	} while((!$word_found) && ($i < scalar @tab_termlex));
    }

    warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
}

sub term_Selection {
    my ($ref_corpus_index, $ref_termlist, $ref_tabh_idtrm_select) = @_;
    my $counter;
    my $term;
    my @tab_termlex;
    my $i;
    my $word;
    my $sent_id;
    my $word_found = 0;

    my @recordedWords;

    warn "Selecting the terms potentialy appearing in the corpus\n";

    my %tabh_numtrm_select;
    
    for($counter  = 0;$counter < scalar @$ref_termlist;$counter++) {
	$term = lc $ref_termlist->[$counter]->[0];
	@tab_termlex = split /[ \-:]+/, $term;
	$word_found = 0;
	$i=0; 
	@recordedWords = ();
	$word = $tab_termlex[$i];
	while(($i < scalar(@tab_termlex)) && (($word eq "") || 
					      ((exists $ref_corpus_index->{$word}) # || 
					      ))) {
	    if ($word ne "") {
		push @recordedWords, $word;
	    }
	    $i++;
	}
	if ($i == scalar(@tab_termlex)) {
	    foreach $word (@recordedWords) {
		if (!exists $ref_tabh_idtrm_select->{$counter}) {
		    my %tabhtmp2;
		    $ref_tabh_idtrm_select->{$counter} = \%tabhtmp2;
		}
		foreach $sent_id (@{$ref_corpus_index->{$word}}) {
		    ${$ref_tabh_idtrm_select->{$counter}}{$sent_id} = 1;
		}
	    }
	}

    }

    warn "\nEnd of selecting the terms potentialy appearing in the corpus\n";
}

sub term_tagging_offset {
    my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $offset_tagged_corpus_name) = @_;
    my $counter;
    my $term_regex;
    my $sent_id;
    my $line;

    warn "Term tagging\n";

    open TAGGEDCORPUS, ">>$offset_tagged_corpus_name" or die "$0: $offset_tagged_corpus_name: No such file\n";

    binmode(TAGGEDCORPUS, ":utf8");

    foreach $counter (keys %$ref_tabh_idtrm_select) {
	$term_regex = $ref_regex_termlist->[$counter];
	foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
	    $line = $ref_tabh_corpus->{$sent_id};
	    print STDERR ".";
	    
	    if ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+]($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i) {
		printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
	    }
	    if ($line =~ /^($term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/i) {
		printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
	    }
	    if ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]($term_regex)$/i) {
		printMatchingTerm(\*TAGGEDCORPUS, $ref_termlist->[$counter], $sent_id);
	    }
	}
	print STDERR "\n";
    }

close TAGGEDCORPUS;

#########################################################################################################
    warn "\nEnd of term tagging\n";
}

sub printMatchingTerm() {
    my ($descriptor, $ref_matching_term, $sent_id) = @_;

    print $descriptor "$sent_id\t";
    print $descriptor join("\t", @$ref_matching_term);
    print $descriptor "\n";

}


sub term_tagging_offset_tab {
    my ($ref_termlist, $ref_regex_termlist, $ref_tabh_idtrm_select, $ref_tabh_corpus, $ref_tab_results) = @_;
    my $counter;
    my $term_regex;
    my $sent_id;
    my $line;
    my $i;
    my $size_termselect = scalar(keys %$ref_tabh_idtrm_select);

    $i = 0;
    
    foreach $counter (keys %$ref_tabh_idtrm_select) {
  	printf STDERR "Term tagging... %0.1f%%\r", ($i/$size_termselect)*100 ;
	$term_regex = $ref_regex_termlist->[$counter];

	foreach $sent_id (keys %{$ref_tabh_idtrm_select->{$counter}}){
	    $line = $ref_tabh_corpus->{$sent_id};
	    if ($line =~ /[,.?!:;\/ \n\-\/\*'\#\{\}\(\)\[\]\+](?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is) {
 		printMatchingTerm_tab($ref_termlist->[$counter], $+{term},  $sent_id, $ref_tab_results);
	    }
 	    if ($line =~ /^(?<term>$term_regex)[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+]/is) {
		printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
	    }
	    if ($line =~ /[,.?!:;\/ \n\-\/\*'\#\(\)\[\]\{\}\+](?<term>$term_regex)$/is) {
		printMatchingTerm_tab($ref_termlist->[$counter], $+{term}, $sent_id, $ref_tab_results);
	    }
	}
	$i++;
    }
    print STDERR "\n";

#########################################################################################################
    warn "\nEnd of term tagging\n";
}

sub printMatchingTerm_tab() {
    my ($ref_matching_term, $term, $sent_id, $ref_tab_results) = @_;

    my $tmp_line = "";
    my $tmp_key;

    if (ref($ref_tab_results) eq "ARRAY") {
	$tmp_line .= "$sent_id\t";
 	$tmp_line .= join ("\t", @$ref_matching_term);
	push @$ref_tab_results, $tmp_line;
    } else {
	if (ref($ref_tab_results) eq "HASH") {
	    my @tab_tmp;
 	    $term =~ s/\\([\-\+\(\)\{\}])/$1/og;
	    $tmp_key .= $sent_id . "_";
	    $tmp_key .= $term;

	    push @tab_tmp, $sent_id;
	    push @tab_tmp, @$ref_matching_term;

	    $ref_tab_results->{$tmp_key} = \@tab_tmp;
	}
    }
}

1;

__END__

=head1 NAME

Alvis::TermTagger - Perl extension for tagging terms in a text

=head1 SYNOPSIS

use Alvis::TermTagger;

Alvis::TermTagger::termtagging($text, $termlist, $outputfile);

or 

Alvis::TermTagger::termtagging($text, $termlist, $outputfile, $lemmatised_text);



=head1 DESCRIPTION

This module is used to tag a text with terms (either with inflected or
lemmatised form of their words). The text or the text corpus
(C<$text>) is a file with one sentence per line. Term list
(C<$termlist>) is a file containing one term per line. For each term,
additionnal information (as canonical form, a semantic tag and the
lemmatised word of the term) can be given after the first column. This
information can be separated by either a column, either by a vertical
bar. Each line of the output file (C<$outputfile>) contains the
sentence number, the term, additional information, all separated by a
tabulation character. The lemmatised text (C<$lemmatised_text>) is
build as the concatenation of the lemma of the word of the corpus;

This module is mainly used in the Alvis NLP Platform.

=head1 METHODS



=head2 termtagging()

    termtagging($corpus_filename, $term_list_filename, $output_filename, $lemmatised_corpus_filename);

This is the main method of module. It loads the term list
(C<$term_list_filename>) and tags the text corpus
(C<$corpus_filename>). It produces the list of matching terms and the
sentence offset (and additional information given in the input file)
where the terms can be found. The file C<$output_filename> contains
this output.  To look up the lemmatised term (as a concatenation of
lemmatised word), the lemmatised corpus C<$lemmatised_corpus_filename>
has to be specified as fourth argument of the method.

=head2 load_TermList()

    load_TermList($term_list_filename,\@term_list);

This method loads the term list (C<$term_list_filename> is the file
name) in the array given by reference (C<\@term_list>). Each element
of term list contains a reference to a two element array (the term and
its canonical form).


=head2 get_Regex_TermList()

    get_Regex_TermList(\@term_list, \@regex_term_list, \@ref_regex_lemmaWordtermlist);

This method generates the regular expression from the term list
(C<\@term_list>). stored in the specific array
(C<\@regex_term_list>). C<\@ref_regex_lemmaWordtermlist> records the
regular expression for the term lemma.


=head2 load_Corpus()

    load_Corpus($corpus_filename\%corpus, \%lc_corpus);

This method loads the corpus (C<$corpus_filename>) in hashtable
(C<\%corpus>) and prepares the corpus in lower case (recorded in a
specific hashtable, C<\%lc_corpus>)



=head2 corpus_Indexing()

    corpus_Indexing(\%lc_corpus, \%corpus_index);

This method indexes the lower case version of the corpus
(C<\%lc_corpus>) according the words C<\%corpus_index> (the index is a
hashtable given by reference).

=head2 print_corpus_index()

    print_corpus_index(\%corpus_index);

This method prints on STDERR the corpus index C<\%corpus_index>.

=head2 term_Selection()

    term_Selection(\%corpus_index, \@term_list, \%idtrm_select);

This method selects the terms from the term list (C<\@term_list>)
potentially appearing in the corpus (that is the indexed corpus,
C<\%corpus_index>). Results are recorded in the hash table
C<\%idtrm_select>.


=head2 term_tagging_offset()

    term_tagging_offset(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, $output_filename);

This method tags the corpus C<\%corpus> with the terms (issued from
the term list C<\@term_list>, C<\@regex_term_list> is the term list
with regular expression), and selected in a previous step
(C<\%idtrm_select>). Resulting selected terms are recorded with their
offset, and additional information in the file C<$output_filename>.


=head2 term_tagging_offset_tab()

    term_tagging_offset_tab(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, \@tab_results);

or 

    term_tagging_offset_tab(\@term_list, \@regex_term_list, \%idtrm_select, \%corpus, \%tabh_results);

This method tags the corpus C<\%corpus> with the terms (issued from
the term list C<\@term_list>, C<\@regex_term_list> is the term list
with regular expression), and selected in a previous step
(C<\%idtrm_select>). Resulting selected terms are recorded with their
offset, and additional information in the array C<@tab_results>
(values are sentence id, selected terms and additional information
separated by tabulation) or in the hashtable C<%tabh_results> (keys
form is "sentenceid_selectedterm", values are an array reference
containing sentence id, selected terms and additional ifnormation).

=head2 printMatchingTerm

    printMatchingTerm($descriptor, $ref_matching_term, $sentence_id);

This method prints into the file descriptor C<$descriptor>, the
sentence id (C<$sentence_id>) and the matching term (named by its
reference C<$ref_matching_term>). Both data are on a line and are
separated by a tabulation character.

=head2 printMatchingTerm_tab

    printMatchingTerm_tab($ref_matching_term, $sentence_id, $ref_tab_results);

This method stores into C<$ref_tab_results>, the sentence id
(C<$sentence_id>) and the matching term (named by its reference
C<$ref_matching_term>). C<$ref_tab_results> can be a array or a hash
table. In case of an array, both data are concatanated in a line and
are separated by a tabulation character. In case of a hash table, both
data are stored in an array, hash key is the concatenation of the
sentence id and the matching term.


=head1 SEE ALSO

Alvis web site: http://www.alvis.info

=head1 AUTHORS

Thierry Hamon <thierry.hamon@univ-paris13.fr>

=head1 LICENSE

Copyright (C) 2006 by Thierry Hamon

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)