{
    #----------------------------------------------------------------------
    # Module configuration for the co-occurrence frequency counter.
    #
    # The whole file is one anonymous hash with these top-level sections:
    #   module      - program to run and its display name
    #   description - free-text description of the module
    #   input       - named input streams
    #   output      - named output streams
    #   parameter   - token and runtime parameters
    #   arguments   - short argument names mapped onto configuration paths
    #   widgets     - (empty here)
    #----------------------------------------------------------------------
    'module' => {
        'program'  => 'uplug-coocfreq-slow',
        # Single-quoted on purpose: the literal text '$UplugBin' is stored,
        # Perl does not interpolate it here.
        # NOTE(review): presumably expanded later by the configuration
        # loader - confirm.
        'location' => '$UplugBin',
        'name'     => 'co-occurrence frequency counter',
        # 'stdin' => 'bitext',
    },

    # The line break inside the string is part of the value.
    'description' => 'This module counts co-occurrence frequencies of
words and phrases.',

    #----------------------------------------------------------------------
    # input streams
    #----------------------------------------------------------------------
    'input' => {
        'bitext' => {
            'stream name' => 'runtime xml',
        },
    },

    #----------------------------------------------------------------------
    # output streams: one for pair frequencies, one per language for
    # single-side frequencies
    #----------------------------------------------------------------------
    'output' => {
        'cooc freq' => {
            'stream name' => 'cooc freq',
        },
        'source freq' => {
            'stream name' => 'source freq',
        },
        'target freq' => {
            'stream name' => 'target freq',
        },
    },

    'parameter' => {
        'token' => {
            #--------------------------------------------------------------
            # Token pair features
            #
            # Contextual features used for counting can be defined per
            # language, for example:
            #
            # 'features (source)' => {          # source language features:
            #     'left:pos' => '^(..).*$$1',   # first 2 characters of the
            #                                   # POS of the left neighbor
            #     '#text' => '(.{4})$$1',       # last 4 characters of the
            #                                   # word itself
            #     'c.*:right:type' => undef,    # type attribute of a right
            #                                   # neighbor of a parent tag
            #                                   # that starts with 'c'
            # },
            # 'features (target)' => {          # target language features:
            #     'pos' => undef,               # POS attribute of the
            #                                   # current token
            # },
            #--------------------------------------------------------------
            #--------------------------------------------------------------
            # Other token parameters
            #
            #   chunks (...):       use marked chunks; the value is an
            #                       XML tag pattern
            #   minimal frequency:  threshold for token pair frequencies
            #
            # Further optional settings (disabled in this file):
            #
            # 'minimal length diff' => 0.1,     # string length difference ratio
            # 'matching word class' => 'same',  # don't mix content and stop words
            # 'minimal length (source)' => 2,
            # 'minimal length (target)' => 2,
            # 'use attribute (source)' => 'stem',     # use the 'stem' attribute
            # 'use attribute (target)' => 'stem',     # for all tokens
            # 'grep token (source)' => 'alphabetic',  # restrict tokens to
            # 'grep token (target)' => 'alphabetic',  # alphabetic ones only
            # 'exclude stop words (source)' => 0,     # don't count stop words
            # 'exclude stop words (target)' => 0,
            # 'language (source)' => 'english',       # use language-specific
            # 'language (target)' => 'swedish',       # information (inilang.ini)
            #--------------------------------------------------------------
            'chunks (source)'               => 'c.*',   # use marked chunks
            'chunks (target)'               => 'c.*',   # use marked chunks
            'minimal frequency'             => 2,
            'minimal frequency (source)'    => 2,
            'minimal frequency (target)'    => 2,
            'maximal ngram length (source)' => 1,       # >1 --> use N-grams
            'maximal ngram length (target)' => 1,       # >1 --> use N-grams
            'lower case (source)'           => 0,       # =1 --> lower case
            'lower case (target)'           => 0,       # =1 --> lower case
            'token label'                   => 'w',     # XML tag for (single) tokens
            'remove linked'                 => 1,       # =1 --> don't count aligned data!
        },
        'runtime' => {
            #--------------------------------------------------------------
            # Runtime parameters
            #--------------------------------------------------------------
            'print progress' => 1,          # verbose output
            'buffer'         => 2000000,    # number of token pairs buffered in a hash
            'source buffer'  => 2000000,    # source token buffer
            'target buffer'  => 2000000,    # target token buffer
            #--------------------------------------------------------------
            # clean buffer:
            # if set to 1, low-frequency pairs are removed from the buffer
            # when the buffer overflows
            #--------------------------------------------------------------
            'clean buffer'   => 1,
        },
    },

    #----------------------------------------------------------------------
    # Argument shortcuts: each short name maps to a colon-separated path
    # (section:subsection:key) into this configuration.
    #
    # Several targets have no default value in this file:
    #   - 'max' points at 'max nr of segments', which is not set under
    #     'runtime' above;
    #   - 'srclang', 'trglang', 'sa' and 'ta' point at token parameters that
    #     appear above only as commented-out examples;
    #   - 'src', 'trg' and 'cooc' point at a 'file' key that the output
    #     streams above do not define.
    #----------------------------------------------------------------------
    'arguments' => {
        'shortcuts' => {
            'src'     => 'output:source freq:file',
            'trg'     => 'output:target freq:file',
            'cooc'    => 'output:cooc freq:file',
            'freq'    => 'parameter:token:minimal frequency',
            'srclang' => 'parameter:token:language (source)',
            'trglang' => 'parameter:token:language (target)',
            'max'     => 'parameter:runtime:max nr of segments',
            'buf'     => 'parameter:runtime:buffer',
            'clean'   => 'parameter:runtime:clean buffer',
            'sa'      => 'parameter:token:use attribute (source)',
            'ta'      => 'parameter:token:use attribute (target)',
            'w'       => 'parameter:token:token label',
        },
    },

    'widgets' => {
    },
}