<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html >
<head><title>APPENDIX</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="generator" content="TeX4ht (http://www.cse.ohio-state.edu/~gurari/TeX4ht/)">
<meta name="originator" content="TeX4ht (http://www.cse.ohio-state.edu/~gurari/TeX4ht/)">
<!-- html,2 -->
<meta name="src" content="DocMain.tex">
<meta name="date" content="2009-06-16 09:20:00">
<link rel="stylesheet" type="text/css" href="DocMain.css">
</head><body
>
<!--l. 40--><div class="crosslinks"><p class="noindent">[<a
href="#tailDocMainse11.html">tail</a>] [<a
href="# " >up</a>] </p></div>
<h3 class="sectionHead"><span class="titlemark">A </span> <a
id="x45-193000A"></a>APPENDIX</h3>
<!--l. 1--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">A.1 </span> <a
id="x45-194000A.1"></a>Simple installation test</h4>
<!--l. 4--><p class="noindent" >The following simple script is available in the <span
class="ectt-1095">doc/InstallationTest.pl </span>file. It must be run as
’root’ and tests that the basic functions of the Combine installation work.
<!--l. 8--><p class="indent" > Basically, it creates and initializes a new jobname, crawls one specific test page and
exports it as XML. This XML is then compared to a correct XML-record for that
page.
<!--l. 12--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.1.1 </span> <a
id="x45-195000A.1.1"></a>InstallationTest.pl</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
use strict;
 <br />if ( $> != 0 ) {
 <br />    die("You have to run this test as root");
 <br />}
 <br />
 <br />my $orec=’’;
 <br />while (&lt;DATA&gt;) { chop; $orec .= $_; }
 <br />
 <br />$orec =~ s|&lt;checkedDate&gt;.*&lt;/checkedDate&gt;||;
 <br />$orec =~ tr/\n\t //d;
 <br />
 <br />my $olen=length($orec);
 <br />my $onodes=0;
 <br />while ( $orec =~ m/&lt;/g ) { $onodes++; }
 <br />print "ORIG Nodes=$onodes; Len=$olen\n";
 <br />
 <br />our $jobname;
 <br />require ’./t/defs.pm’;
 <br />
 <br />system("combineINIT --jobname $jobname --topic /etc/combine/Topic_carnivor.txt > /dev/null");
 <br />
 <br />system("combine --jobname $jobname --harvest http://combine.it.lth.se/CombineTests/InstallationTest.html");
 <br />open(REC,"combineExport --jobname $jobname |");
 <br />my $rec=’’;
 <br />while (&lt;REC&gt;) { chop; $rec .= $_; }
 <br />close(REC);
 <br />$rec =~ s|&lt;checkedDate&gt;.*&lt;/checkedDate&gt;||;
 <br />$rec =~ tr/\n\t //d;
 <br />
 <br />my $len=length($rec);
 <br />my $nodes=0;
 <br />while ( $rec =~ m/&lt;/g ) { $nodes++; }
 <br />print "NEW Nodes=$nodes; Len=$len\n";
 <br />
 <br />my $OK=0;
 <br />
 <br />if ($onodes == $nodes) { print "Number of XML nodes match\n"; }
 <br />else { print "Number of XML nodes does NOT match\n"; $OK=1; }
 <br />if ($olen == $len) {
 <br />  print "Size of XML match\n";
 <br />} else {
 <br />  $orec =~  s|&lt;originalDocument.*&lt;/originalDocument&gt;||s;
 <br />  $rec =~  s|&lt;originalDocument.*&lt;/originalDocument&gt;||s;
 <br />  if (length($orec) == length($rec)) { print "Size of XML match (after removal of ’originalDocument’)\n";}
 <br />  else { print "Size of XML does NOT match\n"; $OK=1; }
 <br />}
 <br />
 <br />if (($OK == 0) && ($orec eq $rec)) { print "All tests OK\n"; }
 <br />else { print "There might be some problem with your Combine Installation\n"; }
 <br />
 <br />__END__
 <br />&lt;?xml version="1.0" encoding="UTF-8"?&gt;
 <br />&lt;documentCollection version="1.1" xmlns="http://alvis.info/enriched/"&gt;
 <br />&lt;documentRecord id="80AC707F96BC57DFEF78C815F6FABD57"&gt;
 <br />&lt;acquisition&gt;
 <br />&lt;acquisitionData&gt;
 <br />&lt;modifiedDate&gt;2006-12-05 13:20:25&lt;/modifiedDate&gt;
 <br />&lt;checkedDate&gt;2006-10-03 9:06:42&lt;/checkedDate&gt;
 <br />&lt;httpServer&gt;Apache/1.3.29 (Debian GNU/Linux) PHP/4.3.3&lt;/httpServer&gt;
 <br />&lt;urls&gt;
 <br />    &lt;url&gt;http://combine.it.lth.se/CombineTests/InstallationTest.html&lt;/url&gt;
 <br />  &lt;/urls&gt;
 <br />&lt;/acquisitionData&gt;
 <br />&lt;originalDocument mimeType="text/html" compression="gzip" encoding="base64" charSet="UTF-8"&gt;
 <br />H4sIAAAAAAAAA4WQsU7DMBCG9zzF4bmpBV2QcDKQVKJSKR2CEKObXBSrjm3sSyFvT0yCQGJgusG/
 <br />//u+E1flU1G9HrfwUD3u4fh8v98VwFLOXzYF52VVzg+b9Q3n2wPLE9FRr+NA2UyDFGnMdyaQ1FqS
 <br />sgYIA0FrPRS2PymDgs+hRPRIEozsMWNnHN+tbwKD2hpCQxkrpDfqYr0dAjgtDYUVlN4G9HIFB3RT
 <br />qMPAvns6Ipfi26Au09e5I61Gh78aCT+IR947qDvpA1I2UJvexg6+CJxsM0ad6/8kpkQiXB5XSWUC
 <br />BNsj/GGG4LBWrarhSw+0OiOIidZjmzGPeh15WL6ICS7zFUjT/AiuBXeRbwHj870/AeRYaTupAQAA
 <br />&lt;/originalDocument&gt;
 <br />&lt;canonicalDocument&gt;
 <br />  &lt;section&gt;
 <br />    &lt;section title="Installation test for Combine"&gt;
 <br />      &lt;section&gt;Installation test for Combine&lt;/section&gt;
 <br />      &lt;section&gt;Contains some Carnivorous plant specific words like &lt;ulink url="rel.html"&gt;Drosera &lt;/ulink&gt;, and Nepenthes.&lt;/section&gt;&lt;/section&gt;&lt;/section&gt;&lt;/canonicalDocument&gt;
 <br />&lt;metaData&gt;
 <br />    &lt;meta name="title"&gt;Installation test for Combine&lt;/meta&gt;
 <br />    &lt;meta name="dc:format"&gt;text/html&lt;/meta&gt;
 <br />    &lt;meta name="dc:format"&gt;text/html; charset=iso-8859-1&lt;/meta&gt;
 <br />    &lt;meta name="dc:subject"&gt;Carnivorous plants&lt;/meta&gt;
 <br />    &lt;meta name="dc:subject"&gt;Drosera&lt;/meta&gt;
 <br />    &lt;meta name="dc:subject"&gt;Nepenthes&lt;/meta&gt;
 <br />  &lt;/metaData&gt;
 <br />&lt;links&gt;
 <br />    &lt;outlinks&gt;
 <br />      &lt;link type="a"&gt;
 <br />        &lt;anchorText&gt;Drosera&lt;/anchorText&gt;
 <br />        &lt;location&gt;http://combine.it.lth.se/CombineTests/rel.html&lt;/location&gt;
 <br />      &lt;/link&gt;
 <br />    &lt;/outlinks&gt;
 <br />  &lt;/links&gt;
 <br />&lt;analysis&gt;
 <br />&lt;property name="topLevelDomain"&gt;se&lt;/property&gt;
 <br />&lt;property name="univ"&gt;1&lt;/property&gt;
 <br />&lt;property name="language"&gt;en&lt;/property&gt;
 <br />&lt;topic absoluteScore="1000" relativeScore="110526"&gt;
 <br />    &lt;class&gt;ALL&lt;/class&gt;
 <br />  &lt;/topic&gt;
 <br />&lt;topic absoluteScore="375" relativeScore="41447"&gt;
 <br />    &lt;class&gt;CP.Drosera&lt;/class&gt;
 <br />    &lt;terms&gt;drosera&lt;/terms&gt;
 <br />  &lt;/topic&gt;
 <br />&lt;topic absoluteScore="375" relativeScore="41447"&gt;
 <br />    &lt;class&gt;CP.Nepenthes&lt;/class&gt;
 <br />    &lt;terms&gt;nepenthe&lt;/terms&gt;
 <br />  &lt;/topic&gt;
 <br />&lt;topic absoluteScore="250" relativeScore="27632"&gt;
 <br />    &lt;class&gt;CP&lt;/class&gt;
 <br />    &lt;terms&gt;carnivorous plant&lt;/terms&gt;
 <br />    &lt;terms&gt;carnivor&lt;/terms&gt;
 <br />  &lt;/topic&gt;
 <br />&lt;/analysis&gt;
 <br />&lt;/acquisition&gt;
 <br />&lt;/documentRecord&gt;
 <br />
 <br />&lt;/documentCollection&gt;
</div>
</td></tr></table>
<!--l. 130--><p class="nopar" >
<h4 class="subsectionHead"><span class="titlemark">A.2 </span> <a
id="x45-196000A.2"></a>Example topic filter plug in</h4>
<!--l. 4--><p class="noindent" >This example gives more details on how to write a topic filter Plug-In.
<!--l. 7--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.2.1 </span> <a
id="x45-197000A.2.1"></a>classifyPlugInTemplate.pm</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
#Template for writing a classify PlugIn for Combine
 <br />#See documentation at http://combine.it.lth.se/documentation/
 <br />
 <br />package classifyPlugInTemplate; #Change to your own module name
 <br />
 <br />use Combine::XWI; #Mandatory
 <br />use Combine::Config; #Optional if you want to use the Combine configuration system
 <br />
 <br />#API:
 <br />#  a subroutine named ’classify’ taking a XWI-object as in parameter
 <br />#    return values: 0/1
 <br />#        0: record fails to meet the classification criteria, ie ignore this record
 <br />#        1: record is OK and should be stored in the database, and links followed by the crawler
 <br />sub classify {
 <br />  my ($self,$xwi) = @_;
 <br />
 <br />  #utility routines to extract information from the XWI-object
 <br />  #URL (can be several):
 <br />   # $xwi->url_rewind;
 <br />   # my $url_str="";
 <br />   # my $t;
 <br />   # while ($t = $xwi->url_get) { $url_str .= $t . ", "; }
 <br />
 <br />  #Metadata:
 <br />   #  $xwi->meta_rewind;
 <br />   #  my ($name,$content);
 <br />   #  while (1) {
 <br />   #    ($name,$content) = $xwi->meta_get;
 <br />   #    last unless $name;
 <br />   #    next if ($name eq ’Rsummary’);
 <br />   #    next if ($name =~ /^autoclass/);
 <br />   #    $meta .= $content . " ";
 <br />   #  }
 <br />
 <br />  #Title:
 <br />   #  $title = $xwi->title;
 <br />
 <br />  #Headings:
 <br />   #  $xwi->heading_rewind;
 <br />   #  my $this;
 <br />   #  while (1) {
 <br />   #    $this = $xwi->heading_get or last;
 <br />   #    $head .= $this . " ";
 <br />   #  }
 <br />
 <br />  #Text:
 <br />   #  $this = $xwi->text;
 <br />   #  if ($this) {
 <br />   #    $text = $$this;
 <br />   #  }
 <br />
 <br />###############################
 <br />#Apply your classification algorithm here
 <br />#  assign $result a value (0/1)
 <br />###############################
 <br />
 <br />  #utility routines for saving detailed results (optional) in the database. These data may appear
 <br />  # in exported XML-records
 <br />
 <br />  #Topic takes 5 parameters
 <br />  # $xwi->topic_add(topic_class_notation, topic_absolute_score, topic_normalized_score, topic_terms, algorithm_id);
 <br />  #  topic_class_notation, topic_terms, and algorithm_id are strings
 <br />  #    max length topic_class_notation: 50, algorithm_id: 25
 <br />  #  topic_absolute_score, and topic_normalized_score are integers
 <br />  #  topic_normalized_score and topic_terms are optional and may be replaced with 0, ’’ respectively
 <br />
 <br />  #Analysis takes 2 parameters
 <br />  # $xwi->robot_add(name,value);
 <br />  # both are strings with max length name: 15, value: 20
 <br />
 <br />    # return true (1) if you want to keep the record
 <br />    # otherwise return false (0)
 <br />
 <br />  return $result;
 <br />}
 <br />
 <br />1;
</div>
</td></tr></table>
<!--l. 86--><p class="nopar" >
<h4 class="subsectionHead"><span class="titlemark">A.3 </span> <a
id="x45-198000A.3"></a>Default configuration files</h4>
<!--l. 4--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.3.1 </span> <a
id="x45-199000A.3.1"></a>Global</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
#@#Default configuration values Combine system
 <br />
 <br />#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)
 <br />#@#ZebraHost = NoDefaultValue
 <br />ZebraHost =
 <br />
 <br />#Direct connection to Solr indexing
 <br />#@#SolrHost = NoDefaultValue
 <br />SolrHost =
 <br />
 <br />#Enable(1)/disable(0) fulltext-index in MySQL table search
 <br />MySQLfulltext = 0
 <br />
 <br />#Use a proxy server if this is defined (default no proxy)
 <br />#@#httpProxy = NoDefaultValue
 <br />httpProxy =
 <br />
 <br />#Enable(1)/disable(0) automatic recycling of new links
 <br />AutoRecycleLinks = 1
 <br />
 <br />#User agent handles redirects (1) or treat redirects as new links (0)
 <br />UserAgentFollowRedirects = 0
 <br />
 <br />#Number of pages to process before restarting the harvester
 <br />HarvesterMaxMissions = 500
 <br />
 <br />#Logging level (0 (least) - 10 (most))
 <br />Loglev = 0
 <br />
 <br />#Enable(1)/disable(0) analysis of genre, language
 <br />doAnalyse = 1
 <br />analysePlugin =
 <br />relTextPlugin =
 <br />
 <br />#How long the summary should be. Use 0 to disable the summarization code
 <br />SummaryLength   = 0
 <br />
 <br />#Store(1)/do not store(0) the raw HTML in the database
 <br />saveHTML = 1
 <br />
 <br />#Use(1)/do not use(0) Tidy to clean the HTML before parsing it
 <br />useTidy = 0
 <br />
 <br />#Use(1)/do not use(0) OAI record status keeping in SQL database
 <br />doOAI = 1
 <br />
 <br />#Extract(1)/do not extract(0) links from plain text
 <br />extractLinksFromText = 1
 <br />
 <br />#Enable(1)/disable(0) topic classification (focused crawling)
 <br />#Generated by combineINIT based on --topic parameter
 <br />doCheckRecord = 0
 <br />
 <br />#Which topic classification PlugIn module algorithm to use
 <br />#Combine::Check_record and Combine::PosCheck_record included by default
 <br />#NEW SVM classifier: Combine::classifySVM
 <br />#see classifyPlugInTemplate.pm and documentation to write your own
 <br />classifyPlugIn = Combine::Check_record
 <br />
 <br />#Filename for the SVM model
 <br />#@#SVMmodel = NoDefaultValue
 <br />SVMmodel =
 <br />
 <br />###Parameters for Std topic classification algorithm
 <br />###StdTitleWeight = 10 #
 <br />###StdMetaWeight = 4 #
 <br />###StdHeadingsWeight = 2 #
 <br />###StdCutoffRel = 10 #Class score must be above this % to be counted
 <br />###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score
 <br />###StdCutoffTot = 90 #non normalised cutoff for summed total score
 <br />
 <br />###Parameters for Pos topic classification algorithm
 <br />###PosCutoffRel = 1 #Class score must be above this % to be counted
 <br />###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score
 <br />###PosCutoffTot = 1 #non normalised cutoff for summed total score
 <br />
 <br />HarvestRetries                  = 5
 <br />SdqRetries                      = 5
 <br />
 <br />#Maximum length of a URL; longer will be silently discarded
 <br />maxUrlLength = 250
 <br />
 <br />#Time in seconds to wait for a server to respond
 <br />UAtimeout = 30
 <br />
 <br />#If we have seen this page before use Get-If-Modified (1) or not (0)
 <br />UserAgentGetIfModifiedSince = 1
 <br />
 <br />WaitIntervalExpirationGuaranteed = 315360000
 <br />WaitIntervalHarvesterLockNotFound = 2592000
 <br />WaitIntervalHarvesterLockNotModified = 2592000
 <br />WaitIntervalHarvesterLockRobotRules = 2592000
 <br />WaitIntervalHarvesterLockUnavailable = 86400
 <br />WaitIntervalRrdLockDefault = 86400
 <br />WaitIntervalRrdLockNotFound = 345600
 <br />WaitIntervalRrdLockSuccess = 345600
 <br />
 <br />#Time in seconds after succesfull download before allowing a page to be downloaded again (around 11 days)
 <br />WaitIntervalHarvesterLockSuccess = 1000000
 <br />
 <br />#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready que
 <br />WaitIntervalSchedulerGetJcf = 20
 <br />
 <br />#Minimum time between accesses to the same host. Must be positive
 <br />WaitIntervalHost = 60
 <br />
 <br />#URL scheduling algorithm
 <br />SchedulingAlgorithm = default
 <br />
 <br />#Identifies MySQL database name, user and host
 <br />MySQLdatabase   = NoDefaultValue
 <br />
 <br />#Base directory for configuration files; initialized by Config.pm
 <br />#@#baseConfigDir = /etc/combine
 <br />
 <br />#Directory for job specific configuration files; taken from ’jobname’
 <br />#@#configDir = NoDefaultValue
 <br />
 <br />&lt;binext&gt;
 <br />#Extensions of binary files
 <br />arff
 <br />au
 <br />avi
 <br />class
 <br />exe
 <br />fig
 <br />gif
 <br />gz
 <br />hqx
 <br />ica
 <br />jpeg
 <br />jpg
 <br />mat
 <br />mdb
 <br />mov
 <br />mp3
 <br />mpeg
 <br />mpg
 <br />msi
 <br />pcx
 <br />pdb
 <br />psd
 <br />ram
 <br />rar
 <br />raw
 <br />rmd
 <br />rmx
 <br />sav
 <br />sdd
 <br />shar
 <br />tar
 <br />tga
 <br />tgz
 <br />tif
 <br />tiff
 <br />vo
 <br />wav
 <br />wmv
 <br />wmz
 <br />xbm
 <br />xpm
 <br />z
 <br />zip
 <br />&lt;/binext&gt;
 <br />
 <br />&lt;converters&gt;
 <br />#Configure which converters can be used to produce a XWI object
 <br />#Format:
 <br />#  1 line per entry
 <br />#  each entry consists of 3 ’;’ separated fields
 <br />#
 <br />#Entries are processed in order and the first match is executed
 <br />#  external converters have to be found via PATH and executable to be considered a match
 <br />#  the external converter command should take a filename as parameter and convert that file
 <br />#   the result should be comming on STDOUT
 <br />#
 <br /># mime-type   ;   External converter command ; Internal converter
 <br />
 <br />text/html ; ; GuessHTML
 <br />#Check this
 <br />www/unknown ; ; GuessHTML
 <br />text/plain ; ; GuessText
 <br />text/x-tex ;  tth -g -w1 -r &lt;  ; TeXHTML
 <br />application/x-tex ;  tth -g -w1 -r &lt; ; TeXHTML
 <br />text/x-tex ; untex -a -e -giso ; TeXText
 <br />application/x-tex ; untex -a -e -giso ; TeXText
 <br />text/x-tex ;  ; TeX
 <br />application/x-tex ; ; TeX
 <br />application/pdf ; pdftohtml -i -noframes -nomerge -nodrm -stdout ; HTML
 <br />application/pdf ; pstotext ; Text
 <br />application/postscript ; pstotext ; Text
 <br />application/msword ; antiword -t ; Text
 <br />application/vnd.ms-excel ; xlhtml -fw ; HTML
 <br />application/vnd.ms-powerpoint ; ppthtml ; HTML
 <br />application/rtf ; unrtf --nopict --html ; HTML
 <br />image/gif ; ; Image
 <br />image/jpeg ; ; Image
 <br />image/tiff ; ; Image
 <br />&lt;/converters&gt;
 <br />
 <br />&lt;url&gt;
 <br />  &lt;exclude&gt;
 <br />    #Exclude URLs or hostnames that matches these regular expressions
 <br />    #Malformed hostnames
 <br />    HOST: http:\/\/\.
 <br />    HOST: \@
 <br />  &lt;/exclude&gt;
 <br />&lt;/url&gt;
</div>
</td></tr></table>
<!--l. 215--><p class="nopar" >
<h5 class="subsubsectionHead"><span class="titlemark">A.3.2 </span> <a
id="x45-200000A.3.2"></a>Job specific</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
#Please change
 <br />Operator-Email      = "YourEmailAdress@YourDomain"
 <br />
 <br />#Password not used yet. (Please change)
 <br />Password    = "XxXxyYzZ"
 <br />
 <br />&lt;converters&gt;
 <br />#Configure which converters can be used to produce a XWI object
 <br />#Format:
 <br />#  1 line per entry
 <br />#  each entry consists of 3 ’;’ separated fields
 <br />#
 <br />#Entries are processed in order and the first match is executed
 <br />#  external converters have to be found via PATH and executable to be considered a match
 <br />#  the external converter command should take a filename as parameter and convert that file
 <br />#   the result should be comming on STDOUT
 <br />#
 <br /># mime-type   ;   External converter command ; Internal converter
 <br />
 <br />application/pdf ; MYpdftohtml -i -noframes -nomerge -stdout ; HTML
 <br />&lt;/converters&gt;
 <br />
 <br />&lt;url&gt;
 <br />#List of servernames that are aliases are in the file ./config_serveralias
 <br />#    (automatically updated by other programs)
 <br />#use one server per line
 <br />#example
 <br />#www.100topwetland.com  www.100wetland.com
 <br />#  means that www.100wetland.com is replaced by www.100topwetland.com during URL normalization
 <br />&lt;serveralias&gt;
 <br />&lt;&lt;include config_serveralias&gt;&gt;
 <br />&lt;/serveralias&gt;
 <br />
 <br />#use either URL or HOST: (obs ’:’) to match regular expressions to
 <br /># either the full URL or the HOST part of a URL.
 <br />&lt;allow&gt;
 <br />#Allow crawl of URLs or hostnames that matches these regular expressions
 <br />HOST: .*$
 <br />&lt;/allow&gt;
 <br />
 <br />&lt;exclude&gt;
 <br />#Exclude URLs or hostnames that matches these regular expressions
 <br /># default: CGI and maps
 <br />URL cgi-bin|htbin|cgi|\?|\.map$|_vti_
 <br />
 <br /># default: binary files
 <br />URL \.exe$|\.zip$|\.tar$|\.tgz$|\.gz$|\.hqx$|\.sdd$|\.mat$|\.raw$
 <br />URL \.EXE$|\.ZIP$|\.TAR$|\.TGZ$|\.GZ$|\.HQX$|\.SDD$|\.MAT$|\.RAW$
 <br />
 <br /># default: Unparsable documents
 <br />URL \.shar$|\.rmx$|\.rmd$|\.mdb$|\.sav$
 <br />URL \.SHAR$|\.RMX$|\.RMD$|\.MDB$|\.SAV$
 <br />
 <br /># default: images
 <br />URL \.gif$|\.jpg$|\.jpeg$|\.xpm$|\.tif$|\.tiff$|\.mpg$|\.mpeg$|\.mov$|\.wav$|\.au$|\.pcx$|\.xbm$|\.tga$|\.psd$
 <br />URL \.GIF$|\.JPG$|\.JPEG$|\.XPM$|\.TIF$|\.TIFF$|\.MPG$|\.MPEG$|\.MOV$|\.WAV$|\.AU$|\.PCX$|\.XBM$|\.TGA$|\.PSD$
 <br />
 <br /># default: other binary formats
 <br />URL \.pdb$|\.class$|\.ica$|\.ram$|\.wmz$|\.arff$|\.rar$|\.vo$|\.fig$|\.mp3$|\.wmv$|\.avi$|\.msi$
 <br />URL \.PDB$|\.CLASS$|\.ICA$|\.RAM$|\.WMZ$|\.ARFF$|\.RAR$|\.VO$|\.FIG$|\.MP3$|\.WMV$|\.AVI$|\.MSI$
 <br />
 <br />#more excludes in the file config_exclude (automatically updated by other programs)
 <br />&lt;&lt;include config_exclude&gt;&gt;
 <br />&lt;/exclude&gt;
 <br />&lt;sessionids&gt;
 <br />#patterns to recognize and remove sessionids in URLs
 <br />sessionid
 <br />lsessionid
 <br />jsessionid
 <br />SID
 <br />PHPSESSID
 <br />SessionID
 <br />BV_SessionID
 <br />&lt;/sessionids&gt;
 <br />#url is just a conatiner for all URL related configuration patterns
 <br />&lt;/url&gt;
</div>
</td></tr></table>
<!--l. 295--><p class="nopar" >
<h4 class="subsectionHead"><span class="titlemark">A.4 </span> <a
id="x45-201000A.4"></a>SQL database</h4>
<!--l. 3--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.4.1 </span> <a
id="x45-202000A.4.1"></a>Create database</h5>
<!--l. 4--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">DROP</span><span
class="ectt-1095"> DATABASE</span><span
class="ectt-1095"> IF</span><span
class="ectt-1095"> EXISTS</span><span
class="ectt-1095"> $database;</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">CREATE</span><span
class="ectt-1095"> DATABASE</span><span
class="ectt-1095"> $database</span><span
class="ectt-1095"> DEFAULT</span><span
class="ectt-1095"> CHARACTER</span><span
class="ectt-1095"> SET</span><span
class="ectt-1095"> utf8;</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">USE</span><span
class="ectt-1095"> $database;</span></span></span><br
class="newline" />
<h5 class="subsubsectionHead"><span class="titlemark">A.4.2 </span> <a
id="x45-203000A.4.2"></a>Creating MySQL tables</h5>
<!--l. 8--><p class="noindent" ><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">All</span><span
class="ectt-1095"> tables</span><span
class="ectt-1095"> use</span><span
class="ectt-1095"> UTF-8</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">Summary</span><span
class="ectt-1095"> tables</span><span
class="ectt-1095"> ’^’=primary</span><span
class="ectt-1095"> key,</span><span
class="ectt-1095"> ’*’=key:</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> hdb:</span><span
class="ectt-1095"> recordid^,</span><span
class="ectt-1095"> type,</span><span
class="ectt-1095"> dates,</span><span
class="ectt-1095"> server,</span><span
class="ectt-1095"> title,</span><span
class="ectt-1095"> ip,</span><span
class="ectt-1095"> ...</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> links:</span><span
class="ectt-1095"> recordid*,</span><span
class="ectt-1095"> mynetlocid*,</span><span
class="ectt-1095"> urlid*,</span><span
class="ectt-1095"> netlocid*,</span><span
class="ectt-1095"> linktype,</span><span
class="ectt-1095"> anchor</span><span
class="ectt-1095"> </span><span
class="ectt-1095"> (netlocid</span><span
class="ectt-1095"> for</span><span
class="ectt-1095"> urlid!!)</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> meta:</span><span
class="ectt-1095"> recordid*,</span><span
class="ectt-1095"> </span><span
class="ectt-1095"> name,</span><span
class="ectt-1095"> value</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> html:</span><span
class="ectt-1095"> recordid^,</span><span
class="ectt-1095"> html</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> analys:</span><span
class="ectt-1095"> recordid*,</span><span
class="ectt-1095"> name,</span><span
class="ectt-1095"> value</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> topic:</span><span
class="ectt-1095"> recordid*,</span><span
class="ectt-1095"> notation*,</span><span
class="ectt-1095"> abscore,</span><span
class="ectt-1095"> relscore,</span><span
class="ectt-1095"> terms,</span><span
class="ectt-1095"> algorithm</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> localtags:</span><span
class="ectt-1095"> netlocid,</span><span
class="ectt-1095"> urlid,</span><span
class="ectt-1095"> name,</span><span
class="ectt-1095"> value</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> search:</span><span
class="ectt-1095"> recordid^,</span><span
class="ectt-1095"> stext*</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">(TABLE</span><span
class="ectt-1095"> netlocalias:</span><span
class="ectt-1095"> netlocid*,</span><span
class="ectt-1095"> netlocstr^)</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">(TABLE</span><span
class="ectt-1095"> urlalias:</span><span
class="ectt-1095"> urlid*,</span><span
class="ectt-1095"> urlstr^)</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> topichierarchy:</span><span
class="ectt-1095"> node^,</span><span
class="ectt-1095"> father*,</span><span
class="ectt-1095"> notation*,</span><span
class="ectt-1095"> caption,</span><span
class="ectt-1095"> level</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> netlocs:</span><span
class="ectt-1095"> netlocid^,</span><span
class="ectt-1095"> netlocstr^,</span><span
class="ectt-1095"> retries</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> urls:</span><span
class="ectt-1095"> netlocid*,</span><span
class="ectt-1095"> urlid^,</span><span
class="ectt-1095"> urlstr^,</span><span
class="ectt-1095"> path</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> urldb:</span><span
class="ectt-1095"> netlocid*,</span><span
class="ectt-1095"> urlid^,</span><span
class="ectt-1095"> urllock,</span><span
class="ectt-1095"> harvest*,</span><span
class="ectt-1095"> retries,</span><span
class="ectt-1095"> netloclock</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> newlinks:</span><span
class="ectt-1095"> urlid^,</span><span
class="ectt-1095"> netlocid</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> recordurl:</span><span
class="ectt-1095"> recordid*,</span><span
class="ectt-1095"> urlid^,</span><span
class="ectt-1095"> lastchecked,</span><span
class="ectt-1095"> md5*,</span><span
class="ectt-1095"> fingerprint*^</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> admin:</span><span
class="ectt-1095"> status,</span><span
class="ectt-1095"> queid,</span><span
class="ectt-1095"> schedulealgorithm</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> log:</span><span
class="ectt-1095"> pid,</span><span
class="ectt-1095"> id,</span><span
class="ectt-1095"> date,</span><span
class="ectt-1095"> message</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> que:</span><span
class="ectt-1095"> queid^,</span><span
class="ectt-1095"> urlid,</span><span
class="ectt-1095"> netlocid</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> robotrules:</span><span
class="ectt-1095"> netlocid*,</span><span
class="ectt-1095"> rule,</span><span
class="ectt-1095"> expire</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> oai:</span><span
class="ectt-1095"> recordid,</span><span
class="ectt-1095"> md5^,</span><span
class="ectt-1095"> date*,</span><span
class="ectt-1095"> status</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">TABLE</span><span
class="ectt-1095"> exports:</span><span
class="ectt-1095"> host,</span><span
class="ectt-1095"> port,</span><span
class="ectt-1095"> last</span></span></span><br
class="newline" />
<h5 class="subsubsectionHead"><span class="titlemark">A.4.3 </span> <a
id="x45-204000A.4.3"></a>Data tables</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE hdb (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  type varchar(50) default NULL,
 <br />  title text,
 <br />  mdate timestamp NOT NULL,
 <br />  expiredate datetime default NULL,
 <br />  length int(11) default NULL,
 <br />  server varchar(50) default NULL,
 <br />  etag varchar(25) default NULL,
 <br />  nheadings int(11) default NULL,
 <br />  nlinks int(11) default NULL,
 <br />  headings mediumtext,
 <br />  ip mediumblob,
 <br />  PRIMARY KEY  (recordid)
 <br />) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 51--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE html (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  html mediumblob,
 <br />  PRIMARY KEY  (recordid)
 <br />) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 59--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE links (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  mynetlocid int(11) default NULL,
 <br />  urlid int(11) default NULL,
 <br />  netlocid int(11) default NULL,
 <br />  anchor text,
 <br />  linktype varchar(50) default NULL,
 <br />  KEY recordid (recordid),
 <br />  KEY urlid (urlid),
 <br />  KEY mynetlocid (mynetlocid),
 <br />  KEY netlocid (netlocid)
 <br />) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 74--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE meta (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  name varchar(50) default NULL,
 <br />  value text,
 <br />  KEY recordid (recordid)
 <br />) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 83--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE analys (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  name varchar(100) NOT NULL,
 <br />  value varchar(100),
 <br />  KEY recordid (recordid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 92--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE topic (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  notation varchar(50) default NULL,
 <br />  abscore int(11) default NULL,
 <br />  relscore int(11) default NULL,
 <br />  terms text default NULL,
 <br />  algorithm varchar(25),
 <br />  KEY notation (notation),
 <br />  KEY recordid (recordid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 105--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE localtags (
 <br />  netlocid int(11) NOT NULL DEFAULT ’0’,
 <br />  urlid int(11) NOT NULL DEFAULT ’0’,
 <br />  name varchar(100) NOT NULL,
 <br />  value varchar(100) NOT NULL,
 <br />  PRIMARY KEY tag (netlocid,urlid,name(100),value(100))
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 115--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE search (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  stext mediumtext,
 <br />  PRIMARY KEY (recordid),
 <br />  FULLTEXT (stext)
 <br />) ENGINE=MyISAM AVG_ROW_LENGTH = 20000 MAX_ROWS = 10000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 124--><p class="nopar" >
<!--l. 126--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.4.4 </span> <a
id="x45-205000A.4.4"></a>Administrative tables</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE netlocalias (
 <br />  netlocid int(11),
 <br />  netlocstr varchar(150) NOT NULL,
 <br />  KEY netlocid (netlocid),
 <br />  PRIMARY KEY netlocstr (netlocstr)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 134--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE urlalias (
 <br />  urlid int(11),
 <br />  urlstr tinytext,
 <br />  KEY urlid (urlid),
 <br />  PRIMARY KEY urlstr (urlstr(255))
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 143--><p class="nopar" >
<!--l. 145--><p class="indent" > <span class="obeylines-h"><span class="verb"><span
class="ectt-1095">topichierarchy</span><span
class="ectt-1095"> has</span><span
class="ectt-1095"> to</span><span
class="ectt-1095"> be</span><span
class="ectt-1095"> initialized</span><span
class="ectt-1095"> manually</span></span></span><br
class="newline" />
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE topichierarchy (
 <br />  node int(11) NOT NULL DEFAULT ’0’,
 <br />  father int(11) DEFAULT NULL,
 <br />  notation varchar(50) NOT NULL DEFAULT ’’,
 <br />  caption varchar(255) DEFAULT NULL,
 <br />  level int(11) DEFAULT NULL,
 <br />  PRIMARY KEY node (node),
 <br />  KEY father (father),
 <br />  KEY notation (notation)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 157--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE netlocs (
 <br />  netlocid int(11) NOT NULL auto_increment,
 <br />  netlocstr varchar(150) NOT NULL,
 <br />  retries int(11) NOT NULL DEFAULT 0,
 <br />  PRIMARY KEY (netlocstr),
 <br />  UNIQUE INDEX netlockid (netlocid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 167--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE urls (
 <br />  netlocid int(11) NOT NULL DEFAULT ’0’,
 <br />  urlid int(11) NOT NULL auto_increment,
 <br />  urlstr tinytext,
 <br />  path tinytext,
 <br />  PRIMARY KEY urlstr (urlstr(255)),
 <br />  INDEX netlocid (netlocid),
 <br />  UNIQUE INDEX urlid (urlid)
 <br />) ENGINE=MyISAM MAX_ROWS = 1000000000 DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 179--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE urldb (
 <br />  netlocid int(11) NOT NULL default ’0’,
 <br />  netloclock int(11) NOT NULL default ’0’,
 <br />  urlid int(11) NOT NULL default ’0’,
 <br />  urllock int(11) NOT NULL default ’0’,
 <br />  harvest tinyint(1) NOT NULL default ’0’,
 <br />  retries int(11) NOT NULL default ’0’,
 <br />  score int(11) NOT NULL default ’0’,
 <br />  PRIMARY KEY  (urlid),
 <br />  KEY netlocid (netlocid),
 <br />  KEY harvest (harvest)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 194--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE newlinks (
 <br />  urlid int(11) NOT NULL,
 <br />  netlocid int(11) NOT NULL,
 <br />  PRIMARY KEY  (urlid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 202--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE recordurl (
 <br />  recordid int(11) NOT NULL auto_increment,
 <br />  urlid int(11) NOT NULL default ’0’,
 <br />  lastchecked timestamp NOT NULL,
 <br />  md5 char(32),
 <br />  fingerprint char(50),
 <br />  KEY md5 (md5),
 <br />  KEY fingerprint (fingerprint),
 <br />  PRIMARY KEY (urlid),
 <br />  KEY recordid (recordid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 216--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE admin (
 <br />  status enum(’closed’,’open’,’paused’,’stopped’) default NULL,
 <br />  schedulealgorithm enum(’default’,’bigdefault’,’advanced’) default ’default’,
 <br />  queid int(11) NOT NULL default ’0’
 <br />) ENGINE=MEMORY DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 224--><p class="nopar" >
<!--l. 226--><p class="indent" > <span class="obeylines-h"><span class="verb"><span
class="ectt-1095">advanced</span><span
class="ectt-1095"> means</span><span
class="ectt-1095"> use</span><span
class="ectt-1095"> config</span><span
class="ectt-1095"> variable</span><span
class="ectt-1095"> SchedulingAlgorithm</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">Initialise</span><span
class="ectt-1095"> admin</span><span
class="ectt-1095"> to</span><span
class="ectt-1095"> ’open’</span><span
class="ectt-1095"> status</span></span></span><br
class="newline" /><span class="obeylines-h"><span class="verb"><span
class="ectt-1095">INSERT</span><span
class="ectt-1095"> INTO</span><span
class="ectt-1095"> admin</span><span
class="ectt-1095"> VALUES</span><span
class="ectt-1095"> (’open’,’default’,0)</span></span></span><br
class="newline" />
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE log (
 <br />  pid int(11) NOT NULL default ’0’,
 <br />  id varchar(50) default NULL,
 <br />  date timestamp NOT NULL,
 <br />  message varchar(255) default NULL
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 236--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE que (
 <br />  netlocid int(11) NOT NULL default ’0’,
 <br />  urlid int(11) NOT NULL default ’0’,
 <br />  queid int(11) NOT NULL auto_increment,
 <br />  PRIMARY KEY  (queid)
 <br />) ENGINE=MEMORY DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 245--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE robotrules (
 <br />  netlocid int(11) NOT NULL default ’0’,
 <br />  expire int(11) NOT NULL default ’0’,
 <br />  rule varchar(255) default ’’,
 <br />  KEY netlocid (netlocid)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 254--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE oai (
 <br />  recordid int(11) NOT NULL default ’0’,
 <br />  md5 char(32),
 <br />  date timestamp,
 <br />  status enum(’created’, ’updated’, ’deleted’),
 <br />  PRIMARY KEY (md5),
 <br />  KEY date (date)
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 265--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
CREATE TABLE exports (
 <br />  host varchar(30),
 <br />  port int,
 <br />  last timestamp DEFAULT ’1999-12-31’
 <br />) ENGINE=MyISAM DEFAULT CHARACTER SET=utf8;
</div>
</td></tr></table>
<!--l. 273--><p class="nopar" >
<!--l. 275--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.4.5 </span> <a
id="x45-206000A.4.5"></a>Create user dbuser with required privileges</h5>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
GRANT SELECT,INSERT,UPDATE,DELETE,CREATE,CREATE TEMPORARY TABLES,
 <br />   ALTER,LOCK TABLES ON $database.* TO $dbuser;
</div>
</td></tr></table>
<!--l. 279--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
GRANT SELECT,INSERT,UPDATE,DELETE,CREATE,CREATE TEMPORARY TABLES,
 <br />   ALTER,LOCK TABLES ON $database.* TO $dbuser\@localhost;
</div>
</td></tr></table>
<!--l. 284--><p class="nopar" >
<!--l. 47--><p class="noindent" >
<h4 class="subsectionHead"><span class="titlemark">A.5 </span> <a
id="x45-207000A.5"></a>Manual pages</h4>
<!--l. 1--><p class="noindent" >
<h5 class="subsubsectionHead"><span class="titlemark">A.5.1 </span> <a
id="x45-208000A.5.1"></a>combineExport</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-209000A.5.1"></a><span
class="ecbx-1095">NAME</span></span>
combineExport - export records in XML from Combine database
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-210000A.5.1"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineExport –jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">> </span>[–profile alvis<span
class="cmsy-10x-x-109">|</span>dc<span
class="cmsy-10x-x-109">|</span>combine –charset utf8<span
class="cmsy-10x-x-109">|</span>isolatin –number
<span
class="cmmi-10x-x-109"><</span>n<span
class="cmmi-10x-x-109">> </span>–recordid <span
class="cmmi-10x-x-109"><</span>n<span
class="cmmi-10x-x-109">> </span>–md5 <span
class="cmmi-10x-x-109"><</span>MD5<span
class="cmmi-10x-x-109">> </span>–incremental –xsltscript ...]
<!--l. 11--><p class="noindent" ><span class="paragraphHead"><a
id="x45-211000A.5.1"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 16--><p class="indent" >
<dl class="description"><dt class="description">
<span
class="ecbx-1095">–profile</span> </dt><dd
class="description">
<!--l. 20--><p class="noindent" >Three profiles: alvis, dc, and combine. alvis and combine are similar XML formats.
<!--l. 24--><p class="noindent" >’alvis’ profile format is defined by the Alvis enriched document format DTD. It uses
charset UTF-8 per default.
<!--l. 29--><p class="noindent" >’combine’ is more compact with less redundancy.
<!--l. 33--><p class="noindent" >’dc’ is XML encoded Dublin Core data.
</dd><dt class="description">
<span
class="ecbx-1095">–charset</span> </dt><dd
class="description">
<!--l. 38--><p class="noindent" >Selects a specific character set: UTF-8 or iso-latin-1. Overrides –profile settings.
</dd><dt class="description">
<span
class="ecbx-1095">–collapseinlinks</span> </dt><dd
class="description">
<!--l. 44--><p class="noindent" >Skip inlinks with duplicate anchor-texts (ie just one inlink per unique anchor-text).
</dd><dt class="description">
<span
class="ecbx-1095">–nooutlinks</span> </dt><dd
class="description">
<!--l. 49--><p class="noindent" >Do not include any outlinks in the exported records.
</dd><dt class="description">
<span
class="ecbx-1095">–ZebraIndex</span> </dt><dd
class="description">
<!--l. 54--><p class="noindent" >ZebraIndex sends XML records directly to the Zebra server defined in Combine
configuration variable ’ZebraHost’. It uses the default Zebra configuration:
profile=combine, nooutlinks, collapseinlinks and is compatible with the direct Zebra
indexing done during harvesting when ’ZebraHost’ is defined in the Combine
configuration. Requires that the Zebra server is running.
</dd><dt class="description">
<span
class="ecbx-1095">–SolrIndex</span> </dt><dd
class="description">
<!--l. 64--><p class="noindent" >SolrIndex sends XML records directly to the Solr server defined in Combine
configuration variable ’SolrHost’. It uses the default Solr configuration:
profile=combine, nooutlinks, collapseinlinks and is compatible with the direct
Solr indexing done during harvesting when ’SolrHost’ is defined in the Combine
configuration. Requires that the Solr server is running.
</dd><dt class="description">
<span
class="ecbx-1095">–xsltscript</span> </dt><dd
class="description">
<!--l. 74--><p class="noindent" >Generates records in Combine native format and converts them using this XSLT
script before output. See example scripts in /etc/combine/*.xsl
</dd><dt class="description">
<span
class="ecbx-1095">–number</span> </dt><dd
class="description">
<!--l. 80--><p class="noindent" >the max number of records to be exported
</dd><dt class="description">
<span
class="ecbx-1095">–recordid</span> </dt><dd
class="description">
<!--l. 85--><p class="noindent" >Export just the one record with this recordid
</dd><dt class="description">
<span
class="ecbx-1095">–md5</span> </dt><dd
class="description">
<!--l. 90--><p class="noindent" >Export just the one record with this MD5 checksum
</dd><dt class="description">
<span
class="ecbx-1095">–pipehost, –pipeport</span> </dt><dd
class="description">
<!--l. 95--><p class="noindent" >Specifies the server-name and port to connect to and export data using the Alvis
Pipeline. Exports incrementally, ie all changes since last call to combineExport with
the same pipehost and pipeport.
</dd><dt class="description">
<span
class="ecbx-1095">–incremental</span> </dt><dd
class="description">
<!--l. 102--><p class="noindent" >Exports incrementally, ie all changes since last call to combineExport using
–incremental
</dd></dl>
<!--l. 105--><p class="noindent" ><span class="paragraphHead"><a
id="x45-212000A.5.1"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
<!--l. 106--><p class="noindent" ><span class="paragraphHead"><a
id="x45-213000A.5.1"></a><span
class="ecbx-1095">EXAMPLES</span></span>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Export all records in Alvis XML-format to the file recs.xml
 <br />   combineExport --jobname atest > recs.xml
</div>
</td></tr></table>
<!--l. 110--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Export 10 records to STDOUT
 <br />   combineExport --jobname atest --number 10
</div>
</td></tr></table>
<!--l. 114--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Export all records in UTF-8 using Combine native format
 <br />   combineExport --jobname atest --profile combine --charset utf8 > Zebrarecs.xml
</div>
</td></tr></table>
<!--l. 118--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Incremental export of all changes from last call using localhost at port 6234 using the
 <br /> default profile (Alvis)
 <br />   combineExport --jobname atest --pipehost localhost --pipeport 6234
</div>
</td></tr></table>
<!--l. 123--><p class="nopar" >
<!--l. 124--><p class="noindent" ><span class="paragraphHead"><a
id="x45-214000A.5.1"></a><span
class="ecbx-1095">SEE ALSO</span></span>
Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 131--><p class="indent" > Alvis XML schema (–profile alvis) at <span
class="ecss-1095">http://project.alvis.info/alvis_docs/enriched-document.xsd</span>
<!--l. 134--><p class="noindent" ><span class="paragraphHead"><a
id="x45-215000A.5.1"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 139--><p class="noindent" ><span class="paragraphHead"><a
id="x45-216000A.5.1"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005 - 2006 Anders Ardö
<!--l. 146--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 See the file LICENCE included in the distribution at
 <br /> L&lt;http://combine.it.lth.se/&gt;
</div>
</td></tr></table>
<!--l. 153--><p class="nopar" > __________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.2 </span> <a
id="x45-217000A.5.2"></a>combineCtrl</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-218000A.5.2"></a><span
class="ecbx-1095">NAME</span></span>
combineCtrl - controls a Combine crawling job
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-219000A.5.2"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineCtrl <span
class="cmmi-10x-x-109"><</span>action<span
class="cmmi-10x-x-109">> </span>–jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">></span>
<!--l. 13--><p class="indent" > where action can be one of start, kill, load, recyclelinks, reharvest, stat, howmany, records,
hosts, initMemoryTables, open, stop, pause, continue
<!--l. 16--><p class="noindent" ><span class="paragraphHead"><a
id="x45-220000A.5.2"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 21--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-221000A.5.2"></a><span
class="ecbx-1095">Actions starting/killing crawlers</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">start</span> </dt><dd
class="description">
<!--l. 26--><p class="noindent" >takes an optional switch <span
class="ectt-1095">–harvesters n </span>where <span
class="ectt-1095">n </span>is the number of crawler processes
to start
</dd><dt class="description">
<span
class="ecbx-1095">kill</span> </dt><dd
class="description">
<!--l. 32--><p class="noindent" >kills all active crawlers (and their associated combineRun monitors) for jobname
</dd></dl>
<!--l. 35--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-222000A.5.2"></a><span
class="ecbx-1095">Actions loading or recycling URLs for crawling</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">load</span> </dt><dd
class="description">
<!--l. 40--><p class="noindent" >Read a list of URLs from STDIN (one per line) and schedules them for crawling
</dd><dt class="description">
<span
class="ecbx-1095">recyclelinks</span> </dt><dd
class="description">
<!--l. 45--><p class="noindent" >Schedule all newly found (since last invocation of recyclelinks) links in crawled pages
for crawling
</dd><dt class="description">
<span
class="ecbx-1095">reharvest</span> </dt><dd
class="description">
<!--l. 51--><p class="noindent" >Schedules all pages in the database for crawling again (in order to check if they have
changed)
</dd></dl>
<!--l. 55--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-223000A.5.2"></a><span
class="ecbx-1095">Actions for controlling scheduling of URLs</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">open</span> </dt><dd
class="description">
<!--l. 60--><p class="noindent" >opens database for URL scheduling (maybe after a stop)
</dd><dt class="description">
<span
class="ecbx-1095">stop</span> </dt><dd
class="description">
<!--l. 65--><p class="noindent" >stops URL scheduling
</dd><dt class="description">
<span
class="ecbx-1095">pause</span> </dt><dd
class="description">
<!--l. 70--><p class="noindent" >pauses URL scheduling
</dd><dt class="description">
<span
class="ecbx-1095">continue</span> </dt><dd
class="description">
<!--l. 75--><p class="noindent" >continues URL scheduling after a pause
</dd></dl>
<!--l. 78--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-224000A.5.2"></a><span
class="ecbx-1095">Misc actions</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">stat</span> </dt><dd
class="description">
<!--l. 83--><p class="noindent" >prints out rudimentary status of the ready queue (ie eligible now) of URLs to be
crawled
</dd><dt class="description">
<span
class="ecbx-1095">howmany</span> </dt><dd
class="description">
<!--l. 88--><p class="noindent" >prints out rudimentary status of all URLs to be crawled
</dd><dt class="description">
<span
class="ecbx-1095">records</span> </dt><dd
class="description">
<!--l. 93--><p class="noindent" >prints out the number of records in the SQL database
</dd><dt class="description">
<span
class="ecbx-1095">hosts</span> </dt><dd
class="description">
<!--l. 98--><p class="noindent" >prints out rudimentary status of all hosts that have URLs to be crawled
</dd><dt class="description">
<span
class="ecbx-1095">initMemoryTables</span> </dt><dd
class="description">
<!--l. 103--><p class="noindent" >initializes the administrative MySQL tables that are kept in memory
</dd></dl>
<!--l. 106--><p class="noindent" ><span class="paragraphHead"><a
id="x45-225000A.5.2"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Implements various control functionality to administer a crawling job, like starting and
stopping crawlers, injecting URLs into the crawl queue, scheduling newly found links for crawling,
controlling scheduling, etc.
<!--l. 115--><p class="indent" > This is the preferred way of controlling a crawl job.
<!--l. 117--><p class="noindent" ><span class="paragraphHead"><a
id="x45-226000A.5.2"></a><span
class="ecbx-1095">EXAMPLES</span></span>
<dl class="description"><dt class="description">
<span
class="ectt-1095">echo ’http://www.yourdomain.com/’ </span><span
class="cmsy-10x-x-109">| </span><span
class="ectt-1095">combineCtrl load –jobname aatest</span> </dt><dd
class="description">
<!--l. 122--><p class="noindent" >Seed the crawling job <span
class="ectt-1095">aatest </span>with a URL
</dd><dt class="description">
<span
class="ectt-1095">combineCtrl start –jobname aatest –harvesters 3</span> </dt><dd
class="description">
<!--l. 127--><p class="noindent" >Start 3 crawling processes for job <span
class="ectt-1095">aatest</span>
</dd><dt class="description">
<span
class="ectt-1095">combineCtrl recyclelinks –jobname aatest</span> </dt><dd
class="description">
<!--l. 132--><p class="noindent" >Schedule all new links for crawling
</dd><dt class="description">
<span
class="ectt-1095">combineCtrl stat –jobname aatest</span> </dt><dd
class="description">
<!--l. 137--><p class="noindent" >See how many URLs are eligible for crawling right now.
</dd></dl>
<!--l. 140--><p class="noindent" ><span class="paragraphHead"><a
id="x45-227000A.5.2"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combine
<!--l. 147--><p class="indent" > Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 149--><p class="noindent" ><span class="paragraphHead"><a
id="x45-228000A.5.2"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 154--><p class="noindent" ><span class="paragraphHead"><a
id="x45-229000A.5.2"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005 Anders Ardö
<!--l. 161--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 167--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 6--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.3 </span> <a
id="x45-230000A.5.3"></a>combineRun</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-231000A.5.3"></a><span
class="ecbx-1095">NAME</span></span>
combineRun - starts, monitors and restarts a combine harvesting process
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-232000A.5.3"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineRun <span
class="cmmi-10x-x-109"><</span>pidfile<span
class="cmmi-10x-x-109">> <</span>combine command to run<span
class="cmmi-10x-x-109">></span>
<!--l. 11--><p class="noindent" ><span class="paragraphHead"><a
id="x45-233000A.5.3"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Starts a program and monitors it in order to make sure there is always a copy running. If the
program dies it will be restarted with the same parameters. Used by <span
class="ectt-1095">combineCtrl </span>when starting
combine crawling.
<!--l. 18--><p class="noindent" ><span class="paragraphHead"><a
id="x45-234000A.5.3"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combineCtrl
<!--l. 23--><p class="noindent" ><span class="paragraphHead"><a
id="x45-235000A.5.3"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 28--><p class="noindent" ><span class="paragraphHead"><a
id="x45-236000A.5.3"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005 Anders Ardö
<!--l. 35--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 41--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 9--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.4 </span> <a
id="x45-237000A.5.4"></a>combineReClassify</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-238000A.5.4"></a><span
class="ecbx-1095">NAME</span></span>
combineReClassify - main program that reanalyses records in a combine database
<!--l. 8--><p class="indent" > Algorithm: select relevant records based on the cls parameter; then, for each record: get the
record from the database, delete the analysis info from the record, analyse the record, and, if
still_relevant, save it in the database.
<!--l. 12--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.5 </span> <a
id="x45-239000A.5.5"></a>combineSVM</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-240000A.5.5"></a><span
class="ecbx-1095">NAME</span></span>
combineSVM - generate a SVM model from good and bad examples
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-241000A.5.5"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineSVM –jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">> </span>[–good <span
class="cmmi-10x-x-109"><</span>good-file<span
class="cmmi-10x-x-109">></span>] [–bad <span
class="cmmi-10x-x-109"><</span>bad-file<span
class="cmmi-10x-x-109">></span>] [–train
<span
class="cmmi-10x-x-109"><</span>model-file<span
class="cmmi-10x-x-109">></span>] [–help]
<!--l. 11--><p class="noindent" ><span class="paragraphHead"><a
id="x45-242000A.5.5"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 18--><p class="indent" > good is the name of a file with good URLs, one per line. Default ’goodURL.txt’
<!--l. 22--><p class="indent" > bad is the name of a file with bad URLs, one per line. Default ’badURL.txt’
<!--l. 26--><p class="indent" > train is the name of the file where the trained SVM model will be stored. Default
’SVMmodel.txt’
<!--l. 28--><p class="noindent" ><span class="paragraphHead"><a
id="x45-243000A.5.5"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Takes two files, one with positive examples (good) and one with negative examples
(bad) and trains a SVM classifier using these. The resulting model is stored in the file
<span
class="cmmi-10x-x-109"><</span>train<span
class="cmmi-10x-x-109">></span>.
<!--l. 36--><p class="indent" > The example files should contain one URL per line and nothing else.
<!--l. 38--><p class="noindent" ><span class="paragraphHead"><a
id="x45-244000A.5.5"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combine
<!--l. 45--><p class="indent" > Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 47--><p class="noindent" ><span class="paragraphHead"><a
id="x45-245000A.5.5"></a><span
class="ecbx-1095">AUTHOR</span></span>
Ignacio Garcia Dorado, Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 53--><p class="noindent" ><span class="paragraphHead"><a
id="x45-246000A.5.5"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö
<!--l. 60--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 66--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 15--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.6 </span> <a
id="x45-247000A.5.6"></a>combineRank</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-248000A.5.6"></a><span
class="ecbx-1095">NAME</span></span>
combineRank - calculates various Ranks for a Combine crawled database
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-249000A.5.6"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineRank <span
class="cmmi-10x-x-109"><</span>action<span
class="cmmi-10x-x-109">> </span>–jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">> </span>–verbose
<!--l. 13--><p class="indent" > where action can be one of PageRank, PageRankBL, NetLocRank, and exportLinkGraph.
Results on STDOUT.
<!--l. 16--><p class="noindent" ><span class="paragraphHead"><a
id="x45-250000A.5.6"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 23--><p class="indent" > verbose enables printing of ranks to STDOUT as SQL INSERT statements
<!--l. 25--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-251000A.5.6"></a><span
class="ecbx-1095">Actions calculating variants of PageRank</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">PageRank</span> </dt><dd
class="description">
<!--l. 30--><p class="noindent" >calculate standard PageRank
</dd><dt class="description">
<span
class="ecbx-1095">PageRankBL</span> </dt><dd
class="description">
<!--l. 35--><p class="noindent" >calculate PageRanks with backlinks added for each link
</dd><dt class="description">
<span
class="ecbx-1095">NetLocRank</span> </dt><dd
class="description">
<!--l. 40--><p class="noindent" >calculate SiteRank for each site and a local DocRank for documents within each site.
Global ranks are then calculated as SiteRank * DocRank
</dd></dl>
<!--l. 44--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-252000A.5.6"></a><span
class="ecbx-1095">Actions exporting link data</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">exportLinkGraph</span> </dt><dd
class="description">
<!--l. 49--><p class="noindent" >export linkgraph from Combine database
</dd></dl>
<!--l. 52--><p class="noindent" ><span class="paragraphHead"><a
id="x45-253000A.5.6"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Implements calculation of different variants of PageRank.
<!--l. 59--><p class="indent" > Results are written to STDOUT and can be huge for large databases.
<!--l. 63--><p class="indent" > Linkgraph is exported in ASCII as a sparse matrix, one row per line. First integer is the ID
(urlid) of a page with links. The rest of integers on the line are IDs for pages linked to. Ie
121 5624 23416 51423 267178 means that page 121 links to pages 5624 23416 51423
267178
<!--l. 69--><p class="noindent" ><span class="paragraphHead"><a
id="x45-254000A.5.6"></a><span
class="ecbx-1095">EXAMPLES</span></span>
<dl class="description"><dt class="description">
<span
class="ectt-1095">combineRank –jobname aatest –verbose PageRankBL</span> </dt><dd
class="description">
<!--l. 74--><p class="noindent" >calculate PageRank with backlinks, result on STDOUT
</dd><dt class="description">
<span
class="ectt-1095">combineRank –jobname aatest –verbose exportLinkGraph</span> </dt><dd
class="description">
<!--l. 79--><p class="noindent" >export the linkgraph to STDOUT
</dd></dl>
<!--l. 82--><p class="noindent" ><span class="paragraphHead"><a
id="x45-255000A.5.6"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combine
<!--l. 89--><p class="indent" > Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 91--><p class="noindent" ><span class="paragraphHead"><a
id="x45-256000A.5.6"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 96--><p class="noindent" ><span class="paragraphHead"><a
id="x45-257000A.5.6"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2006 Anders Ardö
<!--l. 103--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 109--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 18--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.7 </span> <a
id="x45-258000A.5.7"></a>combineUtil</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-259000A.5.7"></a><span
class="ecbx-1095">NAME</span></span>
combineUtil - various operations on the Combine database
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-260000A.5.7"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combineUtil <span
class="cmmi-10x-x-109"><</span>action<span
class="cmmi-10x-x-109">> </span>–jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">></span>
<!--l. 13--><p class="indent" > where action can be one of stats, termstat, classtat, sanity, all, serveralias, resetOAI,
restoreSanity, deleteNetLoc, deletePath, deleteMD5, deleteRecordid, addAlias
<!--l. 18--><p class="noindent" ><span class="paragraphHead"><a
id="x45-261000A.5.7"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 23--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-262000A.5.7"></a><span
class="ecbx-1095">Actions listing statistics</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">stats</span> </dt><dd
class="description">
<!--l. 28--><p class="noindent" >Global statistics about the database
</dd><dt class="description">
<span
class="ecbx-1095">termstat</span> </dt><dd
class="description">
<!--l. 33--><p class="noindent" >generates statistics about the terms from topic ontology matched in documents (can
be long output)
</dd><dt class="description">
<span
class="ecbx-1095">classtat</span> </dt><dd
class="description">
<!--l. 39--><p class="noindent" >generates statistics about the topic classes assigned to documents
</dd></dl>
<!--l. 42--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-263000A.5.7"></a><span
class="ecbx-1095">Actions for sanity controls</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">sanity</span> </dt><dd
class="description">
<!--l. 47--><p class="noindent" >Performs various sanity checks on the database
</dd><dt class="description">
<span
class="ecbx-1095">restoreSanity</span> </dt><dd
class="description">
<!--l. 52--><p class="noindent" >Deletes records which sanity checks find insane
</dd><dt class="description">
<span
class="ecbx-1095">resetOAI</span> </dt><dd
class="description">
<!--l. 57--><p class="noindent" >Removes all history (ie ’deleted’ records) from the OAI table. This is done by
removing the OAI table and recreating it from the existing database.
</dd></dl>
<!--l. 62--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-264000A.5.7"></a><span
class="ecbx-1095">Action all</span></span>
Does the actions: stats, sanity, classtat, termstat
<!--l. 67--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-265000A.5.7"></a><span
class="ecbx-1095">Actions for deleting records</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">deleteNetLoc</span> </dt><dd
class="description">
<!--l. 72--><p class="noindent" >Deletes all records matching the ’,’-separated list of server net-locations (server-names
optionally with port) in the switch –netlocstr. Net-locations can include SQL wild
cards (’%’).
</dd><dt class="description">
<span
class="ecbx-1095">deletePath</span> </dt><dd
class="description">
<!--l. 79--><p class="noindent" >Deletes all records matching the ’,’-separated list of URL paths (excluding
net-locations) in the switch –pathsubstr. Paths can include SQL wild cards (’%’).
</dd><dt class="description">
<span
class="ecbx-1095">deleteMD5</span> </dt><dd
class="description">
<!--l. 85--><p class="noindent" >Delete the record which has the MD5 in switch –md5
</dd><dt class="description">
<span
class="ecbx-1095">deleteRecordid</span> </dt><dd
class="description">
<!--l. 90--><p class="noindent" >Delete the record which has the recordid in switch –recordid
</dd></dl>
<!--l. 93--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-266000A.5.7"></a><span
class="ecbx-1095">Actions for handling server aliases</span></span>
<dl class="description"><dt class="description">
<span
class="ecbx-1095">serverAlias</span> </dt><dd
class="description">
<!--l. 98--><p class="noindent" >Detect server aliases in the current database and do an ’addAlias’ on each detected
alias.
</dd><dt class="description">
<span
class="ecbx-1095">addAlias</span> </dt><dd
class="description">
<!--l. 104--><p class="noindent" >Manually add a serveralias to the system. Requires switches –aliases and –preferred
</dd></dl>
<!--l. 108--><p class="noindent" ><span class="paragraphHead"><a
id="x45-267000A.5.7"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Does various statistics generation as well as performing sanity checks on the database
<!--l. 113--><p class="noindent" ><span class="paragraphHead"><a
id="x45-268000A.5.7"></a><span
class="ecbx-1095">EXAMPLES</span></span>
<dl class="description"><dt class="description">
<span
class="ectt-1095">combineUtil termstat –jobname aatest</span> </dt><dd
class="description">
<!--l. 118--><p class="noindent" >Generate matched term statistics
</dd></dl>
<!--l. 121--><p class="noindent" ><span class="paragraphHead"><a
id="x45-269000A.5.7"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combine
<!--l. 128--><p class="indent" > Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 130--><p class="noindent" ><span class="paragraphHead"><a
id="x45-270000A.5.7"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 135--><p class="noindent" ><span class="paragraphHead"><a
id="x45-271000A.5.7"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005 Anders Ardö
<!--l. 142--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 148--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 21--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.8 </span> <a
id="x45-272000A.5.8"></a>combine</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-273000A.5.8"></a><span
class="ecbx-1095">NAME</span></span>
Combine - Focused Web crawler framework
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-274000A.5.8"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
combine –jobname <span
class="cmmi-10x-x-109"><</span>name<span
class="cmmi-10x-x-109">> </span>–logname <span
class="cmmi-10x-x-109"><</span>id<span
class="cmmi-10x-x-109">></span>
<!--l. 11--><p class="noindent" ><span class="paragraphHead"><a
id="x45-275000A.5.8"></a><span
class="ecbx-1095">OPTIONS AND ARGUMENTS</span></span>
jobname is used to find the appropriate configuration (mandatory)
<!--l. 18--><p class="indent" > logname is used as identifier in the log (in MySQL table log)
<!--l. 20--><p class="noindent" ><span class="paragraphHead"><a
id="x45-276000A.5.8"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Does crawling, parsing, optional topic-check and stores in MySQL database. Normally started
with the <span
class="ectt-1095">combineCtrl </span>command. Briefly it gets a URL from the MySQL database, which acts
as a common coordinator for a Combine job. The Web-page is fetched, provided it
passes the robot exclusion protocol. The HTML is cleaned using <span
class="ectt-1095">Tidy </span>and parsed into
metadata, headings, text, links and link anchors. Then it is stored (optionally provided a
topic-check is passed to keep the crawler focused) in the MySQL database in a structured
form.
<!--l. 35--><p class="indent" > A simple workflow for a trivial crawl job might look like:
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    Initialize database and configuration
 <br />  combineINIT --jobname aatest
 <br />    Enter some seed URLs from a file with a list of URLs
 <br />  combineCtrl  load --jobname aatest < seedURLs.txt
 <br />    Start 2 crawl processes
 <br />  combineCtrl  start --jobname aatest --harvesters 2
</div>
</td></tr></table>
<!--l. 44--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    For some time occasionally schedule new links for crawling
 <br />  combineCtrl recyclelinks --jobname aatest
 <br />    or look at the size of the ready queue
 <br />  combineCtrl stat --jobname aatest
</div>
</td></tr></table>
<!--l. 50--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    When satisfied kill the crawlers
 <br />  combineCtrl kill --jobname aatest
 <br />    Export data records in a highly structured XML format
 <br />  combineExport --jobname aatest
</div>
</td></tr></table>
<!--l. 56--><p class="nopar" >
<!--l. 59--><p class="indent" > For more complex jobs you have to edit the job configuration file.
<!--l. 61--><p class="noindent" ><span class="paragraphHead"><a
id="x45-277000A.5.8"></a><span
class="ecbx-1095">SEE ALSO</span></span>
combineINIT, combineCtrl
<!--l. 68--><p class="indent" > Combine configuration documentation in <span
class="ecti-1095">/usr/share/doc/combine/</span>.
<!--l. 70--><p class="noindent" ><span class="paragraphHead"><a
id="x45-278000A.5.8"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 75--><p class="noindent" ><span class="paragraphHead"><a
id="x45-279000A.5.8"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005 Anders Ardö
<!--l. 82--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 88--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 24--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.9 </span> <a
id="x45-280000A.5.9"></a>Combine::PosMatcher</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-281000A.5.9"></a><span
class="ecbx-1095">NAME</span></span>
PosMatcher
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-282000A.5.9"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
This is a module in the DESIRE automatic classification system. Copyright 1999.
<!--l. 13--><p class="indent" > Exported routines: 1. Fetching text: These routines all extract texts from a document (either
a Combine record, a Combine XWI datastructure or a WWW-page identified by a
URL). They all return: $meta, $head, $text, $url, $title, $size. $meta: Metadata from
document $head: Important text from document $text: Plain text from document
$url: URL of the document $title: HTML title of the document $size: The size of the
document
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
   Common input parameters:
 <br />        $DoStem: 1=do stemming; 0=no stemming
 <br />        $stoplist: object pointer to a LoadTermList object with a stoplist loaded
 <br />        $simple: 1=do simple loading; 0=advanced loading (might induce errors)
</div>
</td></tr></table>
<!--l. 30--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 getTextXWI
 <br />     parameters: $xwi, $DoStem, $stoplist, $simple
 <br />       $xwi is a Combine XWI datastructure
</div>
</td></tr></table>
<!--l. 35--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 getTextURL
 <br />    parameters: $url, $DoStem, $stoplist, $simple
 <br />       $url is the URL for the page to extract text from
</div>
</td></tr></table>
<!--l. 40--><p class="nopar" >
<!--l. 43--><p class="indent" > 2. Term matcher accepts a text as a (reference) parameter, matches each term in
Term against text. Matches are recorded in an associative array with class as key and
summed weight as value. Match parameters: $text, $termlist $text: text to match
against the termlist $termlist: object pointer to a LoadTermList object with a termlist
loaded output: %score: an associative array with classifications as keys and scores as
values
<!--l. 55--><p class="indent" > 3. Heuristics: sum scores down the classification tree to the leafs cleanEiTree parameters:
%res - an associative array from Match output: %res - same array
<!--l. 60--><p class="noindent" ><span class="paragraphHead"><a
id="x45-283000A.5.9"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 65--><p class="noindent" ><span class="paragraphHead"><a
id="x45-284000A.5.9"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005,2006 Anders Ardö
<!--l. 72--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 78--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 27--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.10 </span> <a
id="x45-285000A.5.10"></a>Combine::selurl</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-286000A.5.10"></a><span
class="ecbx-1095">NAME</span></span>
selurl - Normalise and validate URIs for harvesting
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-287000A.5.10"></a><span
class="ecbx-1095">INTRODUCTION</span></span>
Selurl selects and normalises URIs on basis of both general practice (hostname lowercasing,
portnumber substitution etc.) and Combine-specific handling (applying config_allow,
config_exclude, config_serveralias and other relevant config settings).
<!--l. 16--><p class="indent" > The Config settings catered for currently are:
<!--l. 20--><p class="indent" > maxUrlLength - the maximum length of an unnormalised URL allow - Perl regular expressions
to identify allowed URLs exclude - Perl regular expressions to exclude URLs from
harvesting serveralias - Aliases of server names sessionids - List sessionid markers to be
removed
<!--l. 28--><p class="indent" > A selurl object can hold a single URL and has methods to obtain its subparts
as defined in URI.pm, plus some methods to normalise and validate it in Combine
context.
<!--l. 32--><p class="noindent" ><span class="paragraphHead"><a
id="x45-288000A.5.10"></a><span
class="ecbx-1095">BUGS</span></span>
Currently, the only schemes supported are http, https and ftp. Others may or may not work
correctly. For one thing, we assume the scheme has an internet hostname/port.
<!--l. 41--><p class="indent" > clone() will only return a copy of the real URI object, not a new selurl.
<!--l. 46--><p class="indent" > URI URI-escapes the strings fed into it by new() once. Existing percent signs in the input are
left untouched, which implies that:
<!--l. 51--><p class="indent" > (a) there is no risk of double-encoding; and
<!--l. 55--><p class="indent" > (b) if the original contained an inadvertent sequence that could be interpreted as an escape
sequence, uri_unescape will not render the original input (e.g. url_with_%66_in_it goes
whoop). If you know that the original has not yet been escaped and wish to safeguard potential
percent signs, you’ll have to escape them (and only them) once before you offer it to
new().
<!--l. 64--><p class="indent" > A problem with URI is, that its object is not a hash we can piggyback our data on, so I had
to resort to AUTOLOAD to emulate inheritance. I find this ugly, but well, this *is* Perl, so
what’d you expect?
<!--l. 30--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.11 </span> <a
id="x45-289000A.5.11"></a>Combine::XWI</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-290000A.5.11"></a><span
class="ecbx-1095">NAME</span></span>
XWI.pm - class for internal representation of a document record
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-291000A.5.11"></a><span
class="ecbx-1095">SYNOPSIS</span></span>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 use Combine::XWI;
 <br /> $xwi = new Combine::XWI;
</div>
</td></tr></table>
<!--l. 10--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 #single value record variables
 <br /> $xwi->server($server);
</div>
</td></tr></table>
<!--l. 14--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 my $server = $xwi->server();
</div>
</td></tr></table>
<!--l. 17--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 #original content
 <br /> $xwi->content(\$html);
</div>
</td></tr></table>
<!--l. 21--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 my $text = ${$xwi->content()};
</div>
</td></tr></table>
<!--l. 24--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 #multiple value record variables
 <br /> $xwi->meta_add($name1,$value1);
 <br /> $xwi->meta_add($name2,$value2);
</div>
</td></tr></table>
<!--l. 29--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 $xwi->meta_rewind;
 <br /> my ($name,$content);
 <br /> while (1) {
 <br />  ($name,$content) = $xwi->meta_get;
 <br />  last unless $name;
 <br /> }
</div>
</td></tr></table>
<!--l. 37--><p class="nopar" >
<!--l. 38--><p class="noindent" ><span class="paragraphHead"><a
id="x45-292000A.5.11"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Provides methods for storing and retrieving structured records representing crawled
documents.
<!--l. 44--><p class="noindent" ><span class="paragraphHead"><a
id="x45-293000A.5.11"></a><span
class="ecbx-1095">METHODS</span></span>
<!--l. 45--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-294000A.5.11"></a><span
class="ecbx-1095">new()</span></span>
<!--l. 46--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-295000A.5.11"></a><span
class="ecbx-1095">XXX($val)</span></span>
Saves $val using AUTOLOAD. Can later be retrieved, eg
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    $xwi->MyVar(’My value’);
 <br />    $t = $xwi->MyVar;
</div>
</td></tr></table>
<!--l. 54--><p class="nopar" >
<!--l. 57--><p class="indent" > will set $t to ’My value’
<!--l. 59--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-296000A.5.11"></a><span
class="ecbx-1095">*_reset()</span></span>
Forget all values.
<!--l. 64--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-297000A.5.11"></a><span
class="ecbx-1095">*_rewind()</span></span>
*_get will start with the first value.
<!--l. 69--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-298000A.5.11"></a><span
class="ecbx-1095">*_add</span></span>
stores values into the datastructure
<!--l. 74--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-299000A.5.11"></a><span
class="ecbx-1095">*_get</span></span>
retrieves values from the datastructure
<!--l. 79--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-300000A.5.11"></a><span
class="ecbx-1095">meta_reset() / meta_rewind() / meta_add() / meta_get()</span></span>
Stores the content of Meta-tags
<!--l. 86--><p class="indent" > Takes/Returns 2 parameters: Name, Content
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 $xwi->meta_add($name1,$value1);
 <br /> $xwi->meta_add($name2,$value2);
</div>
</td></tr></table>
<!--l. 91--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 $xwi->meta_rewind;
 <br /> my ($name,$content);
 <br /> while (1) {
 <br />  ($name,$content) = $xwi->meta_get;
 <br />  last unless $name;
 <br /> }
</div>
</td></tr></table>
<!--l. 99--><p class="nopar" >
<!--l. 100--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-301000A.5.11"></a><span
class="ecbx-1095">xmeta_reset() / xmeta_rewind() / xmeta_add() / xmeta_get()</span></span>
Extended information from Meta-tags. Not used.
<!--l. 105--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-302000A.5.11"></a><span
class="ecbx-1095">url_remove() / url_reset() / url_rewind() / url_add() / url_get()</span></span>
Stores all URLs (ie if multiple URLs for the same page) for this record
<!--l. 112--><p class="indent" > Takes/Returns 1 parameter: URL
<!--l. 114--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-303000A.5.11"></a><span
class="ecbx-1095">heading_reset() / heading_rewind() / heading_add() / heading_get()</span></span>
Stores headings from HTML documents
<!--l. 121--><p class="indent" > Takes/Returns 1 parameter: Heading text
<!--l. 123--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-304000A.5.11"></a><span
class="ecbx-1095">link_reset() / link_rewind() / link_add() / link_get()</span></span>
Stores links from documents
<!--l. 130--><p class="indent" > Takes/Returns 5 parameters: URL, netlocid, urlid, Anchor text, Link type
<!--l. 132--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-305000A.5.11"></a><span
class="ecbx-1095">robot_reset() / robot_rewind() / robot_add() / robot_get()</span></span>
Stores calculated information, like genre, language, etc
<!--l. 139--><p class="indent" > Takes/Returns 2 parameters: Name, Value. Both are strings with max length Name: 15,
Value: 20
<!--l. 141--><p class="indent" > <span class="likesubparagraphHead"><a
id="x45-306000A.5.11"></a><span
class="ecbx-1095">topic_reset() / topic_rewind() / topic_add() / topic_get()</span></span>
Stores result of topic classification.
<!--l. 148--><p class="indent" > Takes/Returns 5 parameters: Class, Absolute score, Normalized score, Terms, Algorithm
id
<!--l. 152--><p class="indent" > Class, Terms, and Algorithm id are strings with max lengths Class: 50, and Algorithm id:
25
<!--l. 157--><p class="indent" > Absolute score, and Normalized score are integers
<!--l. 161--><p class="indent" > Normalized score and Terms are optional and may be replaced with 0, and ’’ (the empty string) respectively
<!--l. 163--><p class="noindent" ><span class="paragraphHead"><a
id="x45-307000A.5.11"></a><span
class="ecbx-1095">SEE ALSO</span></span>
Combine focused crawler main site <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 168--><p class="noindent" ><span class="paragraphHead"><a
id="x45-308000A.5.11"></a><span
class="ecbx-1095">AUTHOR</span></span>
Yong Cao <span
class="cmmi-10x-x-109"><</span>tsao@munin.ub2.lu.se<span
class="cmmi-10x-x-109">> </span>v0.05 1997-03-13
<!--l. 175--><p class="indent" > Anders Ardö, <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 177--><p class="noindent" ><span class="paragraphHead"><a
id="x45-309000A.5.11"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005,2006 Anders Ardö
<!--l. 184--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 190--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 33--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.12 </span> <a
id="x45-310000A.5.12"></a>Combine::Matcher</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-311000A.5.12"></a><span
class="ecbx-1095">NAME</span></span>
Matcher
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-312000A.5.12"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
This is a module in the DESIRE automatic classification system. Copyright 1999. Modified in
the ALVIS project. Copyright 2004
<!--l. 14--><p class="indent" > Exported routines: 1. Fetching text: These routines all extract texts from a document (either
a Combine XWI datastructure or a WWW-page identified by a URL). They all return: $meta,
$head, $text, $url, $title, $size $meta: Metadata from document $head: Important text from
document $text: Plain text from document $url: URL of the document $title: HTML title of the
document $size: The size of the document
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
   Common input parameters:
 <br />        $DoStem: 1=do stemming; 0=no stemming
 <br />        $stoplist: object pointer to a LoadTermList object with a stoplist loaded
 <br />        $simple: 1=do simple loading; 0=advanced loading (might induce errors)
</div>
</td></tr></table>
<!--l. 31--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 getTextXWI
 <br />     parameters: $xwi, $DoStem, $stoplist, $simple
 <br />       $xwi is a Combine XWI datastructure
</div>
</td></tr></table>
<!--l. 36--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 getTextURL
 <br />    parameters: $url, $DoStem, $stoplist, $simple
 <br />       $url is the URL for the page to extract text from
</div>
</td></tr></table>
<!--l. 41--><p class="nopar" >
<!--l. 44--><p class="indent" > 2. Term matcher accepts a text as a (reference) parameter, matches each term in
Term against text. Matches are recorded in an associative array with class as key and
summed weight as value. Match parameters: $text, $termlist $text: text to match
against the termlist $termlist: object pointer to a LoadTermList object with a termlist
loaded output: %score: an associative array with classifications as keys and scores as
values
<!--l. 54--><p class="noindent" ><span class="paragraphHead"><a
id="x45-313000A.5.12"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö <span
class="cmmi-10x-x-109"><</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 59--><p class="noindent" ><span class="paragraphHead"><a
id="x45-314000A.5.12"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005,2006 Anders Ardö
<!--l. 66--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 72--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 36--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.13 </span> <a
id="x45-315000A.5.13"></a>Combine::FromTeX</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-316000A.5.13"></a><span
class="ecbx-1095">NAME</span></span>
Combine::FromTeX.pm - TeX parser in combine package
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-317000A.5.13"></a><span
class="ecbx-1095">AUTHOR</span></span>
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Anders Ardø 2000-06-11
</div>
</td></tr></table>
<!--l. 9--><p class="nopar" > __________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.14 </span> <a
id="x45-318000A.5.14"></a>Combine::utilPlugIn</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-319000A.5.14"></a><span
class="ecbx-1095">NAME</span></span>
utilPlugIn
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-320000A.5.14"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Utilities for: * extracting text from XWI’s * SVM classification * language and country
identification
<!--l. 14--><p class="noindent" ><span class="paragraphHead"><a
id="x45-321000A.5.14"></a><span
class="ecbx-1095">AUTHOR</span></span>
Ignacio Garcia Dorado, Anders Ardö <span
class="cmmi-10x-x-109"><</span>anders.ardo@eit.lth.se<span
class="cmmi-10x-x-109">></span>
<!--l. 20--><p class="noindent" ><span class="paragraphHead"><a
id="x45-322000A.5.14"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö
<!--l. 27--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 33--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 42--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.15 </span> <a
id="x45-323000A.5.15"></a>Combine::SD_SQL</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-324000A.5.15"></a><span
class="ecbx-1095">NAME</span></span>
SD_SQL
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-325000A.5.15"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Reimplementation of sd.pl, SD.pm and SDQ.pm using MySQL; contains both recyc and
guard
<!--l. 14--><p class="indent" > Basic idea is to have a table (urldb) that contains most URLs ever inserted into the system
together with a lock (the guard function) and a boolean harvest-flag. Also in this table
is the host part together with its lock. URLs are selected from this table based on
urllock, netloclock and harvest and inserted into a queue (table que). URLs from this
queue are then given out to harvesters. The queue is implemented as: # The admin
table can be used to generate sequence numbers like this: #mysql<span
class="cmmi-10x-x-109">&gt; </span>update admin set
queid=LAST_INSERT_ID(queid+1); # and used to extract the next URL from the queue
#mysql<span
class="cmmi-10x-x-109">&gt; </span>select host,url from que where queid=LAST_INSERT_ID(); # When the queue is
empty it is filled from table urldb. Several different algorithms can be used to fill it (round-robin,
most urls, longest time since harvest, ...). Since the harvest-flag and guard-lock are not
updated until the actual harvest is done it is OK to delete the queue and regenerate it
anytime.
<!--l. 33--><p class="indent" > ########################## #Questions, ideas, TODOs, etc #Split
table urldb into 2 tables - one for urls and one for hosts??? #Less efficient when filling que;
more efficient when updating netloclock #Data structure TABLE hosts: create table
hosts( host varchar(50) not null default '', netloclock int not null, retries int not null
default 0, ant int not null default 0, primary key (host), key (ant), key (netloclock)
);
<!--l. 50--><p class="indent" > ############# Handle too many retries?
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    algorithm takes an url from the host that was accessed longest ago
 <br />    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE
 <br />         hosts.hostlock &lt; UNIX_TIMESTAMP() AND
 <br />         hosts.host=urls.host AND
 <br />         urls.urllock &lt; UNIX_TIMESTAMP() AND
 <br />         urls.harvest=1 ORDER BY hostlock LIMIT 1;
</div>
</td></tr></table>
<!--l. 60--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    algorithm takes an url from the host with most URLs
 <br />    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE
 <br />         hosts.hostlock &lt; UNIX_TIMESTAMP() AND
 <br />         hosts.host=urls.host AND
 <br />         urls.urllock &lt; UNIX_TIMESTAMP() AND
 <br />         urls.harvest=1 ORDER BY hosts.ant DESC LIMIT 1;
</div>
</td></tr></table>
<!--l. 68--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
    algorithm takes an url from any available host
 <br />    ($hostid,$url)=SELECT host,url,id FROM hosts,urls WHERE
 <br />         hosts.hostlock &lt; UNIX_TIMESTAMP() AND
 <br />         hosts.host=urls.host AND
 <br />         urls.urllock &lt; UNIX_TIMESTAMP() AND
 <br />         urls.harvest=1 LIMIT 1;
</div>
</td></tr></table>
<!--l. 76--><p class="nopar" >
<!--l. 77--><p class="noindent" ><span class="paragraphHead"><a
id="x45-326000A.5.15"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö <span
class="cmmi-10x-x-109">&lt;</span>anders.ardo@it.lth.se<span
class="cmmi-10x-x-109">&gt;</span>
<!--l. 82--><p class="noindent" ><span class="paragraphHead"><a
id="x45-327000A.5.15"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005,2006 Anders Ardö
<!--l. 89--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 95--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 45--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.16 </span> <a
id="x45-328000A.5.16"></a>Combine::FromHTML</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-329000A.5.16"></a><span
class="ecbx-1095">NAME</span></span>
Combine::FromHTML.pm - HTML parser in combine package
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-330000A.5.16"></a><span
class="ecbx-1095">AUTHOR</span></span>
Yong Cao <span
class="cmmi-10x-x-109">&lt;</span>tsao@munin.ub2.lu.se<span
class="cmmi-10x-x-109">&gt; </span>v0.06 1997-03-19 Anders Ardö 1998-07-18 added <span
class="cmmi-10x-x-109">&lt;</span>AREA
... HREF=link ...<span
class="cmmi-10x-x-109">&gt; </span>fixed <span
class="cmmi-10x-x-109">&lt;</span>A ... HREF=link ...<span
class="cmmi-10x-x-109">&gt; </span>regexp to be more general Anders Ardö
2002-09-20 added ’a’ as a tag not to be replaced with space added removal of Cntrl-chars
and some punctuation marks from IP added <span
class="cmmi-10x-x-109">&lt;</span>style<span
class="cmmi-10x-x-109">&gt;</span>...<span
class="cmmi-10x-x-109">&lt;</span>/style<span
class="cmmi-10x-x-109">&gt; </span>as something to be
removed before processing beefed up compression of sequences of blanks to include <span
class="cmsy-10x-x-109">\</span>240
(non-breakable space) changed ’remove head’ before text extraction to handle multiline
matching (which can be introduced by decoding html entities) added compress blanks and
remove CRs to metadata-content Anders Ardö 2004-04 Changed extraction process
dramatically
<!--l. 48--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.17 </span> <a
id="x45-331000A.5.17"></a>Combine::RobotRules</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-332000A.5.17"></a><span
class="ecbx-1095">NAME</span></span>
RobotRules.pm
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-333000A.5.17"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardo version 1.0 2004-02-19
<!--l. 51--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.18 </span> <a
id="x45-334000A.5.18"></a>Combine::HTMLExtractor</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-335000A.5.18"></a><span
class="ecbx-1095">NAME</span></span>
HTMLExtractor
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-336000A.5.18"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Adapted from HTML::LinkExtractor - Extract links from an HTML document by D.H
(PodMaster)
<!--l. 14--><p class="noindent" ><span class="paragraphHead"><a
id="x45-337000A.5.18"></a><span
class="ecbx-1095">AUTHOR Anders Ardo</span></span>
D.H (PodMaster)
<!--l. 19--><p class="noindent" ><span class="paragraphHead"><a
id="x45-338000A.5.18"></a><span
class="ecbx-1095">LICENSE</span></span>
Copyright (c) 2003 by D.H. (PodMaster). All rights reserved.
<!--l. 27--><p class="indent" > This module is free software; you can redistribute it and/or modify it under the same terms
as Perl itself. The LICENSE file contains the full text of the license.
<!--l. 54--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.19 </span> <a
id="x45-339000A.5.19"></a>Combine::LoadTermList</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-340000A.5.19"></a><span
class="ecbx-1095">NAME</span></span>
LoadTermList
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-341000A.5.19"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
This is a module in the DESIRE automatic classification system. Copyright 1999.
<!--l. 13--><p class="indent" > LoadTermList - A class for loading and storing a stoplist with single words and a termlist with
classifications and weights
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Subroutines:
 <br />   LoadStopWordList(StopWordListFileName)
 <br />      loads a list of stopwords, one per line, from
 <br />      the file StopWordListFileName.
</div>
</td></tr></table>
<!--l. 22--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
   EraseStopWordList
 <br />      clears the stopword list
</div>
</td></tr></table>
<!--l. 26--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Subroutines:
 <br />  LoadTermList(TermListFileName) - loads TermClass from file
 <br />  LoadTermListStemmed(TermListFileName) - same plus stems terms
</div>
</td></tr></table>
<!--l. 31--><p class="nopar" >
<table
class="verbatim"><tr class="verbatim"><td
class="verbatim"><div class="verbatim">
 Input: A formatted term-list including weights and classifications
 <br />  Format:  &lt;weight&gt;: &lt;term_reg_exp&gt;=[&lt;classification&gt;, ]+
 <br />  weight can be a positive or negative number
 <br />  term_reg_exp can be words, phrases, boolean expressions (with @and
 <br />     as operator) on term_reg_exp or Perl regular expressions
</div>
</td></tr></table>
<!--l. 38--><p class="nopar" >
<!--l. 39--><p class="noindent" ><span class="paragraphHead"><a
id="x45-342000A.5.19"></a><span
class="ecbx-1095">AUTHOR</span></span>
Anders Ardö <span
class="cmmi-10x-x-109">&lt;</span>Anders.Ardo@it.lth.se<span
class="cmmi-10x-x-109">&gt;</span>
<!--l. 44--><p class="noindent" ><span class="paragraphHead"><a
id="x45-343000A.5.19"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2005,2006 Anders Ardö
<!--l. 51--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 57--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 57--><p class="indent" > _________________________________________________________________________________________________________________
<h5 class="subsubsectionHead"><span class="titlemark">A.5.20 </span> <a
id="x45-344000A.5.20"></a>Combine::classifySVM</h5>
<!--l. 1--><p class="noindent" ><span class="paragraphHead"><a
id="x45-345000A.5.20"></a><span
class="ecbx-1095">NAME</span></span>
classifySVM
<!--l. 6--><p class="noindent" ><span class="paragraphHead"><a
id="x45-346000A.5.20"></a><span
class="ecbx-1095">DESCRIPTION</span></span>
Classification plugin module using SVM (implementation SVMLight)
<!--l. 13--><p class="indent" > Uses SVM model loaded from file pointed to by configuration variable ’SVMmodel’
<!--l. 15--><p class="noindent" ><span class="paragraphHead"><a
id="x45-347000A.5.20"></a><span
class="ecbx-1095">AUTHOR</span></span>
Ignacio Garcia Dorado, Anders Ardö <span
class="cmmi-10x-x-109">&lt;</span>anders.ardo@eit.lth.se<span
class="cmmi-10x-x-109">&gt;</span>
<!--l. 21--><p class="noindent" ><span class="paragraphHead"><a
id="x45-348000A.5.20"></a><span
class="ecbx-1095">COPYRIGHT AND LICENSE</span></span>
Copyright (C) 2008 Ignacio Garcia Dorado, Anders Ardö
<!--l. 28--><p class="indent" > This library is free software; you can redistribute it and/or modify it under the same terms as
Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have
available.
<!--l. 34--><p class="indent" > See the file LICENCE included in the distribution at <span
class="ecss-1095">http://combine.it.lth.se/</span>
<!--l. 60--><p class="indent" > _________________________________________________________________________________________________________________
<!--l. 51--><div class="crosslinks"><p class="noindent">[<a
href="DocMainse11.html" >front</a>] [<a
href="# " >up</a>] </p></div>
<!--l. 51--><p class="indent" > <a
id="tailDocMainse11.html"></a>
</body></html>