<?xml version="1.0" encoding="UTF-8"?>
<!-- The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. -->
<!-- $Id: alvis.xml,v 1.1 2005/06/17 15:45:56 mike Exp $ -->
<anytype:container xmlns:anytype="http://alvis.info/schema/anytype/1.0/">
<documentCollection>
  <documentRecord id="12345678">

    <!-- Information generated during the initial acquisition of the
    document, whether by a web crawler, MS-Word converter, etc. -->
    <acquisition>

      <!-- Information, in the current WP7 format, about the
      acquisition process: acquisition date, URLs where the document
      was found, expiry date, size, etc.  This example records the
      modified, expiry and checked dates, the httpServer value and a
      single source URL. -->
      <acquisitionData>
        <modifiedDate>2001-04-19</modifiedDate>
        <expiryDate>2004-11-06</expiryDate>
        <checkedDate>2004-10-06</checkedDate>
        <httpServer>WebSTAR/4.4(SSL) ID/72915</httpServer>
        <urls>
          <url>http://www.snomnh.ou.edu/pdf/2000/00-27.pdf</url>
        </urls>
      </acquisitionData>

      <!-- Original document represented as cleaned HTML, or as text
      extracted from MS Word, PS, PDF, etc.  May be a binary format, with
      an attribute specifying base64 or quoted-printable encoding.
      In this example the attributes declare text/plain in us-ascii and
      the body shown is just "..." (presumably a placeholder for the
      real document text). -->
      <originalDocument mimeType="text/plain" charSet="us-ascii">
	...
      </originalDocument>

      <!-- Visible text from the document, together with whatever internal
      structure we can express through canonical markup -->
      <canonicalDocument>
        <!-- Content omitted.  The original note here read "As in previous
        example"; no earlier example appears in this file. -->
      </canonicalDocument>

      <!-- Information, in the current WP7 format, that is _about_ the
      document rather than part of it: e.g., author, title, subject, DOI.
      Each <meta> value is kept on a single line: these elements hold
      text, so line breaks and indentation inside them would become part
      of the value. -->
      <metaData>
        <meta name="dc.author">Wedel, Mathew J.</meta>
        <meta name="dc.date">2000</meta>
        <meta name="dc.title">Sauroposeidon proteles, a new sauropod from the Early Cretaceous of Oklahoma</meta>
      </metaData>

      <!-- Link information from the WP7 format.  Each <location> gives
      the link URL as its content.  All URLs will contain an internal ID
      (presumably the documentId attribute; not guaranteed to be unique
      across multiple crawler instances) -->
      <links>
        <outlinks> <!-- links from this document to external pages -->
          <link type="a"> <!-- repeatable -->
            <anchorText>Text from this document</anchorText>
            <location documentId="...">URL</location>
          </link>
        </outlinks>
        <inlinks> <!-- links from external pages to this document -->
          <link type="a"> <!-- repeatable -->
            <anchorText>PDF ( 1 MB)</anchorText>
            <location documentId="...">http://www.snomnh.ou.edu/publications/Articles/index.shtml</location>
          </link>
        </inlinks>
        <!-- Number of unique other hosts with links pointing to this page -->
        <inlinkHosts> ... </inlinkHosts>
      </links>

      <!-- Results of analysis done as part of the acquisition process,
      e.g. genre intuited from the top-level domain name of the site from
      which a Web document was crawled -->
      <analysis>
        <!-- analysis also contains other analysed properties (mainly from
             the URL), each given as a <property> element with the property
             name in its name attribute and the value as its content -->
        <property name="topLevelDomain">edu</property>
        <property name="language">en</property>
        <property name="genre">article</property>
        <ranking scheme="..."> ... </ranking> <!-- repeatable -->
        <!-- Each <topic> carries absoluteScore and relativeScore attributes
             and names its class in <class>; <terms>, where present, is a
             comma-separated list (the entries look like regular-expression
             patterns; TODO confirm) -->
        <topic absoluteScore="150" relativeScore="570">
          <class>ALL</class>
        </topic>
        <topic absoluteScore="100" relativeScore="380">
          <class>CP</class>
          <terms>carnivorous plant[^\s]*, carnivor[^\s]*, </terms>
        </topic>
        <topic absoluteScore="50" relativeScore="190">
          <class>CP.Dionaea</class>
          <terms>flytrap[^\s]*, venus flytrap[^\s]*, </terms>
        </topic>
      </analysis>
    </acquisition>

    <!-- Linguistic annotations added by WP5 -->
    <linguisticAnalysis>
      <!-- Details omitted: see Deliverable D5.1 -->
    </linguisticAnalysis>

    <!-- Relevance information added from WP2: one <scoreset> per score
    type (here "ranking" and "content"), each holding one <score> per
    topicId -->
    <relevance>
      <scoreset type="ranking">
        <score topicId="1">8.36536</score>
        <score topicId="4">4.25395</score>
        <score topicId="19">0.44538</score>
        <score topicId="36">2.35349</score>
      </scoreset>
      <scoreset type="content">
        <score topicId="1">40.25395</score>
        <score topicId="4">2.947</score>
        <score topicId="17">0.44538</score>
        <score topicId="23">1.4629</score>
        <score topicId="36">2.35349</score>
      </scoreset>
    </relevance>
  </documentRecord>
</documentCollection>
</anytype:container>