Wray Buntine > Alvis-Convert-0.4 > Alvis::Convert

Download:
Alvis-Convert-0.4.tar.gz

Dependencies

Annotate this POD

View/Report Bugs
Module Version: 0.4   Source  

NAME ^

Alvis::Convert - Perl extension for converting documents from a number of different source formats to Alvis XML format.

SYNOPSIS ^

 use Alvis::Convert;

 # Create a new instance that writes its output under 'out' and reads
 # the source encoding from the meta information (sourceEncodingFromMeta).
 #
 my $C=Alvis::Convert->new(outputRootDir=>'out',
                           outputNPerSubdir=>1000,
                           outputAtSameLocation=>0,
                           includeOriginalDocument=>0,
                           sourceEncodingFromMeta=>1);
 # Restart output counters
 $C->init_output();

 # Convert e.g. HTML
 for my $html_txt (@html)
 {
     my $alvisXML=$C->HTML($html_txt,$meta_txt);
     if (!defined($alvisXML))
     {
        warn $C->errmsg();
        $C->clearerr();
        next;
     }
 
     if (!$C->output_Alvis([$alvisXML]))
     {
         warn $C->errmsg();
         $C->clearerr();
         next;
     }
 }

DESCRIPTION ^

Converts document collections of different formats to Alvis XML format.

METHODS ^

new()

Options:

    fileType                 the MIME type of the source file to convert. 
                             Default: guess.
    sourceEncoding           encoding of the source document. Default: guess.  
    urlFromBasename          extract URL from basename. Default: no.
    outputAtSameLocation     output Alvis XML to the same directories as the
                             source documents. Default: no.
    alvisSuffix              suffix of the output Alvis XML records. Default:
                             'alvis'.
    outputRootDir            root directory for output files. Default: '.'
    outputNPerSubdir         number of records output per subdirectory.
                             Default: 1000
    defaultDocType           first guess document (MIME) type. Default: 'text'.
    defaultDocSubType        first guess document subtype. Default: 'html'.
    defaultEncoding          first guess encoding. Default: 'iso-8859-1'.
    includeOriginalDocument  include original document in the output?
                             Default: yes.
    ainodumpWarnings         issue warnings concerning ainodump conversion?
                             Default: yes.
    sourceEncodingFromMeta   read source encoding from Meta information?
                             Default: no.

HTML()

     my $alvisXML=$C->HTML($html_txt,$meta_txt,
                           {sourceEncoding=>'utf8',
                            sourceEncodingFromMeta=>0
                            });
     if (!defined($alvisXML))
     {
        warn $C->errmsg();
        $C->clearerr();
        next;
     }

newsXML()

     $meta_txt=$C->read_meta($news_xml_entries{$base_name}{metaF});
     if (!defined($meta_txt))
     {
         warn "Reading meta file " .
              "\"$news_xml_entries{$base_name}{metaF}\" failed. " .
              $C->errmsg();
         $C->clearerr();
         next;
     }
     my $alvisXMLs;
     $xml_txt=$C->read_news_XML($news_xml_entries{$base_name}{xmlF});
     if (!defined($xml_txt))
     {
         warn "Reading the news XML for basename \"$base_name\" failed. " .
               $C->errmsg();
         $C->clearerr();
         next;
     }
     $alvisXMLs=$C->newsXML($xml_txt,$meta_txt,$original_document_text);
     if (!defined($alvisXMLs))
     {
         warn "Obtaining the Alvis versions of the documents inside " .
              "\"$base_name\"'s XML file failed. " . $C->errmsg();
         $C->clearerr();
         next;
     }

ainodump()

    if (!$C->ainodump($ainodump_file))
    {
       warn "Obtaining the Alvis version of the " .
            "ainodump file \"$ainodump_file\" " .
            "failed. " . $C->errmsg() if
              $Warnings;
       $C->clearerr();
    }

set()

    $C->set('alvisSuffix','foo');

read_HTML()

     $html_txt=$C->read_HTML($html_file,$meta_txt);
     if (!defined($html_txt))
     {
         warn "Reading the HTML failed. " .
               $C->errmsg();
         $C->clearerr();
         next;
     }

read_meta()

    $meta_txt=$C->read_meta($meta_file);

read_news_XML()

    $xml_txt=$C->read_news_XML($news_xml_file);

init_output()

    Initializes output counters.

output_Alvis()

    $alvisXML=$C->HTML($html_txt,$meta_txt);
    if (!$C->output_Alvis([$alvisXML],$base_name))
    {
        warn "Outputting the Alvis records failed. " . $C->errmsg() if
                $Warnings;
        $C->clearerr();
        next;
    }

errmsg()

Returns a stack of error messages, if any. Empty string otherwise.

SEE ALSO ^

Alvis::Document

AUTHOR ^

Kimmo Valtonen, <kimmo.valtonen@hiit.fi>

COPYRIGHT AND LICENSE ^

Copyright (C) 2006 by Kimmo Valtonen

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.

syntax highlighting: