Anders Ardö > Combine-3.8 > Combine::FromHTML

Download:
Combine-3.8.tar.gz

Dependencies

Annotate this POD

CPAN RT

Open  0
Report a bug
Source   Latest Release: Combine-4.003
    if ($opt =~ /M/) { # extract metadata
        # Build the "Rsummary" meta field in two steps:
        #   1. collect prose-like description/abstract meta fields;
        #   2. if that is still shorter than the configured SummaryLength,
        #      append a summary generated from the HTML body.

        my $summary = "";
        $xwi->meta_rewind;
        while (1) {
            my ($name, $content) = $xwi->meta_get;
            last if !defined($name);    # no more meta fields

            # Only abstract, description and dc.description can contribute.
            next
                unless $name eq 'description'
                || $name eq 'dc.description'
                || $name eq 'abstract';
            next if !defined($content); # nothing to inspect or append

            # Heuristic: a field with fewer ', '-separated pieces than
            # whitespace-separated words is treated as running text rather
            # than a keyword list, and is added to the summary.
            # TODO (from original note): when several such fields exist,
            # check whether they overlap or are identical.
            my @kom = split(', ', $content);
            my @dot = split(' ',  $content);
            if ( $#kom < $#dot ) {
                $summary .= $content . ' ';
            }
        }

        # Generate Summary
        my $sumlength = Combine::Config::Get('SummaryLength');
        if ( defined($sumlength) && $sumlength > 0 ) {
            # Room left for generated text after the meta descriptions.
            my $remaining = $sumlength - length($summary);
            if ( $remaining > 0 ) {
                require HTML::Summary;
                require HTML::TreeBuilder;
                my $html_summarizer =
                    HTML::Summary->new( LENGTH => $remaining, USE_META => 0 );
                my $tree = HTML::TreeBuilder->new;

                # The tree is fed Latin-1 octets and the generated summary is
                # decoded from Latin-1 again before being appended.
                $tree->parse( Encode::encode('latin1', $html) );
                $tree->eof();
                my $generated =
                    Encode::decode( 'latin1', $html_summarizer->generate($tree) );

                # Explicitly destroy the parse tree once the summary is made.
                $tree->delete;

                $summary .= $generated if defined($generated);
            }
            if ( length($summary) > 2 ) {
                # Keep only alphanumerics, whitespace and basic punctuation,
                # then collapse runs of blanks (including \240, the
                # non-breaking space) into a single space.
                $summary =~ s/[^\p{IsAlnum}\s,\.\!\?:;\'\"]//gs;
                $summary =~ s/[\s\240]+/ /g;
                $xwi->meta_add("Rsummary",$summary);
            }
        }
    }#End if M

NAME ^

Combine::FromHTML.pm - HTML parser in combine package

AUTHOR ^

Yong Cao <tsao@munin.ub2.lu.se>, v0.06, 1997-03-19. Anders Ardö, 1998-07-18: added <AREA ... HREF=link ...>; fixed the <A ... HREF=link ...> regexp to be more general. Anders Ardö, 2002-09-20: added 'a' as a tag not to be replaced with a space; added removal of control characters and some punctuation marks from IP; added <style>...</style> as something to be removed before processing; beefed up compression of sequences of blanks to include \240 (non-breaking space); changed 'remove head' before text extraction to handle multiline matching (which can be introduced by decoding HTML entities); added compression of blanks and removal of CRs to metadata content. Anders Ardö, 2004-04: changed the extraction process dramatically.