The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Copyright 2012 Kevin Ryde
#
# This file is part of RSS2Leafnode.
#
# RSS2Leafnode is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# RSS2Leafnode is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with RSS2Leafnode.  If not, see <http://www.gnu.org/licenses/>.


# re-parse on error
# rss_newest_only must look at all dates




sub rss_process_str {
  my ($self, $xml) = @_;
  ### rss_process_str() ...

  # default "discard_spaces" chucks leading and trailing space on content,
  # which is usually a good thing
  #
  require XML::Twig;
  XML::Twig->VERSION('3.34'); # for att_exists()
  my $twig = XML::Twig->new (map_xmlns => $map_xmlns,
                             pretty_print => 'wrapped',
                             twig_handlers => { item => sub { $self->process_item(\&_process_item,
);
  $twig->safe_parse ($xml);
  my $err = $@;
  ### $err

  # Try to fix bad non-ascii chars by putting it through Encode::from_to().
  # Encode::FB_DEFAULT substitutes U+FFFD when going to unicode, or question
  # mark "?" going to non-unicode.  Mozilla does some sort of similar
  # liberal byte interpretation so as to at least display something from a
  # dodgy feed.
  #
  if ($err && $err =~ /not well-formed \(invalid token\) at (line \d+, column \d+, byte (\d+))/) {
    my $where = $1;
    my $byte = ord(substr($xml,$2,1));
    if ($byte >= 128) {
      my $charset = $twig->encoding // 'utf-8';
      $self->verbose (1, sprintf ("parse error, attempt re-code $charset for byte 0x%02X\n", $byte));
      require Encode;
      my $recoded_xml = $xml;
      Encode::from_to($recoded_xml, $charset, $charset, Encode::FB_DEFAULT());

      $twig = XML::Twig->new (map_xmlns => $map_xmlns);
      if ($twig->safe_parse ($recoded_xml)) {
        $twig->root->set_att('rss2leafnode:fixup',
                             "Recoded bad bytes to charset $charset");
        print __x("Feed {url}\n  recoded {charset} to parse, expect substitutions for bad non-ascii\n  ({where})\n",
                  url     => $self->{'uri'},
                  charset => $charset,
                  where   => $where);
        undef $err;
      }
    }
  }

  # Or attempt to put it through XML::Liberal, if available.
  #
  if ($err) {
    my $liberal_xml = $self->xml_liberal_correction($xml);
    if (defined $liberal_xml) {
      ### reparse xml liberal fixup with twig ...
      $twig = XML::Twig->new (map_xmlns => $map_xmlns);
      if ($twig->safe_parse ($liberal_xml)) {
        ### now ok ...
        $err = Text::Trim::trim($err);
        $twig->root->set_att('rss2leafnode:fixup',
                             "XML::Liberal fixed: {error}",
                             error => $err);
        print __x("Feed {url}\n  parse error: {error}\n  continuing with repairs by XML::Liberal\n",
                  url   => $self->{'uri'},
                  error => $err);
        undef $err;
      }
    }
    ### now err: $err
  }

  if ($err) {
    # XML::Parser seems to stick some spurious leading whitespace on the error
    $err = Text::Trim::trim($err);

    $self->verbose (1, __x("Parse error on URL {url}\n{error}",
                           url   => $self->{'uri'},
                           error => $err));
    return (undef, $err);
  }

  # Strip any explicit "rss:" or "atom:" namespace down to bare part.
  # Should be unambiguous and is easier than giving tag names both with and
  # without the namespace.  Undocumented set_ns_as_default() might do this
  # ... or might not.
  #
  my $root = $twig->root;
  App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'atom');
  App::RSS2Leafnode::XML::Twig::Other::elt_tree_strip_prefix ($root, 'rss');

  # somehow map_xmlns mangles default attributes like "decimals=...", prefer
  # to see them without rss: or atom: -- maybe
  #   foreach my $child ($root->descendants_or_self) {
  #     foreach my $attname ($child->att_names) {
  #       if ($attname =~ /^(atom|rss):(.*)/) {
  #         $child->change_att_name($attname, $2);
  #       }
  #     }
  #   }

  ### add xml base
  if (defined $self->{'uri'} && ! $root->att_exists('xml:base')) {
    $root->set_att ('xml:base', $self->{'uri'});
  }

  ### success
  return ($twig, undef);
}