package Parse::MediaWikiDump::Pages;
our $VERSION = '1.0.4';
use base qw(Parse::MediaWikiDump::Revisions);
use strict;
use warnings;
use Scalar::Util qw(weaken);
#the only difference between this class and ::Revisions is that
#this class enforces a single revision per page node
sub new_accumulator_engine {
    my ($self) = @_;
    weaken($self);

    my $f = Parse::MediaWikiDump::XML::Accumulator->new;
    my $store_siteinfo = $self->{SITEINFO};
    my $store_page = $self->{PAGE_LIST};

    my $root = $f->root;
    my $mediawiki = $f->node('mediawiki', Start => \&handle_mediawiki_node);

    #stuff for siteinfo
    my $siteinfo = $f->node('siteinfo', End => sub { %$store_siteinfo = %{ $_[1] } } );
    my $sitename = $f->textcapture('sitename');
    my $base = $f->textcapture('base');
    my $generator = $f->textcapture('generator');
    my $case = $f->textcapture('case');
    my $namespaces = $f->node('namespaces', Start => sub { $_[1]->{namespaces} = []; } );
    my $namespace = $f->node('namespace', Character => \&save_namespace_node);

    #stuff for page entries
    my $page = $f->node('page', Start => sub { $_[0]->accumulator( {} ) } );
    my $title = $f->textcapture('title');
    my $id = $f->textcapture('id');
    my $revision = $f->node('revision',
        Start => sub { $_[1]->{minor} = 0 },
        End => sub {
            #enforce the single-revision-per-page constraint that
            #distinguishes this class from ::Revisions
            if (defined($_[1]->{seen_revision})) {
                $self->{DIE_REQUESTED} = "only one revision per page is allowed";
            }

            $_[1]->{seen_revision} = 1;

            push(@$store_page, { %{ $_[1] } } );
        } );
    my $rev_id = $f->textcapture('id', 'revision_id');
    my $minor = $f->node('minor', Start => sub { $_[1]->{minor} = 1 } );
    my $time = $f->textcapture('timestamp');
    my $contributor = $f->node('contributor');
    my $username = $f->textcapture('username');
    my $ip = $f->textcapture('ip', 'userip');
    my $contrib_id = $f->textcapture('id', 'userid');
    my $comment = $f->textcapture('comment');
    my $text = $f->textcapture('text');
    my $restr = $f->textcapture('restrictions');

    #put together the tree
    $siteinfo->add_child($sitename, $base, $generator, $case, $namespaces);
    $namespaces->add_child($namespace);

    $page->add_child($title, $id, $revision, $restr);
    $revision->add_child($rev_id, $time, $contributor, $minor, $comment, $text);
    $contributor->add_child($username, $ip, $contrib_id);

    $mediawiki->add_child($siteinfo, $page);
    $root->add_child($mediawiki);

    my $engine = $f->engine($root, {});

    return $engine;
}
#delegate these handlers to the implementations in the parent class
sub handle_mediawiki_node {
    return Parse::MediaWikiDump::Revisions::handle_mediawiki_node(@_);
}

sub save_namespace_node {
    return Parse::MediaWikiDump::Revisions::save_namespace_node(@_);
}
1;
__END__
=head1 NAME
Parse::MediaWikiDump::Pages - Object capable of processing dump files with a single revision per article
=head1 ABOUT
This object is used to access the metadata associated with a MediaWiki instance and provides an iterative
interface for extracting the individual articles from the dump file. This module does not allow more than
one revision per article; to parse a comprehensive (full-history) dump file use the
Parse::MediaWikiDump::Revisions object.
=head1 SYNOPSIS

  $pmwd = Parse::MediaWikiDump->new;
  $pages = $pmwd->pages('pages-articles.xml');
  $pages = $pmwd->pages(\*FILEHANDLE);

  #print the title and id of each article inside the dump file
  while(defined($page = $pages->next)) {
    print "title '", $page->title, "' id ", $page->id, "\n";
  }
=head1 STATUS
This software is being RETIRED; MediaWiki::DumpFile is the official successor to
Parse::MediaWikiDump and includes a compatibility library called MediaWiki::DumpFile::Compat
that is 100% API compatible and is a near-perfect stand-in for this module. It is faster
in all instances where it counts and is actively maintained. Any undocumented deviation
of MediaWiki::DumpFile::Compat from Parse::MediaWikiDump is considered a bug and will
be fixed.
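Migration is advertised as a drop-in change; a minimal sketch, assuming the
behavior documented by MediaWiki::DumpFile::Compat:

  #load the compatibility layer in place of Parse::MediaWikiDump
  use MediaWiki::DumpFile::Compat;

  #existing code continues to work unchanged
  my $pmwd  = Parse::MediaWikiDump->new;
  my $pages = $pmwd->pages('pages-articles.xml');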
=head1 METHODS
=over 4
=item $pages->new
Opens the specified MediaWiki dump file. If the single argument to this method
is a string, it is used as the path of the file to open. If the argument is a
reference to a filehandle, the contents are read from that filehandle instead.
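A minimal sketch of both invocation styles; decompressing through an external
bzcat process is an assumption added here to illustrate filehandle input:

  my $pmwd = Parse::MediaWikiDump->new;

  #open by path
  my $pages = $pmwd->pages('pages-articles.xml');

  #open by filehandle; the pipe to bzcat is hypothetical,
  #not something this module requires
  open(my $fh, '-|', 'bzcat', 'pages-articles.xml.bz2') or die $!;
  my $pages_fh = $pmwd->pages($fh);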
=item $pages->next
Returns the next available Parse::MediaWikiDump::page object, or undef
when there are no more articles left.
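A short sketch of the iteration idiom; per the Parse::MediaWikiDump::page
documentation, the text method returns a reference to a string holding the
article text:

  while (defined(my $page = $pages->next)) {
      #title is a plain string, text is a reference to the article body
      my $title = $page->title;
      my $text  = ${ $page->text };
      print "$title is ", length($text), " characters long\n";
  }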
=item $pages->version
Returns a plain text string containing the dump file format revision number.
=item $pages->sitename
Returns a plain text string that is the name of the MediaWiki instance.
=item $pages->base
Returns the URL of the instance's main article as a string.
=item $pages->generator
Returns a string containing 'MediaWiki' and the version number of the instance that dumped this file.
Example: 'MediaWiki 1.14alpha'
=item $pages->case
Returns a string describing the case sensitivity configured in the instance.
=item $pages->namespaces
Returns a reference to an array of array references. In each inner array the
first element is the unique numeric identifier of the namespace and the second
element is a string containing the name of the namespace.
=item $pages->namespaces_names
Returns a reference to an array containing the name of each namespace as a string.
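A brief sketch of walking both representations:

  #each entry from namespaces is a [ numeric id, namespace name ] pair
  foreach my $ns (@{ $pages->namespaces }) {
      my ($ns_id, $ns_name) = @$ns;
      print "$ns_id => $ns_name\n";
  }

  #namespaces_names carries the same names without the identifiers
  print join(', ', @{ $pages->namespaces_names }), "\n";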
=item $pages->current_byte
Returns the number of bytes that have been processed so far.
=item $pages->size
Returns the total size of the dump file in bytes.
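Combined with current_byte this supports simple progress reporting; a minimal
sketch, assuming size yields a usable value only when the input is a file
rather than a filehandle:

  if (my $total = $pages->size) {
      my $percent = int($pages->current_byte / $total * 100);
      print STDERR "$percent% complete\n";
  }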
=back
=head1 EXAMPLE

=head2 Scan an article dump file for double redirects that exist in the most recent article revision
  #!/usr/bin/perl

  #progress information goes to STDERR, a list of double redirects found
  #goes to STDOUT

  binmode(STDOUT, ":utf8");
  binmode(STDERR, ":utf8");

  use strict;
  use warnings;
  use Parse::MediaWikiDump;

  my $file = shift(@ARGV);
  my $pmwd = Parse::MediaWikiDump->new;
  my $pages;
  my $page;
  my %redirs;
  my $artcount = 0;
  my $file_size;
  my $start = time;

  if (defined($file)) {
      $file_size = (stat($file))[7];
      $pages = $pmwd->pages($file);
  } else {
      print STDERR "No file specified, using standard input\n";
      $pages = $pmwd->pages(\*STDIN);
  }

  #the case of the first letter of titles is ignored - force this option
  #because the other values of the case setting are unknown
  die 'this program only supports the first-letter case setting' unless
      $pages->case eq 'first-letter';
print STDERR "Analyzing articles:\n";
while(defined($page = $pages->next)) {
update_ui() if ++$artcount % 500 == 0;
#main namespace only
next unless $page->namespace eq '';
next unless defined($page->redirect);
my $title = case_fixer($page->title);
#create a list of redirects indexed by their original name
$redirs{$title} = case_fixer($page->redirect);
}
my $redir_count = scalar(keys(%redirs));
print STDERR "done; searching $redir_count redirects:\n";
my $count = 0;
#if a redirect location is also a key to the index we have a double redirect
foreach my $key (keys(%redirs)) {
my $redirect = $redirs{$key};
if (defined($redirs{$redirect})) {
print "$key\n";
$count++;
}
}
print STDERR "discovered $count double redirects\n";
  #removes any case sensitivity from the very first letter of the title
  #but not from the optional namespace name
  sub case_fixer {
      my $title = shift;

      #check for namespace
      if ($title =~ /^(.+?):(.+)/) {
          $title = $1 . ':' . ucfirst($2);
      } else {
          $title = ucfirst($title);
      }

      return $title;
  }
  sub pretty_bytes {
      my $bytes = shift;
      my $pretty = int($bytes) . ' bytes';

      if (($bytes = $bytes / 1024) > 1) {
          $pretty = int($bytes) . ' kilobytes';
      }

      if (($bytes = $bytes / 1024) > 1) {
          $pretty = sprintf("%0.2f", $bytes) . ' megabytes';
      }

      if (($bytes = $bytes / 1024) > 1) {
          $pretty = sprintf("%0.4f", $bytes) . ' gigabytes';
      }

      return $pretty;
  }

  sub pretty_number {
      my $number = reverse(shift);
      $number =~ s/(...)/$1,/g;
      $number = reverse($number);
      $number =~ s/^,//;

      return $number;
  }
  sub update_ui {
      my $seconds = time - $start;
      my $bytes = $pages->current_byte;

      print STDERR " ", pretty_number($artcount), " articles; ";
      print STDERR pretty_bytes($bytes), " processed; ";

      if (defined($file_size)) {
          my $percent = int($bytes / $file_size * 100);
          print STDERR "$percent% completed\n";
      } else {
          #guard against division by zero during the first second of the run
          my $bytes_per_second = int($bytes / ($seconds || 1));
          print STDERR pretty_bytes($bytes_per_second), " per second\n";
      }
  }
=head1 LIMITATIONS

=head2 Version 0.4

This class has been updated to support version 0.4 dump files from a
MediaWiki instance, but it does not currently expose any of the new
information available in those files.