The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
1. More documentation is needed.
2. Some interfaces may change in the future.

----------------------- cut of sources
	$self->{Url_Last} = $url;
	$self->{Url_Base} = $res->base->as_string;
	$self->{Url_Code} = $res->code;
	$self->{Url_Title} = $res->headers->title;
	$self->{Url_Content_Type} = $res->headers->content_type;
	$self->{Url_Last_Modified} = $res->headers->last_modified;
	$self->{Url_Last_Modified_Iso} = HTTP::Date::time2iso($res->headers->last_modified);
$res->content
$res->is_success

$str="<Test>\nThis is a Test\r\n;<!-- Abc\r \n-->";
is($bot->get_url_textok($str), "<ln> <Test><ln>This is a Test<ln>;<!-- Abc--><ln>", "function get_url_text_ok");

$bot=&recreate($testdir);
$str=<<STR;
Before <!---
Test of mark <font>FONTS</font>
---> Inner
<script src="Test">display();</script>
Another   Space
STR
$str=~s/(\r\n|\n|\r)/<ln>/g;
is($bot->parse_empty($str), "Before Inner<ln><ln>Another Space<ln>", "function parse_empty");
$bot->{REMOVE_LEADING_SPACES}=1;
$str="<ln>    a good<ln>  dog<ln>  must<ln>  be<ln>  good.<ln>";
is($bot->parse_leadingspace($str), "<ln>  a good<ln>dog<ln>must<ln>be<ln>good.<ln>", "function parse_leadingspace");
use WWW::BookBot::Chinese;
$bot=new WWW::BookBot::Chinese;
$bot->initialize;
$bot->{TEXTREMOVEINNERSPACE}=1;
$str="A b o u t  a  ÖÐgreat ¹ú ÊÇ Ò» ¸ö and  w o n d e r f u l  i d e a.<ln>";
is($bot->parse_innerspace($str), "About a great and wonderful idea.<ln>", "function parse_innerspace");


#print Dumper($bot);



#-----------------------------------------------
# get_url_ok: dispatch received contents by content type
#-----------------------------------------------
# $str			contents
#---RETURN result of get_url_textok when $self->{Url_Content_Type}
#          is text/html or text/plain, otherwise of get_url_binok
#-----------------------------------------------
sub get_url_ok {
	my $self = shift;
	my $type = $self->{Url_Content_Type};

	# Only these two content types are treated as text.
	my $is_text = ($type eq 'text/html' || $type eq 'text/plain');

	# @_ is handed on untouched so the handler sees the caller's arguments.
	return $self->get_url_textok(@_) if $is_text;
	return $self->get_url_binok(@_);
}

#-----------------------------------------------
# get_url_textok: prepare received text contents by replace \r\n as <ln>
#-----------------------------------------------
# $str			contents
#---RETURN contents: decoded with $self->de_code, tags that were split
#          over several lines joined back into one line, every remaining
#          line break (\r\n, \r or \n) replaced by <ln>, and the whole
#          text wrapped in a leading and a trailing <ln>
#-----------------------------------------------
sub get_url_textok {
	my $self = shift;
	my $str = $self->de_code($_[0]);
	my $nl = qr/\r\n|\r|\n/;

	# Join tags written over several lines BEFORE line breaks become <ln>.
	# Once the markers are inserted they contain '<' and '>' themselves, so
	# a tag with text on three or more lines could no longer be matched as
	# a single <...> span and was left broken.
	# A run of line breaks separated only by spaces is dropped as a whole;
	# other spaces inside the tag are kept.
	$str =~ s{(<[^<>]*>)}{
		my $tag = $1;
		$tag =~ s/$nl(?: *$nl)*//g;
		$tag;
	}ge;

	$str =~ s/$nl/<ln>/g;
	return "<ln>$str<ln>";
}

#-----------------------------------------------
# parse_img: parser images
#-----------------------------------------------
# $base			base url
# $contents		html contents
#-----------------------------------------------
# Scans $contents for <img ...> tags, extracts each src attribute
# (double-quoted, single-quoted or unquoted), resolves it against $base
# with $self->url_rel2abs and passes the result to $self->parse_img_ok.
# Tags without a usable src are skipped.
#-----------------------------------------------
sub parse_img {
	my $self = shift;
	my $base = shift;
	my ($html, $src);
	while($_[0]=~/<img([^<>]*>)/ig) {
		$html=$1;
		$src='';
		# The attribute name is matched case-insensitively, like the tag
		# name above, so <IMG SRC="..."> is recognised as well.
		if( $html=~/src *= *\"([^\">]*)\"/i ) {
			$src=$1;
		} elsif( $html=~/src *= *\'([^\'>]*)\'/i ) {
			$src=$1;
		} elsif( $html=~/src *= *([^ >]*)( |>)/i ) {
			$src=$1;
		}
		# An empty src would resolve to the base url itself, which is not
		# an image; do not report it.
		next if $src eq '';
		$self->parse_img_ok($self->url_rel2abs($src, $base));
	}
}

#-----------------------------------------------
# parse_img_ok: found good image url
#-----------------------------------------------
# $url			image url
#-----------------------------------------------
# Called by parse_img once per image with the url returned by
# url_rel2abs. Placeholder: it only unpacks its arguments so far.
#-----------------------------------------------
sub parse_img_ok {
	my $self = shift;
	my ($url) = @_;
	# TODO: not finished - nothing is done with $url yet
}


application/octet-stream
text/plain


$p1="\$str=~\/^(?:$p1\)\$/x";

$str=$found if eval($p1);

	$str=$self->parse_removespace($str);
	if( $str=~/\n +[^ \n][^\n]*\n[^ \n][^\n]*\n[^ \n][^\n]*\n/sg ) {
		# "\n " -> <br>, "\n" -> no use
		$str=~s/\n +(?=[^ \n])/<br> /sg;
		$str=~s/\n//sg;
		$str=~s/<br>/\n/g;
	}


#-----------------------------------------------
# parse_innerspace: remove inner space
#-----------------------------------------------
# $content		contents
#---RETURN contents
#-----------------------------------------------
# Collapses letter-spaced text such as "w o n d e r f u l" into
# "wonderful". Active only when $self->{TEXTREMOVEINNERSPACE} is true.
# $self->{Pattern_Mark} is a regex fragment for marks that may follow a
# character; it may be undefined or empty.
# The contents are returned unchanged unless they contain at least seven
# consecutive "single character + optional marks + one space" units.
#-----------------------------------------------
sub parse_innerspace {
	my $self = shift;
	return $_[0] if not $self->{TEXTREMOVEINNERSPACE};

	# Optional marks after a character; left out completely when no mark
	# pattern is configured, so no undefined value is interpolated.
	my $mark = defined $self->{Pattern_Mark} ? $self->{Pattern_Mark} : '';
	my $marks = length($mark) ? "(?:$mark)*" : '';

	# Seven spaced-out characters in a row are needed to trigger the rewrite.
	my $trigger = "[^ <>]$marks " x 7;
	return $_[0] if $_[0] !~ /$trigger/;

	my $str = '';
	# The capture group in split keeps the separators (<ln> and runs of
	# two or more spaces) as pieces of their own.
	foreach my $piece (split /(<ln>| {2,})/, $_[0]) {
		if ($piece =~ /^  /) {
			# a run of spaces separates words: keep exactly one space
			$str .= ' ';
			next;
		}

		# a piece made only of single characters separated by single
		# spaces is one letter-spaced word: drop its spaces
		$piece =~ s/ //g
			if $piece =~ /^[^ ]$marks(?: [^ ]$marks)+$/;
		$str .= $piece;
	}

	return $str;
}

WWW::BookBot is a bot that fetches web e-texts organized into catalogs, books, and chapters. It can fetch and reformat multiple HTML pages into a single text file for reading.