lib/URI2/URL.pm - metacpan.org

#
# Incompatibilities (+ after imcompatibilities that should probably be
# fixed) :
#
# . \0 is not permitted in strings, %00 is permitted.              +
# . in netloc the following is not possible because not
#   distinction is done between null string and undefined string.
# 
#   :0@h:0      missing user, auth part omitted (h:0)
#   :@h:0       missing user, auth part omitted (h:0)
#   foo:@h:0    missing passwd, passwd omitted (foo@h:0)           +
#
# . crack not implemented                                          +
# . derivation not implemented                                     +
# . leading . and .. are reduced
#      /./g       = <URL:http://a/./g>                             +
#      /../g      = <URL:http://a/../g>                            +
#
# . a/b/c//../d -> a/b/d instead of a/b/c/d. This is how
#   filesystems behave.
# . gopher: is an error. In the old implementation it was valid. 
# . file:a is the same as file:/a. When using Netscape, file:a has
#   no defined behaviour anyway.
# . In file://localhost/a/b/c localhost is implicit and resulting
#   string (as_string) will return file:/a/b/c
# . rel function will return "" instead of "./" for 
#   rel(http://a/b/c, http://a/b/c)
# 
# Escaping
#
# The escape char specification for uri_escape() may not contain
# intervals like \x00-\x20 but should contain all the characters
# explicitly.
#
# . $reserved 
# . $reserved_no_slash
# . $reserved_no_form
# . $unsafe
# . implicit
#   ($unsafe + some $reserved chars, not clear why not all reserved chars
#    were set)
#
# Per component escaping policy
#
# . path($value), params($value), query($value) escape $value before set
#   + return old value unescaped and check that no information is lost
#   when unescaping. No check is don't on $value, if it contains % the
#   % are escaped. $value must be unescaped. Specific chars are
#   escaped, not all : reserved_no_slash for path, reserved_no_form for query
#   and params.
#   Drawback: control chars are not escaped, no control on % loss when setting,
#   deep understanding necessary to get the expected result : must explicitly
#   call uri_escpape for accented letters and control chars. The intuitive
#   $object->path("a/b/c^ad/"); $object->epath() will give the original
#   string instead of the expected escaped "a/b/c%01d/".
#
# . epath($value), eparams($value), equery($value) store + retrieve verbatim
#   the name suggest that the values stored and retrieved must be escaped
#   and unescaped by the caller.
#   Drawback:
#
# . full_path
#   return path + param + query with 'unsafe' chars escaped
#   with aditional argument return path + param + query as is
#   Drawback: inconsistent with path(), params() and query() which return
#   the unescaped value. Escaping when setting path() params() and query() 
#   use 'reserved' chars instead of 'unsafe'. One would expect full_path
#   to be strictly equivalent to path() . ";" . params() . "?" . query()
#   which is not the case. Escaping is not done on a per component basis
#   and reserved chars are not escaped.
#
# . as_string is similar to full_path
#
# . path_component(@list) store in 'path' the path built from the 
#   @list, reserved chars quoted + '.'. Unescape each path component
#   of the existing path a list of them. This is approximately consistent 
#   with path().
#   Drawback: inconsistent with path() regarding the handling of . as
#   a reserved char. See also full path.
# 
# Absolute and relative path
#   
#   The situation is quite confuse regarding the leading / of the path.
#   Many functions contained special cases to restore the leading / when
#   'necessary'. This has been changed by introducing two functions that
#   handle path names given by the caller (path_in) and path names 
#   returned to the caller. A flag in the object specifies if the path
#   is absolute or relative. This is necessary because a path not beginning
#   with a / is absolute in an absolute URL and relative in a relative URL.
#   Therefore the path only does not allow to find out if it's absolute or
#   relative. 
#
# . Nulls in strings are not allowed, C library do not like them
#
# . If user is set to '' or passwd set to '' they do not show
#   See
#    $url->user('');
#    $url->_expect('netloc' => ':0@h:0');
#    $url->password('');
#    $url->_expect('netloc' => ':@h:0');
#    $url->user('foo');
#    $url->_expect('netloc' => 'foo:@h:0');
#   in t/base.t
#  
# . news : path is not overriden (no object orientation)
#
package URI2::URL;

$VERSION = "5.00";   

sub Version { $VERSION; }

require 5.004;

require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(url);

use Carp ();
use URIC;

#use strict 'vars';
#use strict 'subs';

use vars qw(
	    $Debug $Strict_URL $reserved $reserved_no_slash $reserved_no_form
	   );

$reserved          = ";\\/?:\\@&=+#%"; # fielding-03 reserved plus '#' and '%'
$reserved_no_slash = ";?:\\@&=+#%";    # used when escaping path
$reserved_no_form  = ";\\/?:\\@#%";    # used when escaping params and query

$Debug         = 0;     # set to 1 to print URLs on creation
$Strict_URL    = 0;     # see new()

use overload ( '""' => 'as_string', 'fallback' => 1 );

URICc::uri_mode_set($URICc::URI_MODE_LOWER_SCHEME | $URICc::URI_MODE_FIELD_MALLOC | $URICc::URI_MODE_URI_STRICT_SCHEME);

# Easy to use constructor
sub url ($;$)
{
    URI2::URL->new(@_);
}

sub new
{
    my($class, $init, $base) = @_;

    my $self;
    if (ref $init) {
	$self = $init->clone;
    } else {
	$init = "" unless defined $init;
	# RFC 1738 appendix suggest that we just ignore extra whitespace
	$init =~ s/\s+//g;
	# Also get rid of any <URL: > wrapper
	$init =~ s/^<(?:URL:)?(.*)>$/$1/;

	# hand-off to scheme specific implementation sub-class
	$self->{'_orig_url'} = $init if $Debug;
	$self->{'uri'} = URICc::uri_alloc($init, length($init));
	if(!defined($self->{'uri'})) {
	    Carp::croak("URI2::URL $URICc::uri_errstr")
	}
    }

    bless($self, $class);
    if($base) {
	if(!ref($base)) {
	    $base = URI2::URL->new($base);
	}
	#
	# Convert base to absolute form.
	#
	$base = URI2::URL->new($base->abs());
	$self->base($base);
	my($scheme) = URICc::uri_scheme($base->{'uri'});
	if($scheme && !URICc::uri_scheme($self->{'uri'})) {
	    $init = "$scheme:$init";
	    if(URICc::uri_realloc($self->{'uri'}, $init, length($init)) != $URI_CANNONICAL) {
		Carp::croak("URI2::URL $URICc::uri_errstr");
	    }
	    URICc::uri_scheme_set($self->{'uri'}, '');
	}
    }
    $self->print_on('STDERR') if $Debug;
    return $self;
}

sub clone
{
    my $self = shift;
    # this work as long as none of the components are references themselves
    return URI2::URL->new($self->as_string());
}

# This private method help us implement access to the elements in the
# URI2::URL object hash (%$self).  You can set up access to an element
# with a routine similar to this one:
#
#  sub component { shift->_elem('component', @_); }

%URL::elemc = (
	       'scheme' => 1,
	       'host' => 1,
	       'port' => 1,
	       'path' => 1,
	       'params' => 1,
	       'query' => 1,
	       'frag' => 1,
	       'user' => 1,
	       'passwd' => 1,
	       );
sub _elem
{
    my $self = shift;
    my $elem = shift;
    my($old);
    if($URL::elemc{$elem}) {
	my($access) = "URICc::uri_${elem}";
	$old = &$access($self->{'uri'});
	if (@_) {
	    my $set = "URICc::uri_${elem}_set";
	    my($elem) = shift;
	    $elem = '' if(!defined($elem));
	    &$set($self->{'uri'}, $elem);
	}
	undef($old) if(!length($old));
    } else {
	$old = $self->{$elem};
	if (@_) {
	    $self->{$elem} = shift;
	}
    }
    $old;
}

sub DESTROY {
    my($self) = @_;

    if(defined($self->{'uri'}))  {
      URICc::uri_free($self->{'uri'});
	undef($self->{'uri'});
    }
}

sub newlocal
{
    my($class, $file) = shift;
    return URI2::URL->new("file:$file");
}

sub strict
{
    return $Strict_URL unless @_;
    my $old = $Strict_URL;
    $Strict_URL = $_[0];
    $old;
}

sub base { shift->_elem('base', @_); }
sub scheme { shift->_elem('scheme', @_); }

sub crack
{
    my $self = shift;
    my($o) = $self->{'uri'};
    (URICc::uri_scheme($o),  	  # 0: scheme
     URICc::uri_user($o),          # 1: user
     URICc::uri_passwd($o),        # 2: passwd
     URICc::uri_host($o),          # 3: host
     URICc::uri_port($o),          # 4: port
     URICc::uri_path($o),          # 5: path
     URICc::uri_params($o),        # 6: params
     URICc::uri_query($o),         # 7: query
     URICc::uri_frag($o)           # 8: fragment
    )
}

sub abs
{
    my($self, $base, $allow_scheme_in_relative_urls) = @_;

    if($base) {
	$base = new URI2::URL($base);
    } else {
	$base = $self->base();
    }

    return $self->clone unless $base;            # we have no base (step1)

    URICc::uri_mode_set(URICc::uri_mode() & ~$URI_MODE_URI_STRICT_SCHEME) if($allow_scheme_in_relative_urls);

    my($absolute) = URICc::uri_uri(URICc::uri_abs_1($base->{'uri'}, $self->{'uri'}));
    my($ret) = URI2::URL->new($absolute);

    URICc::uri_mode_set(URICc::uri_mode() | $URI_MODE_URI_STRICT_SCHEME) if($allow_scheme_in_relative_urls);

    return $ret;
}

sub as_string { return URICc::uri_escape(URICc::uri_uri(shift->{'uri'}), $uri_escape_unsafe); }

# This is set up as an alias for various methods
sub bad_method {
    my $self = shift;
    my $scheme = $self->scheme;
    Carp::croak("Illegal method called for $scheme: URL")
	if $Strict_URL;
    # Carp::carp("Illegal method called for $scheme: URL")
    #     if $^W;
    undef;
}

sub print_on
{
    my($self) = @_;

    URICc::uri_dump($self->{'uri'});
}

sub epath {
    my($self) = shift;
    my($path) = @_;
    my($pathp);
    $pathp = 1 if(@_);

    #
    # Get the old path before changing to new because we may change the
    # relative_path flag.
    #
    my($path_out) = $self->path_out();
    if($pathp) {
	my($uri) = $self->{'uri'};
	#
	# Compute the path string and find out if it's a relative
	# or absolute path
	#
	my($path_in, $relative) = $self->path_in($path);
	#
	# Set the new path and relative_path flag if appropriate;
	#
	$self->_elem('path', $path_in);
        URICc::uri_info_set($uri, URICc::uri_info($uri) | $URI_INFO_RELATIVE_PATH) if($relative);
    }
    return $path_out;
}

sub path {
    my($self) = shift;

    my($path_out);
    if(@_) {
	my($path) = URICc::uri_escape($_[0], $reserved_no_slash);
	$path_out = $self->epath($path);
    } else {
	$path_out = $self->epath();
    }
    #
    # Sanity checks
    #
    Carp::croak("Path components contain '/' (you must call epath)")
	if $path_out =~ /%2[fF]/ and !@_;

    return URICc::uri_unescape($path_out);
}

sub params {
    my $self = shift;
    my $old = $self->_elem('params', map {URICc::uri_escape($_,$reserved_no_form)} @_);
    return URICc::uri_unescape($old) if defined $old;
    undef;
}

sub query {
    my $self = shift;
    my $old = $self->_elem('query', map { URICc::uri_escape($_, $reserved_no_form) } @_);
    if (defined(wantarray) && defined($old)) {
	if ($old =~ /%(?:26|2[bB]|3[dD])/) {  # contains escaped '=' '&' or '+'
	    my $mess;
	    for ($old) {
		$mess = "Query contains both '+' and '%2B'"
		  if /\+/ && /%2[bB]/;
		$mess = "Form query contains escaped '=' or '&'"
		  if /=/  && /%(?:3[dD]|26)/;
	    }
	    if ($mess) {
		Carp::croak("$mess (you must call equery)");
	    }
	}
	# Now it should be safe to unescape the string without loosing
	# information
	return URICc::uri_unescape($old);
    }
    undef;
}

sub host {
    my($self) = shift;
    my($tmp) = $self->_elem('host', map { URICc::uri_escape($_, $reserved) } @_);
    return defined($tmp) ? URICc::uri_unescape($tmp) : undef;
}

sub port {
    my($self) = shift;
    my($tmp) = $self->_elem('port', map { URICc::uri_escape($_, $reserved) } @_);
    return defined($tmp) ? URICc::uri_unescape($tmp) : undef;
}

sub eparams { shift->_elem('params', @_); }
sub equery { shift->_elem('query', @_); }
sub frag { shift->_elem('frag', @_); }

sub user {
    my($self) = shift;
    my($tmp) = $self->_elem('user', map { URICc::uri_escape($_, $reserved) } @_);
    return defined($tmp) ? URICc::uri_unescape($tmp) : undef;
}

sub password {
    my($self) = shift;
    my($tmp) = $self->_elem('passwd', map { URICc::uri_escape($_, $reserved) } @_);
    return defined($tmp) ? URICc::uri_unescape($tmp) : undef;
}

sub netloc {
    return URICc::uri_auth_netloc(shift->{'uri'});
}

sub auth {
    my($auth) = URICc::uri_auth(shift->{'uri'});
    return length($auth) > 0 ? $auth : undef;
}

sub full_path {
  URICc::uri_escape(URICc::uri_all_path(shift->{'uri'}), $uri_escape_unsafe);
}

sub default_port {
  URICc::uri_default_port(shift->{'uri'});
}

sub path_components {
    my $self = shift;

    my($path_out);
    if(@_) {
	my($path) = join("/", map { URICc::uri_escape($_, $reserved.".") } @_);
	$path_out = $self->epath($path);
    } else {
	$path_out = $self->epath();
    }

    map { URICc::uri_unescape($_) } split("/", $path_out, -1);
}

#
# The path given may be
# . absolute path if url is absolute. If the path begins with a / 
#   or not, it is absolute anyway since a relative path means nothing
#   in an absolute url
# . relative path if url is relative AND path does not start with a / 
# . absolute path if url is relative AND path starts with a / 
#
# Returns ($path, $relative) where $path is the string that may
# be inserted in the path component and $relative is set if it's a
# relative path, unset if not.
#
sub path_in {
    my($self, $path) = @_;
    my($uri) = $self->{'uri'};
    my($relative);

    return if(!defined($path));

    if(URICc::uri_info($uri) & $URI_INFO_RELATIVE) {
	$relative = 1 if($path !~ m|^/|);
    }
    $path =~ s|^/||;

    return ($path, $relative);
}

#
# Returns the path component with
# . Leading / added if the path is absolute, that is if the url is
#   absolute or if the url is relative and the path is absolute.
# . No leading / added if the url is relative and the path is relative.
#
sub path_out {
    my($self) = @_;
    my($path) = $self->_elem('path');
    my($uri) = $self->{'uri'};

    if((URICc::uri_info($uri) & $URI_INFO_RELATIVE) == 0 ||
       ((URICc::uri_info($uri) & $URI_INFO_RELATIVE) &&
	(URICc::uri_info($uri) & $URI_INFO_RELATIVE_PATH) == 0)) {
	if(!defined($path)) {
	    $path = "/";
	} else {
	    $path = "/$path";
	}
    }
    return $path;
}

sub eq {
    my($self, $other) = @_;

    $other = URI2::URL->new($other) if(!ref($other));

    my($a) = URICc::uri_cannonical($self->{'uri'});
    if(!defined($a)) {
      Carp::croak($self->as_string() . " cannot be cannonicalized");
    }
    my($b) = URICc::uri_cannonical($other->{'uri'});
    if(!defined($b)) {
      Carp::croak($other->as_string() . " cannot be cannonicalized");
    }

    return URICc::uri_escape(URICc::uri_uri($a), $uri_escape_unsafe) eq URICc::uri_escape(URICc::uri_uri($b), $uri_escape_unsafe);
}

sub rel {
    my($self, $base) = @_;
    my($rel) = $self->clone;
    $base = $self->base unless $base;
    return $rel unless $base;
    $base = new URI2::URL $base unless ref $base;

    $rel->base($base);
    my($scheme) = $rel->scheme();
    my($netloc) = $rel->netloc();
    if (!defined($scheme) && !defined($netloc)) {
	# it is already relative
	return $rel;
    }
    my($bscheme) = $base->scheme();
    my($bnetloc) = $base->netloc();

    for ($bscheme, $bnetloc, $netloc) { $_ = '' unless defined }

    unless ($scheme eq $bscheme && $netloc eq $bnetloc) {
	# different location, can't make it relative
	return $rel;
    }

    my($path) = $rel->path();
    my($bpath) = $base->path();

    for ($path, $bpath) {  $_ = "/$_" unless m,^/,; }

    # Make it relative by eliminating scheme and netloc
    $rel->scheme(undef);
    $rel->host(undef);
    $rel->port(undef);
    $rel->user(undef);
    $rel->password(undef);

    # This loop is based on code from Nicolai Langfeldt <janl@ifi.uio.no>.
    # First we calculate common initial path components length ($li).
    my $li = 1;
    while (1) {
	my $i = index($path, '/', $li);
	last if $i < 0 ||
                $i != index($bpath, '/', $li) ||
	        substr($path,$li,$i-$li) ne substr($bpath,$li,$i-$li);
	$li=$i+1;
    }
    # then we nuke it from both paths
    substr($path, 0,$li) = '';
    substr($bpath,0,$li) = '';

    if ($path eq $bpath && defined($rel->frag) && !defined($rel->equery)) {
        $rel->epath('');
    } else {
        # Add one "../" for each path component left in the base path
        $path = ('../' x $bpath =~ tr|/|/|) . $path;
#	$path = "./" if $path eq "";
        $rel->epath($path);
    }

    $rel->print_on('STDERR');
    $rel;
}

#
# Http specific
#
sub keywords {
    my($self) = shift;
    my($old) = $self->equery();
    if (@_) {
	# Try to set query string
	$self->equery(join('+', map { URICc::uri_escape($_, $reserved) } @_));
    }
    return if !defined($old) || !defined(wantarray);

    Carp::croak("Query is not keywords") if $old =~ /=/;
    map { URICc::uri_unescape($_) } split(/\+/, $old);
}

# Handle ...?foo=bar&bar=foo type of query
sub query_form {
    my $self = shift;
    my($old) = $self->equery();
    if (@_) {
	# Try to set query string
	my @query;
	my($key,$vals);
	while (($key,$vals) = splice(@_, 0, 2)) {
	    $key = '' unless defined $key;
	    $key =  URICc::uri_escape($key, $reserved);
	    $key =  URICc::uri_escape($key, $uri_escape_unsafe);
	    $vals = [$vals] unless ref($vals) eq 'ARRAY';
	    my $val;
	    for $val (@$vals) {
		$val = '' unless defined $val;
		$val = URICc::uri_escape($val, $reserved);
		$val = URICc::uri_escape($val, $uri_escape_unsafe);
		push(@query, "$key=$val");
	    }
	}
	$self->equery(join('&', @query));
    }
    return if !defined($old) || length($old) == 0 || !defined(wantarray);
    Carp::croak("Query is not a form") unless $old =~ /=/;
    map { s/\+/ /g; URICc::uri_unescape($_) }
	 map { /=/ ? split(/=/, $_, 2) : ($_ => '')} split(/&/, $old);
}

#
# Gopher specific
#
sub gtype { shift->_elem('user', @_); }
sub selector { shift->_elem('params', @_); }
sub search { shift->_elem('query', @_); }
sub string { shift->_elem('frag', @_); }

#
# Mailto specific
# 
sub address {
    my($self, $value) = @_;

    my($old) = URICc::uri_user($self->{'uri'}) . "@" . URICc::uri_host($self->{'uri'});
    if($value) {
	my($uri) = "mailto:$value";
        URICc::uri_realloc($self->{'uri'}, $uri, length($uri));
    }
    return $old;
}

*encoded822addr = \&address;

#
# News and nntp specific
#
sub article {
    my($self) = shift;
    my($func) = $self->scheme() . "_article";
    return &$func($self, @_);
}

sub group {
    my($self) = shift;
    my($func) = $self->scheme() . "_group";
    return &$func($self, @_);
}

#
# News specific
#
sub groupart {
    my $self = shift;
    my $old = $self->_elem('path');
    if (@_) {
	my $p = shift;
	if (defined $p && $p =~ /\@/) {
	    # it is an message id
	    $p =~ s/^<(.*)>$/$1/;  # "<" and ">" should not be part of it
	}
	$self->_elem('path', $p);
    }
    return $old;
}

sub news_article   {
    my $self = shift;
    Carp::croak("Illegal article id name (does not contain '\@')")
      if @_ && $_[0] !~ /\@/;
    my $old = $self->groupart(@_);
    return undef if $old !~ /\@/;
    $old;
}

sub news_group {
    my $self = shift;
    Carp::croak("Illegal group name (contains '\@')")
      if @_ && $_[0] =~ /\@/;
    my $old = $self->groupart(@_);
    return undef if $old =~ /\@/;
    $old;
}

#
# Nntp specific
#
sub nntp_group { shift->_elem('path', @_); }
sub nntp_article { shift->_elem('params', @_); }
sub digits { shift->_elem('params', @_); }

#
# Wais specific
#
sub database { shift->_elem('path', @_); }
sub wtype { shift->_elem('params', @_); }
sub wpath { shift->_elem('frag', @_); }
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)