# The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
package App::Wax;

use 5.008008;

use Digest::SHA qw(sha1_hex);
use File::Slurper qw(read_text write_text);
use File::Spec;
use File::Temp;
use Getopt::Long qw(GetOptionsFromArray :config posix_default require_order bundling no_auto_abbrev no_ignore_case);
use IPC::System::Simple qw(EXIT_ANY $EXITVAL systemx);
use LWP::UserAgent;
use Method::Signatures::Simple;
use MIME::Types;
use Mouse;
use Parallel::parallel_map qw(parallel_map);
use Pod::Usage qw(pod2usage);
use Try::Tiny qw(try catch);
use URI::Split qw(uri_split);

# NOTE this is the version of the *command* rather than the *module*, i.e.
# breaking API changes may occur here which aren't reflected in the SemVer since
# they don't break the behavior of the command
#
# XXX this declaration must be on a single line
# https://metacpan.org/pod/version#How-to-declare()-a-dotted-decimal-version
use version; our $VERSION = version->declare('v2.4.1');

# defaults
use constant {
    CACHE              => 0, # default for the `cache` attribute
    DEFAULT_USER_AGENT => 'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0',
    ENV_PROXY          => 1, # passed to LWP::UserAgent->new as `env_proxy`
    ENV_USER_AGENT     => $ENV{WAX_USER_AGENT}, # read once, when this file is compiled
    # matches a trailing file extension which is preceded by at least one other
    # character (case-insensitive). capture group 1 is the whole extension,
    # including its leading dot: a compound tar extension (e.g. ".tar.gz"),
    # ".c++"/".h++", or a dot followed by a run of word characters
    EXTENSION          => qr/.(\.(?:(tar\.(?:bz|bz2|gz|lzo|Z))|(?:[ch]\+\+)|(?:\w+)))$/i,
    INDEX              => '%s.index.txt', # sprintf template for the index file's name (see resolve_keep)
    MIRROR             => 0, # default for the `mirror` attribute
    NAME               => 'wax', # default for the `app_name` attribute
    SEPARATOR          => '--', # default for the `separator` attribute
    TEMPLATE           => 'XXXXXXXX', # appended to the app name to form the File::Temp template
    TIMEOUT            => 60, # default for the `timeout` attribute (passed to LWP::UserAgent)
    VERBOSE            => 0, # default for the `verbose` attribute
};

# the user agent set in the environment (if it's a true value) takes precedence
# over the built-in default
use constant USER_AGENT => ENV_USER_AGENT || DEFAULT_USER_AGENT;

# RFC 2616: "If the media type remains unknown, the recipient SHOULD treat
# it as type 'application/octet-stream'."
use constant DEFAULT_CONTENT_TYPE => 'application/octet-stream';

# resources with these mime-types may have their extension inferred from the
# path part of their URI
use constant INFER_EXTENSION => {
    'text/plain'               => 1,
    'application/octet-stream' => 1,
    'binary/octet-stream'      => 1,
};

# errors: these values are passed to `exit`, used as the pod2usage -exitval,
# or returned from `run`
use constant {
    OK                  =>  0,
    E_DOWNLOAD          => -1, # a URL couldn't be resolved or downloaded
    E_INVALID_DIRECTORY => -2, # the download directory couldn't be created
    E_INVALID_OPTIONS   => -3, # unparseable or conflicting options
    E_NO_COMMAND        => -4, # no command was supplied
};

# the name used to identify the app: it prefixes log messages, temp-file
# templates and cached filenames, and names the default cache directory
has app_name => (
    is      => 'rw',
    isa     => 'Str',
    default => NAME,
);

# --cache: reuse a previously downloaded file if it exists (see `download`).
# setting this attribute fires the `_check_keep` trigger
has cache => (
    is      => 'rw',
    isa     => 'Bool',
    default => CACHE,
    trigger => \&_check_keep,
);

# --directory: where downloaded files are saved. optional: if it's not set,
# the system's temp directory is used. setting this attribute fires the
# `_check_directory` trigger, which creates the directory if it doesn't exist
has directory => (
    is        => 'rw',
    isa       => 'Str',
    predicate => 'has_directory',
    required  => 0,
    trigger   => \&_check_directory,
);

# true if downloaded files should be kept rather than unlinked after the
# command has run. read-only from the outside: it's set via the private writer
# by `_check_keep`
has keep => (
    is       => 'ro',
    isa      => 'Bool',
    default  => 0,
    writer   => '_set_keep',
);

# the LWP::UserAgent instance used for all requests. built on first use by
# `_build_lwp_user_agent`
has _lwp_user_agent => (
    is      => 'rw',
    isa     => 'LWP::UserAgent',
    lazy    => 1,
    builder => '_build_lwp_user_agent',
);

# this should really be a class attribute, but there's no MouseX::ClassAttribute
# (on CPAN)
has mime_types => (
    is      => 'ro',
    isa     => 'MIME::Types',
    lazy    => 1,
    default => sub { MIME::Types->new() },
);

# --mirror: download via LWP::UserAgent's `mirror` method (see `download`).
# setting this attribute fires the `_check_keep` trigger
has mirror => (
    is      => 'rw',
    isa     => 'Bool',
    default => MIRROR,
    trigger => \&_check_keep,
);

# --separator: the token which ends URL processing: every argument after it is
# passed to the command verbatim (see `_parse`). cleared by --no-separator
has separator => (
    is        => 'rw',
    isa       => 'Str',
    default   => SEPARATOR,
    clearer   => 'clear_separator',
    predicate => 'has_separator',
);

# TODO make this private and read only, and rename it to something more
# descriptive, e.g. tempfile_template
#
# the File::Temp template for temporary files, e.g. "wax_XXXXXXXX". lazy because
# the default depends on `app_name`
has template => (
    is      => 'rw',
    isa     => 'Str',
    default => method () { sprintf('%s_%s', $self->app_name, TEMPLATE) },
    lazy    => 1,
);

# --timeout: the request timeout. the trigger forwards each new value to the
# LWP::UserAgent instance (building it first if it hasn't been built yet)
has timeout => (
    is      => 'rw',
    isa     => 'Int',
    default => TIMEOUT,
    trigger => method ($timeout) { $self->_lwp_user_agent->timeout($timeout) },
);

# --user-agent: the User-Agent string. the trigger forwards each new value to
# the LWP::UserAgent instance (building it first if it hasn't been built yet)
has user_agent => (
    is      => 'rw',
    isa     => 'Str',
    default => USER_AGENT,
    trigger => method ($user_agent) { $self->_lwp_user_agent->agent($user_agent) },
);

# --verbose: enable the diagnostic messages emitted by `debug`. the trigger
# turns on autoflush ($|) whenever the attribute is set, whatever the new value
has verbose => (
    is      => 'rw',
    isa     => 'Bool',
    default => VERBOSE,
    trigger => method ($verbose) { $| = 1 }, # unbuffer output
);

# trigger for the `directory` attribute: log the path and make sure it refers to
# a directory. a missing directory is created with a single mkdir call, so its
# parent must already exist; if it can't be created, log the reason and exit
# with E_INVALID_DIRECTORY
method _check_directory ($dir) {
    $self->debug("directory: $dir");

    return if -d $dir;     # already there: nothing to do
    return if mkdir $dir;  # created

    $self->log(ERROR => "Can't create directory (%s): %s", $dir, $!);
    exit E_INVALID_DIRECTORY;
}

# builder for the lazy `_lwp_user_agent` attribute: returns a new LWP::UserAgent
# configured with the current timeout and user-agent string, and with proxy
# settings taken from the environment if ENV_PROXY is true
method _build_lwp_user_agent {
    my %options = (
        env_proxy => ENV_PROXY,
        timeout   => $self->timeout,
        agent     => $self->user_agent,
    );

    return LWP::UserAgent->new(%options);
}

# trigger for the `cache` and `mirror` attributes: set `keep` to true if --cache
# or --mirror are set, but raise an error (and exit with E_INVALID_OPTIONS) if
# both are set.
#
# the trigger fires whenever either attribute is assigned, including when it's
# assigned a false value (e.g. `$wax->cache(0)`), so `keep` is only turned on
# if one of them is actually true. it's never turned off here, so a `keep`
# value which has already been set is left alone
method _check_keep {
    my $cache  = $self->cache;
    my $mirror = $self->mirror;

    if ($cache && $mirror) {
        $self->log(ERROR => "--cache and --mirror can't be used together");
        exit E_INVALID_OPTIONS;
    }

    if ($cache || $mirror) {
        $self->_set_keep(1);
    }
}

# delete each file in the supplied arrayref of paths. failures are logged as
# warnings rather than treated as fatal
method _unlink ($unlink) {
    for my $path (@$unlink) {
        # borrowed from File::Temp (may be needed on Windows)
        chmod 0600, $path;

        $self->debug('removing: %s', $path);

        unless (unlink $path) {
            $self->log(WARN => "Can't unlink %s: %s", $path, $!);
        }
    }
}

# takes a [$url, $url_index] pair, issues a HEAD request for the URL, and returns
# its content type. returns the default content type if the response doesn't
# declare one, or an empty string if the request fails
method content_type ($_url) {
    my ($url, $url_index) = @$_url;
    my $response = $self->_lwp_user_agent->head($url);

    return '' unless $response->is_success;

    # the initial (pre-semicolon) part of the mime-type, trimmed and lowercased.
    my $declared = $response->headers->content_type;

    if ($declared) {
        $self->debug('content-type (%d): %s', $url_index, $declared);
        return $declared;
    }

    $self->debug('content-type (%d): %s (default)', $url_index, DEFAULT_CONTENT_TYPE);

    return DEFAULT_CONTENT_TYPE;
}

# save the URL (a [$url, $url_index] pair) to the supplied filename. returns an
# error message if the download failed, or undef otherwise.
#
# there are three strategies:
#
#   cache:   skip the download if the file already exists
#   mirror:  let LWP decide whether the local copy is stale (a 304 response
#            means the local copy is up to date)
#   default: always download
method download ($_url, $filename) {
    my ($url, $url_index) = @$_url;
    my $ua = $self->_lwp_user_agent;

    # $fetched: 1 = downloaded; 0 = local copy reused; undef = request failed
    my ($error, $fetched, $response);

    if ($self->cache && (-e $filename)) {
        $fetched = 0;
    } elsif ($self->mirror) {
        $response = $ua->mirror($url, $filename);

        $fetched = $response->is_success  ? 1
                 : $response->code == 304 ? 0
                 :                          undef;
    } else {
        $response = $ua->get($url, ':content_file' => $filename);
        $fetched = 1 if $response->is_success;
    }

    unless (defined $fetched) {
        my $status = $response->status_line;
        $error = "can't download URL #$url_index ($url) to filename ($filename): $status";
        return $error;
    }

    $self->debug('download (%d): %s', $url_index, ($fetched ? 'yes' : 'no'));

    return $error;
}

# helper for `dump_command`: quote an argument for display on POSIX shells.
#
# the argument is wrapped in single quotes. each run of embedded single quotes
# is spliced in by closing the single-quoted string, double-quoting the run, and
# reopening the single-quoted string, e.g. it's -> 'it'"'"'s'. an empty pair of
# single quotes left over at either end of the result is removed
func _escape ($arg) {
    # https://stackoverflow.com/a/1250279
    # https://github.com/boazy/any-shell-escape/issues/1#issuecomment-36226734
    (my $quoted = $arg) =~ s!('{1,})!'"$1"'!g;

    $quoted = "'$quoted'";
    $quoted =~ s{^''|''$}{}g;

    return $quoted;
}

# --default-directory: save downloads in the app's directory under the user's
# XDG cache directory. File::BaseDir is only loaded if this option is used
method _use_default_directory () {
    require File::BaseDir;

    # "${XDG_CACHE_HOME:-$HOME/.cache}/wax"
    my @cache_home = File::BaseDir::cache_home($self->app_name);

    $self->directory(@cache_home);
}

# --version: print the command's version, followed by the input record
# separator, then exit with a status of 0
method _dump_version () {
    my @output = ($VERSION, $/);

    print @output;
    exit 0;
}

# write a message to stderr (via `warn`), prefixed with the app's name and the
# supplied log level, e.g. "wax: ERROR: ...". the template is only passed
# through sprintf if there are arguments to format, so a message without
# arguments may safely contain "%"
method log ($level, $template, @args) {
    my $message = $template;

    if (@args) {
        $message = sprintf($template, @args);
    }

    my $name = $self->app_name;

    warn "$name: $level: $message", $/;
}

# return a best-effort guess at the URL's file extension based on its content
# type, e.g. ".md" or ".tar.gz", or an empty string if one can't be determined.
# takes a [$url, $url_index] pair.
#
# if the content type is too generic to be useful (see INFER_EXTENSION), the
# extension is taken from the URL's path, provided the URL has no query string.
# otherwise (or if that fails), the first extension registered for the content
# type is used.
#
# XXX note: makes a network request to determine the content type
method extension ($_url) {
    my ($url, $url_index) = @$_url;
    my $extension = '';
    my $split = $self->is_url($url);

    return $extension unless ($split);

    # [scheme, domain, path, query, fragment]: only the path and query are needed
    my (undef, undef, $path, $query) = @$split;
    my $content_type = $self->content_type($_url);

    # the content type is an empty string if the HEAD request failed
    return $extension unless ($content_type);

    if (INFER_EXTENSION->{$content_type}) {
        # try to get a more specific extension from the path
        if (!defined($query) && $path && ($path =~ EXTENSION)) {
            # group 1 is always the full extension with its leading dot. ($+
            # would return the inner group, i.e. "tar.gz" rather than ".tar.gz",
            # for compound tar extensions)
            $extension = $1;
        }
    }

    unless ($extension) {
        # the lookup returns undef for content types MIME::Types doesn't know
        # about, so don't assume there's an object to query
        my $mime_type = $self->mime_types->type($content_type);
        my @extensions = defined($mime_type) ? $mime_type->extensions : ();

        if (@extensions) {
            $extension = '.' . $extensions[0];
        }
    }

    $self->debug('extension (%d): %s', $url_index, $extension);

    return $extension;
}

# if the supplied value can be parsed as a URL, return its components as an
# arrayref of [scheme, domain, path, query, fragment]; otherwise, return a
# falsey value
method is_url ($url) {
    # basic sanity check: the value must start with something resembling a scheme
    return unless $url =~ m{^[a-zA-Z][\w+]*://};

    my @parts = uri_split($url);
    my ($scheme, $domain, $path) = @parts;

    # no domain for file:// URLs
    return unless $scheme && ($domain || $path);

    return [@parts[0 .. 4]];
}

# write a diagnostic message to stderr (via `warn`), prefixed with the app's
# name, if the `verbose` attribute is true. the template is only passed through
# sprintf if there are arguments to format
method debug ($template, @args) {
    return unless $self->verbose;

    my $message = $template;

    if (@args) {
        $message = sprintf($template, @args);
    }

    my $name = $self->app_name;

    warn "$name: $message", $/;
}

# perform housekeeping after a download: replace the placeholder with the file
# path; push the path onto the delete list if it's a temporary file; and log any
# errors
#
# parameters:
#
#   resolved: an arrayref of [command index, filename, error]
#   command:  the command arrayref; the URL at the command index is replaced
#             with the filename
#   unlink:   an arrayref of paths to delete once the command has run
#
# returns E_DOWNLOAD if there was an error, or OK otherwise
#
# XXX give this a more descriptive name, e.g. _handle_download or _after_download
method _handle ($resolved, $command, $unlink) {
    my ($command_index, $filename, $error) = @$resolved;

    $command->[$command_index] = $filename;

    # the filename is undefined if the temporary file couldn't be created (see
    # resolve_temp). in that case there's nothing to clean up, and trying to
    # unlink an undefined path would just add a spurious warning to the error
    if (defined($filename) && !$self->keep) {
        push @$unlink, $filename;
    }

    if ($error) {
        $self->log(ERROR => $error);
        return E_DOWNLOAD;
    }

    return OK;
}

# render the command (an arrayref of arguments) as a single space-separated
# string, quoting any argument which contains a character outside a small safe
# set.
#
# this is purely for diagnostic purposes, i.e. there's no guarantee
# that the dumped command can be used as a command line. a better
# (but still imperfect/incomplete) implementation would require at
# least two extra modules: Win32::ShellQuote and String::ShellQuote:
# https://rt.cpan.org/Public/Bug/Display.html?id=37348
method dump_command ($args) {
    my @words;

    for my $arg (@$args) {
        my $needs_quoting = $arg =~ /[^0-9A-Za-z+,.\/:=\@_-]/;

        push @words, ($needs_quoting ? _escape($arg) : $arg);
    }

    return join(' ', @words);
}

# takes a URL (a [$url, $url_index] pair), works out where to save it, and
# downloads it. returns a $filename => $error pair (as a list in list context,
# or an arrayref otherwise) where the filename is the path to the saved file
# and the error is the first error message encountered while trying to download
# and save it. the download isn't attempted if the filename couldn't be
# determined
method resolve ($_url) {
    my $resolver = $self->keep ? 'resolve_keep' : 'resolve_temp';
    my ($filename, $error) = $self->$resolver($_url);

    unless ($error) {
        $error = $self->download($_url, $filename);
    }

    my @resolved = ($filename, $error);

    return wantarray ? @resolved : \@resolved;
}

# takes a URL (a [$url, $url_index] pair) and returns a $filename => $error pair
# for files which are kept (--cache or --mirror).
#
# the filename is derived from the app's name, the SHA-1 digest of the URL, and
# the URL's extension. determining the extension requires a network request for
# the content type, so, to avoid hitting the network for subsequent requests,
# the extension is cached in an index file alongside the download
method resolve_keep ($_url) {
    my ($url, $url_index) = @$_url;
    my $id = sprintf('%s_%s', $self->app_name, sha1_hex($url));
    my $directory = $self->has_directory ? $self->directory : File::Spec->tmpdir;
    my $index_file = File::Spec->catfile($directory, sprintf(INDEX, $id));
    my ($error, $extension);

    # -s: if /tmp is full, the index file may get written as an empty file, so
    # make sure it's non-empty
    my $indexed = -s $index_file;

    if ($indexed) {
        $self->debug('index (%d): %s (exists)', $url_index, $index_file);

        try {
            $extension = read_text($index_file);
        } catch {
            $error = "unable to load index #$url_index ($index_file): $_";
        };
    } else {
        $self->debug('index (%d): %s (create)', $url_index, $index_file);

        # XXX hits the network
        $extension = $self->extension($_url);

        try {
            write_text($index_file, $extension);
        } catch {
            $error = "unable to save index #$url_index ($index_file): $_";
        };
    }

    return (File::Spec->catfile($directory, "$id$extension"), $error);
}

# takes a URL (a [$url, $url_index] pair) and returns a $filename => $error pair
# for temporary files (i.e. files which will be automatically unlinked).
#
# the file is created by File::Temp in the download directory (or the system's
# temp directory if one hasn't been set), with the URL's extension as its
# suffix. File::Temp is told not to unlink it since it's removed explicitly
# after the command has run. the filename is undefined if the file couldn't be
# created
method resolve_temp ($_url) {
    my $extension = $self->extension($_url);
    my @location  = $self->has_directory ? (DIR => $self->directory) : (TMPDIR => 1);
    my @suffix    = $extension ? (SUFFIX => $extension) : ();

    my %options = (
        TEMPLATE => $self->template,
        UNLINK   => 0,
        @location,
        @suffix,
    );

    my ($filename, $error);

    try {
        srand($$); # see the File::Temp docs

        my $tempfile = File::Temp->new(%options);

        $filename = $tempfile->filename;
    } catch {
        $error = $_;
    };

    return ($filename, $error);
}

# parse the supplied arrayref of options and return a pair of:
#
#   command: an arrayref containing the command to execute and its arguments
#   resolve: an arrayref of [index, URL] pairs, where index refers to the URL's
#            (0-based) index in the command array, and URL is a pair of
#            [url, 1-based URL number]
#
# wax's own options are consumed first (the option handlers update the
# corresponding attributes). the remaining arguments make up the command.
# arguments which look like URLs are recorded for download until the separator
# (if any) is reached: everything after it is passed through untouched.
#
# prints a usage message and exits if the options are invalid or there's no
# command
method _parse ($argv) {
    my @argv = @$argv; # don't mutate the original

    my @spec = (
        'c|cache'             => sub { $self->cache(1) },
        'd|dir|directory=s'   => sub { $self->directory($_[1]) },
        'D|default-directory' => sub { $self->_use_default_directory },
        'h|?|help'            => sub { pod2usage(-input => $0, -verbose => 2, -exitval => 0) },
        'm|mirror'            => sub { $self->mirror(1) },
        's|separator=s'       => sub { $self->separator($_[1]) },
        'S|no-separator'      => sub { $self->clear_separator() },
        't|timeout=i'         => sub { $self->timeout($_[1]) },
        'u|user-agent=s'      => sub { $self->user_agent($_[1]) },
        'v|verbose'           => sub { $self->verbose(1) },
        'V|version'           => sub { $self->_dump_version },
    );

    unless (GetOptionsFromArray(\@argv, @spec)) {
        pod2usage(
            -exitval => E_INVALID_OPTIONS,
            -input   => $0,
            -verbose => 0,
        );
    }

    my (@command, @resolve);
    my $logged_settings = 0;

    ARG: while (@argv) {
        my $arg = shift(@argv);

        # the rest of the arguments are passed through as-is
        if ($self->has_separator && ($arg eq $self->separator)) {
            push @command, @argv;
            last ARG;
        }

        push @command, $arg;

        next ARG unless $self->is_url($arg);

        # log the request settings once, before the first URL
        unless ($logged_settings) {
            my $source = ENV_USER_AGENT ? ' (env)'  : '';
            $self->debug('user-agent%s: %s', $source, $self->user_agent);
            $self->debug('timeout: %d', $self->timeout);
            $logged_settings = 1;
        }

        my $url_index = @resolve + 1; # 1-based

        $self->debug('url (%d): %s', $url_index, $arg);

        push @resolve, [$#command, [$arg, $url_index]];
    }

    unless (@command) {
        pod2usage(
            -exitval => E_NO_COMMAND,
            -input   => $0,
            -msg     => 'no command supplied',
            -verbose => 0,
        )
    }

    return \@command, \@resolve;
}

# process the options and execute the command with substituted filenames
#
# parameters:
#
#   argv:    an arrayref of command-line arguments (not modified)
#   options: an optional hash of name/value pairs. if `test` is true, the
#            command isn't executed
#
# returns:
#
#   - a nonzero error code if any of the URLs couldn't be downloaded
#   - the command (an arrayref with filenames substituted for the URLs) in test
#     mode
#   - otherwise, the exit value reported by IPC::System::Simple for the command
#
# a single URL is downloaded in the current process; multiple URLs are
# downloaded in parallel. temporary files are removed before returning, except
# in test mode
method run ($argv, %options) {
    my $test = $options{test};
    my $error = 0;
    my $unlink = [];
    my ($command, $resolve) = $self->_parse($argv);

    if (@$resolve == 1) {
        my ($command_index, $_url) = @{ $resolve->[0] };
        my @resolved = $self->resolve($_url);

        $error = $self->_handle([$command_index, @resolved], $command, $unlink);
    } elsif (@$resolve) {
        $self->debug('jobs: %d', scalar(@$resolve));

        my @resolved = parallel_map {
            my ($command_index, $_url) = @$_;
            [$command_index, $self->resolve($_url)]
        } @$resolve;

        for my $resolved (@resolved) {
            # handle every result, even after a failure: otherwise the temporary
            # files for the remaining downloads are never queued for deletion
            # (and their errors are never logged). only the first error code is
            # kept
            my $status = $self->_handle($resolved, $command, $unlink);

            $error ||= $status;
        }
    }

    if ($error) {
        $self->debug('exit code: %d', $error);
        $self->_unlink($unlink);
        return $error;
    } elsif ($test) {
        return $command;
    } else {
        $self->debug('command: %s', $self->dump_command($command));

        try {
            # XXX hack to remove the "<error> in /path/to/App/Wax.pm line <line>"
            # noise. we just want the error message
            no warnings qw(redefine);
            local *IPC::System::Simple::croak = sub { die @_, $/ };
            systemx(EXIT_ANY, @$command);
        } catch {
            chomp;
            $self->log(ERROR => $_);
        };

        $self->debug('exit code: %d', $EXITVAL);
        $self->_unlink($unlink);

        return $EXITVAL;
    }
}

__PACKAGE__->meta->make_immutable();

1;