The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
use warnings;
use strict;

use HTML::Parser ();
use File::Glob   qw/bsd_glob/;
use XML::LibXML::Simple;

use Data::Dumper qw/Dumper/;
$Data::Dumper::Indent = 1;

my $iana_root   = 'sources/iana-media-types';
my $my_own      = 'sources/my-own';
my $apache      = 'sources/from-apache';
my $sitepoint   = 'sources/from-sitepoint';
my $stdicon     = 'sources/from-stdicon';
my $broofa      = 'sources/from-broofa';
my $freedesktop = 'sources/from-freedesktop';

my $history     = 'history';
my $distributed = 'lib/MIME/types.db';

my @iana_basetypes = qw{
  text/plain
  application/octet-stream
};
my %iana_basetypes = map +($_ => 1), @iana_basetypes;

sub iana();
sub my_own();
sub apache();
sub sitepoint();
sub stdicon();
sub broofa();
sub freedesktop();

sub simplify();
sub write_tables();
sub write_distributed();
sub keep_sources($);
sub scan_dist_version();
sub add_type($$$);

my $dist_version = scan_dist_version;
print "*** producing for release $dist_version\n";

my $current      = "$history/$dist_version";
-d $current or mkdir $current or die "$current: $!";

my $types_list   = "$current/types.csv";
my $ext_list     = "$current/ext.csv";
my $save_source  = "$current/sources";

# Collect the info, the order is important!
my (%mimes, %exts);
my $last_count   = 0;
iana();
my_own();
apache();
sitepoint();
stdicon();
broofa();
freedesktop();
#warn Dumper \%mimes;

# Build the tables
simplify();
write_tables();
write_distributed();
keep_sources($save_source);

exit 0;

#
### IANA
#
# Run updata_iana first
# The format of the iana files is not standardized, so it is hard to
# collect other info automatically.  Therefore, that info is manually
# added to my-own list.
#

sub iana()
{   print "* processing iana types\n";

    foreach my $fn (bsd_glob "$iana_root/*/index.html")
    {   print "  . parsing file $fn\n";
        my ($major) = $fn =~ m!/([^/]+)/index.html$!;

        open IANA, '<', $fn or die "$fn: $!";
        my @page    = <IANA>;
        close IANA;

        my $types_in_page = 0;
        while(@page)
        {   $page[0] =~ m!^\<tr! or next;
            $page[1] =~ m!^\<td\>\&nbsp;! or next;
            $page[2] =~ m!^\<td\>(?:\<a[^>]*\>)?([^<]+)! or next;
            my $minor = $1;
            $page[3] =~ m!^\<td\>! or next;
            splice @page, 0, 3;

            $minor =~ s/\s+.*//;  # deprecation, etc
            my $type = "$major/$minor";
            my $info = add_type $type, 'iana', [];
            $types_in_page++;
        }
        continue
        {   shift @page;
        }

        print "    found $types_in_page types for $major\n";
    }

    $last_count = keys %mimes;
    print "  . found $last_count registered types\n";
}

#
### MY-OWN
#
# Based on years of existince of the MIME::Types module
#

sub my_own()
{   print "* processing my old list\n";

   # Exceptions
   # vms:text/plain;doc;8bit
   # mac:application/x-macbase64;;bin
   #
   # IE6 bug
   # image/pjpeg;;base64

    open OWN, '<', $my_own or die $!;
    while(<OWN>)
    {   chomp;
        next if /^#|^\s*$/;
        my ($type, $ext, $enc) = split /\;/;
        my @ext  = $ext ? (split /\,/, $ext) : ();
        my $info = add_type $type, 'my_own', \@ext;
        $info->{enc} = $enc if $enc;
    }
    close OWN;

    print "  . added ".(keys(%mimes) - $last_count)." types\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}

#
### from Apache
#
# Apache uses the table to automatically add a mime-type for files on
# disk, based on the filename extension.

sub apache()
{   print "* processing apache\n";

    my $found = 0;

    open APACHE, '<', $apache or die $!;
    while(<APACHE>)
    {   chomp;
        next if /^#|^\s*$/;
        my ($type, $ext) = split /\s+/;
        my @ext  = $ext ? (split / /, $ext) : ();
        my $info = add_type $type, 'apache', \@ext;
        $found++;
    }
    close APACHE;

    print "  . added ".(keys(%mimes) - $last_count)." new types from $found\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}

#
### from Sitepoint
#
# The list contains all discovered extension/type combination.  That
# results in too many options per extension, which should be filtered
# out later.

sub sitepoint()
{   print "* processing sitepoint\n";

    my $found = 0;

    open SITEP, '<', $sitepoint or die $!;
    while(<SITEP>)
    {   chomp;
        next if /^#|^\s*$/;
        my ($ext, $type, $comment) = split /\s+/;

        $ext =~ s/^\.//;
        my $info = add_type $type, 'sitepoint', [$ext];
        $found++;
    }
    close SITEP;

    print "  . added ".(keys(%mimes) - $last_count)." new types from $found\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}

#
### from Stdicon
#

sub stdicon()
{   print "* processing stdicon\n";

    my $found = 0;

    open STDICON, '<', $stdicon or die $!;
    while(<STDICON>)
    {   chomp;
        next if /^#|^\s*$/;
        my ($ext, $type, $comment) = split /\s+/;

        $ext =~ s/^\.//;
        add_type $type, 'stdicon', [$ext];
        $found++;
    }
    close STDICON;

    print "  . added ".(keys(%mimes) - $last_count)." types from $found\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}


#
### from github broofa
#

sub broofa()
{   print "* processing broofa\n";

    my $found = 0;

    open BROOFA, '<', $broofa or die $!;
    while(<BROOFA>)
    {   next if /^\#|^\s*$/;
        chomp;

        my ($type, @ext) = split;
        my $info = add_type $type, 'broofa', \@ext;
        $found++;
    }
    close BROOFA;

    print "  . added ".(keys(%mimes) - $last_count)." new types from $found\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}

#
### from github freedesktop
#

sub freedesktop()
{   print "* processing freedesktop\n";

    my $c = XML::LibXML::Simple->new->XMLin($freedesktop);
    my $mimes = $c->{'mime-type'} || [];
    foreach my $record (@$mimes)
    {   my $type = $record->{type} or die;
        my $glob = $record->{glob} || [];
        $glob    = [$glob] if ref $glob eq 'HASH';   # when only one pattern
        my @patt = map +($_->{pattern} =~ /^\*\.(.*)/ ? $1 : ()), @$glob;

        # remove regex patterns
        my @ext  = grep !/[^a-zA-Z0-9+\-]/, @patt;
        my $info = add_type $type, 'freedesktop', \@ext;
    }

    print "  . added ".(keys(%mimes) - $last_count)." new types from "
      . @$mimes . "\n";
    $last_count = keys %mimes;
    print "  . now $last_count types\n";
}

#
### Simplify
#

sub simplify()
{   print "* simplifying\n";

    # order extensions
    my %back = map +($_ => 1), @iana_basetypes;
    foreach my $ext (keys %exts)
    {   my %seen;
        my @types   = grep !$seen{$_}++, @{$exts{$ext}};

        my @last    = grep $back{$_}, @types;
        $exts{$ext} = [ (grep !$back{$_}, @types), @last ];
    }

    foreach my $type (keys %mimes)
    {   my $info = $mimes{$type};

        # remove double extensions
        my %seen_ext;
        my @ext = grep !$seen_ext{$_}++, @{$info->{ext} || []};

        if($iana_basetypes{$type})
        {   @ext = grep $type eq $exts{$_}[0], @ext;
        }

        $info->{ext} = \@ext;
    }

    print "  . found ".(keys %exts)." extensions\n";
}

#
### Write
#

sub write_tables()
{   print "* write $types_list\n";
    open OUT, '>', $types_list or die $!;
    foreach my $type (sort keys %mimes)
    {   my $info = $mimes{$type};
        my $ext  = join ",", @{$info->{ext}};
        my $enc  = $info->{enc} || '';
        print OUT "$type;$ext;$enc\n";
    }
    close OUT;

    print "* write $ext_list\n";
    open OUT, '>', $ext_list or die $!;
    foreach my $ext (sort keys %exts)
    {   my $types = $exts{$ext};
        print OUT $ext, ';', join(',', @$types), "\n";
    }
    close OUT;
}

sub write_distributed()
{
    my %sets;
    foreach my $type (sort keys %mimes)
    {   my $info   = $mimes{$type};
        my ($major, $minor) = split m!/!, $type, 2;
        my @ext    = @{$info->{ext} || []};

        my $isIANA = $major =~ m{^x-} || $minor =~ m{^x-} ? '' : 'I';
        my $hasExt = @ext ? 'E' : '';
        my $ext    = join ',', @ext;
        my $enc    = $info->{enc} || '';
        push @{$sets{"$major:$isIANA:$hasExt"}}, "$minor;$ext;$enc";
    }
    foreach my $ext (sort keys %exts)
    {   my $types  = $exts{$ext};
        push @{$sets{EXTENSIONS}}, join(';', $ext, $types->[0]);
    }

    print "* write $distributed\n";
    open OUT, '>:encoding(utf8)', $distributed or die "$distributed: $!";
    foreach my $section (sort keys %sets)
    {   my $records = $sets{$section};
        print OUT join "\n"
           , @$records.":$section"
           , (sort @$records)
           , '', '';
    }
    close OUT;
}

sub keep_sources($)
{   my $fn = shift . '.tjz';
    print "* saving sources to $fn\n";
    system "tar cf - sources/| bzip2 -9v >$fn" and die $!;
}

my %iana_major;
sub add_type($$$)
{   my ($type, $from, $ext) = @_;
 
    my $simple = lc $type;
    my ($major, $minor) = $simple =~ m!^(?:x-)?([^/]+)/(?:x-)?(.+)$!;
    if($from eq 'iana')
    {   $iana_major{$major}++;
    }
    elsif(!$iana_major{$major})
    {   $simple = "x-$major/x-$minor";
    }
    elsif(!$mimes{$simple} && $minor !~ m/^(vnd|prs|x)\./)
    {   $simple = "$major/x-$minor";
    }

    my $info   = $mimes{$simple} ||= {};
    push @{$info->{ext}}, @$ext;
    $info->{by}{$from}++;

    push @{$exts{$_}}, $simple for @$ext;
    $info;
}

sub scan_dist_version()
{   open my($mf), '<', 'Makefile.PL' or die $!;
    while(<$mf>)
    {   return $1 if m/\$version\s*\=\s*['"]([^'"]+)'\s*;/;
    }
    die "version not found";
}