# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;
use File::Spec::Functions qw( catfile catdir );
use Encode qw( encode );
use Text::Wrap qw( wrap );

# Text::Wrap settings for the generated output: don't use tabs, and wrap at
# 78 columns.
$Text::Wrap::unexpand = 0;
$Text::Wrap::columns  = 78;

# Exactly two command-line arguments are required: the Snowball Subversion
# checkout directory and the destination directory.
if ( @ARGV != 2 ) {
    die "Usage: perl update_snowstop.pl SNOWBALL_SVN_CO LUCY_SNOWSTOP_DIR";
}
my ( $snow_co_dir, $dest_dir ) = @ARGV;

# Update to a particular rev of the Snowball repository.
die("Not a directory: '$snow_co_dir'") unless -d $snow_co_dir;
my $retval = system( "svn", "update", "-r", "541", $snow_co_dir );

# system() returns -1 when the program could not be started, and reports a
# terminating signal in the low 7 bits of the wait status.  Looking only at
# the exit code ($retval >> 8) would treat a signal-killed svn as a success,
# so every non-zero status is handled here.
if ( $retval == -1 ) {
    die "svn update failed: could not execute svn: $!";
}
elsif ( $retval & 127 ) {
    die sprintf( "svn update failed: svn died with signal %d",
        $retval & 127 );
}
elsif ( $retval != 0 ) {
    die sprintf( "svn update failed: svn exited with status %d",
        $retval >> 8 );
}

# Create (or truncate) source/snowball_stoplists.c beneath the destination
# directory, then emit the fixed preamble of the generated C file: a
# "do not edit" banner followed by the SnowballStopFilter header include.
my $outpath = catfile( $dest_dir, 'source', 'snowball_stoplists.c' );
open my $out_fh, '>', $outpath
    or die "Can't open '$outpath': $!";
print {$out_fh} <<'END_STUFF';
/* Auto-generated file -- DO NOT EDIT!
 *
 * The words in this file are taken from stoplists provided by the Snowball
 * project.
 */

#include "Lucy/Analysis/SnowballStopFilter.h"

END_STUFF

# Two-letter language code => Snowball language name.  The code becomes part
# of the generated C identifiers; the name selects the directory under
# website/algorithms/ from which stop.txt is read.
my %languages = (
    'da' => 'danish',
    'de' => 'german',
    'en' => 'english',
    'es' => 'spanish',
    'fi' => 'finnish',
    'fr' => 'french',
    'hu' => 'hungarian',
    'it' => 'italian',
    'nl' => 'dutch',
    'no' => 'norwegian',
    'pt' => 'portuguese',
    'ru' => 'russian',
    'sv' => 'swedish',
);

# For every configured language (in sorted code order), read the Snowball
# stoplist and emit it as a NULL-terminated C array of string literals,
# followed by the exported lucy_SnowStop_snow_<code> pointer.
for my $iso ( sort keys %languages ) {
    my $language = $languages{$iso};

    # Grab stoplists from Snowball source files.  The Russian list is decoded
    # as KOI8-R; every other list is decoded as ISO-8859-1.
    my $stop_path = catfile( $snow_co_dir, 'website', 'algorithms',
        $language, 'stop.txt' );
    my $source_enc = $iso eq 'ru' ? 'koi8-r' : 'iso-8859-1';
    open( my $stopfile_fh, "<:encoding($source_enc)", $stop_path )
        or die "Couldn't open file '$stop_path': $!";
    my @words;
    while ( defined( my $line = <$stopfile_fh> ) ) {

        # Everything from the first '|' onward is a comment.  The /s flag
        # lets '.' match the trailing newline so it is removed as well.
        $line =~ s/\|.*//s;

        # split ' ' skips leading whitespace, so an indented word cannot
        # produce an empty-string entry, and blank or comment-only lines
        # contribute nothing.
        push @words, split( ' ', $line );
    }
    close($stopfile_fh) or die "Couldn't close file '$stop_path': $!";

    # Encode as UTF-8, change all non-ASCII bytes to octal escapes, and format
    # as C string literals.
    my @escaped = map { '"' . encode( 'UTF-8', $_ ) . '"' } @words;
    s/([\x80-\xFF])/octal_escape($1)/ge for @escaped;

    # Wrap text and print to outfile.  The trailing NULL terminates the
    # generated C array.
    my $joined = join( ', ', @escaped, 'NULL' );
    my $wrapped = wrap( '    ', '    ', $joined );
    print $out_fh <<END_STUFF;
static const char *words_${iso}[] = {
$wrapped
};
const uint8_t **lucy_SnowStop_snow_${iso} = (const uint8_t**)words_$iso;

END_STUFF
}

# Buffered write errors only surface when the handle is closed, so check the
# result instead of relying on the implicit close at exit.
close($out_fh) or die "Can't close generated stoplist file: $!";

# Return a backslash followed by the octal code of the first character of the
# argument, zero-padded to at least three digits (e.g. "\xE9" gives \351).
sub octal_escape {
    my ($char) = @_;
    my $code = ord($char);
    return '\\' . sprintf( '%03o', $code );
}