# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
use File::Spec::Functions qw( catfile catdir );
use Encode qw( encode );
use Text::Wrap qw( wrap );
# Configure Text::Wrap for the generated source: never convert runs of spaces
# back into tabs, and break lines at 78 columns.
( $Text::Wrap::unexpand, $Text::Wrap::columns ) = ( 0, 78 );
# Exactly two arguments are required: the Snowball Subversion checkout and
# the directory under which the generated C file is written.
if ( @ARGV != 2 ) {
    die "Usage: perl update_snowstop.pl SNOWBALL_SVN_CO LUCY_SNOWSTOP_DIR";
}
my ( $snow_co_dir, $dest_dir ) = @ARGV;

# Update to a particular rev of the Snowball repository.
die("Not a directory: '$snow_co_dir'") unless -d $snow_co_dir;
my $retval = system( "svn", "update", "-r", "541", $snow_co_dir );

# The wait status returned by system() holds the exit code in its high byte
# and a terminating signal in its low bits, so any non-zero value means the
# update did not complete. Checking only the high byte would let a
# signal-killed svn pass as success.
die "svn update failed" if $retval != 0;
# Create (or truncate) the generated C file beneath the destination directory
# and emit the fixed banner comment plus the #include that opens the file.
my $outpath = catfile( $dest_dir, 'source', 'snowball_stoplists.c' );
open my $out_fh, '>', $outpath
    or die "Can't open '$outpath': $!";
print {$out_fh} <<'END_HEADER';
/* Auto-generated file -- DO NOT EDIT!
*
* The words in this file are taken from stoplists provided by the Snowball
* project.
*/
#include "Lucy/Analysis/SnowballStopFilter.h"
END_HEADER
# Two-letter language code => name of the Snowball algorithm directory that
# holds the stoplist for that language.
my %languages = qw(
    da danish
    de german
    en english
    es spanish
    fi finnish
    fr french
    hu hungarian
    it italian
    nl dutch
    no norwegian
    pt portuguese
    ru russian
    sv swedish
);
# For each language, in sorted order of its two-letter code: read the
# Snowball stoplist, turn every word into an ASCII-only C string literal, and
# append a NULL-terminated array definition to the output file.
for my $iso ( sort keys %languages ) {
    my $language = $languages{$iso};

    # Grab stoplists from Snowball source files. The Russian list is decoded
    # as KOI8-R; every other list is decoded as ISO-8859-1.
    my $stop_path = catfile( $snow_co_dir, 'website', 'algorithms', $language,
        'stop.txt' );
    my $source_enc = $iso eq 'ru' ? 'koi8-r' : 'iso-8859-1';
    open( my $stopfile_fh, "<:encoding($source_enc)", $stop_path )
        or die "Couldn't open file '$stop_path': $!";
    my @words;
    while ( defined( my $line = <$stopfile_fh> ) ) {
        # Everything from a vertical bar to the end of the line is a comment.
        $line =~ s/\|.*//;

        # split /\s+/ produces an empty leading field when the line starts
        # with whitespace; drop empty fields so that no "" entry ends up in
        # the generated array.
        push @words, grep { length } split( /\s+/, $line );
    }
    close($stopfile_fh) or die "Couldn't close file '$stop_path': $!";

    # Encode as UTF-8, change all non-ASCII bytes to octal escapes, and format
    # as C string literals.
    my @escaped = map { '"' . encode( 'UTF-8', $_ ) . '"' } @words;
    s/([\x80-\xFF])/octal_escape($1)/ge for @escaped;

    # Wrap text and print to outfile. The trailing NULL terminates the array.
    my $joined = join( ', ', @escaped, 'NULL' );
    my $wrapped = wrap( ' ', ' ', $joined );
    print $out_fh <<END_STUFF;
static const char *words_${iso}[] = {
$wrapped
};
const uint8_t **lucy_SnowStop_snow_${iso} = (const uint8_t**)words_$iso;
END_STUFF
}

# Buffered write errors only surface when the handle is closed, so check it.
close($out_fh) or die "Can't close '$outpath': $!";
# Return a backslash followed by the octal code of the first character of the
# argument, zero-padded to at least three digits (e.g. "\351").
sub octal_escape {
    my ($char) = @_;
    my $code_point = ord $char;
    return sprintf "\\%.3o", $code_point;
}