The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
# -*- perl -*-
# -*- coding: utf-8 -*-
#
# 00GraphemeBreakTest.t - Test suite provided by Unicode Consortium.
#
# - Passed by GraphemeBreakTest-6.1.0.txt (2011-12-07, 17:54:39 UTC), except
#   50 surrogate cases.
# - Passed by GraphemeBreakTest-6.2.0d4.txt (2012-06-02, 23:25:40 UTC), except
#   58 surrogate cases.  [sombok-2.3.0beta1]
# - Passed by GraphemeBreakTest-6.2.0d6.txt (2012-08-14, 17:54:56 UTC), except
#   54 surrogate cases.  [sombok-2.3.0gamma1]
# - Passed by GraphemeBreakTest-6.2.0d8.txt (2012-08-22, 12:41:15 UTC), except
#   54 surrogate cases.  [sombok-2.3.0]
# - Passed by GraphemeBreakTest-6.3.0d1.txt (2012-12-20, 22:18:29 UTC), except
#   54 surrogate cases.  [sombok-2.3.1b]
# - Passed by GraphemeBreakTest-7.0.0d13.txt (2013-11-27, 09:54:39 UTC), except
#   surrogate cases.  [sombok-2.3.2beta1]
#
# Note: Legacy-CM feature is enabled.
#

use strict;
use Test::More;
use Encode qw(decode is_utf8);
use Unicode::GCString;

BEGIN {
    my $tests = 0;
    if (open IN, 'test-data/GraphemeBreakTest.txt') {
	my $desc = '';
	while (<IN>) {
	    s/\s*#\s*(.*)//;
	    if ($. <= 2) {
		$desc .= " $1";
		chomp $desc;
	    }
	    next unless /\S/;
	    $tests++;
	}
	close IN;
	if ($tests) {
	    plan tests => $tests;
	    diag $desc;
	} else {
	    plan skip_all => 'test-data/GraphemeBreakTest.txt is empty.';
	}
    } else {
	plan skip_all => 'test-data/GraphemeBreakTest.txt found at '.
	    'http://www.unicode.org/Public/ is required.';
    }
}

my @opts = (LegacyCM => 'YES', ViramaAsJoiner => 'NO');

open IN, 'test-data/GraphemeBreakTest.txt';

while (<IN>) {
    chomp $_;
    s/\s*#\s*(.*)$//;
    my $desc = $1;
    next unless /\S/;

    SKIP: {
	skip "subtests including surrogate", 1
	    if /\bD[89AB][0-9A-F][0-9A-F]\b/;

	s/\s*÷$//;
	s/^÷\s*//;

	my $s = join '',
	    map {
		$_ = chr hex "0x$_";
		$_ = decode('iso-8859-1', $_) unless is_utf8($_);
		$_;
	    }
	    split /\s*(?:÷|×)\s*/, $_;

	is join(' ÷ ',
	    map {
		 join ' × ',
		 map { sprintf '%04X', ord $_ }
		 split //, $_->as_string;
	    }
	    @{Unicode::GCString->new($s, @opts)}
	  ), $_, $desc;
    }
}

close IN;