#!/usr/bin/env perl
#
# This script creates a towctrans.h header (or C source) file, musl old-style.
# for simple casefolding as implemented in towlower() and towupper().
# Used for building musl and safeclib for fast and small upper/lowercasing
# tables for towlower() and towupper() and its secure variants
# towupper_s() and towlower_s(). Planned also for the multi-byte
# folding tables for towfc().
#
# The generated code is licensed under the MIT.
#
# Usage:
#    gen_wctrans [-v 17] [--out towctrans.h] [--cf CaseFolding.txt]
#
# Input files (will be downloaded if missing):
#    CaseFolding.txt
#
# Output files:
#    towctrans.h or towctrans.c
#    towfc.h (later)

use 5.012;
use strict;
use warnings;
use Carp;

# Minimum size of and excluded range. The more ranges we have, the slower.
# The larger the ranges, the more misses in holes we might have, going through
# all checks. musl-old had ~2500. Tested via bench
use constant MIN_ECXL => 2500;

BEGIN {
    unless ( 'A' eq pack( 'U', 0x41 ) ) {
        die "Unicode::Towctrans cannot stringify a Unicode code point\n";
    }
    unless ( 0x41 == unpack( 'U', 'A' ) ) {
        die "Unicode::Towctrans cannot get Unicode code point\n";
    }
}
our $PACKAGE = 'Unicode::Towctrans';
$Unicode::Towctrans::VERSION = '0.03';

use Getopt::Long;
my ( $v, $help, $verbose, $safec, $musl );
my $cf            = "CaseFolding.txt";
my $out           = "towctrans.h";
my $with_iswalpha = 0;
my $cmdline_args = join(" ", @ARGV);
$cmdline_args = " $cmdline_args" if $cmdline_args;

GetOptions(
    "v=i"           => \$v,                # numeric
    "cf=s"          => \$cf,               # string
    "out|o=s"       => \$out,              # string
    "with-iswalpha" => \$with_iswalpha,    # flag
    "musl"          => \$musl,             # flag
    "safec"         => \$safec,            # flag
    "verbose"       => \$verbose,          # flag
    "help|h"        => \$help              # flag
) or die("Error in command line arguments\n");
if ($help) {
    print <<'EOF';
gen_wctrans [OPTIONS]
Generate casefolding C header file
OPTIONS
-v NUM                Unicode major version number
--cf CaseFolding.txt  input filename. default: CaseFolding.txt
                      Downloaded if not found.
--out filename        default: towctrans.h
--with-iswalpha       if you can trust iswalpha. not with glibc, only musl.
--musl                create towctrans.c for musl. with iswalpha and LOCALE_TR
--safec               create towctrans.c for safeclib, with LOCALE_TR
--verbose
--help
EOF
    exit;
}

if ($musl) {
    $with_iswalpha = 1;
    $out           = "towctrans.c" if $out eq "towctrans.h";
}
if ($safec) {
    die "cannot use --sace and --musl together\n" if $musl;
    $out = "towctrans.c"                          if $out eq "towctrans.h";
}
if ( !$v and !-e $cf ) {
    use Unicode::UCD;
    my $full = Unicode::UCD::UnicodeVersion();
    ($v) = $full =~ /^(\d+)\./;
}

binmode *STDOUT, ':utf8';
binmode *STDERR, ':utf8';
my ( @map, @lace, @excl, @pair, $prev, $lc, %upper, %lower, @CASEL );

########## helpers ##########

## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
sub getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }

########## writing header files ##########

#         from    until to (=lower of from)
# CASEMAP(0x00c0, 0xd6, 0xe0), // 192 32 23
# CASEMAP(0x00d8, 0xde, 0xf8), // 216 32 7
# i.e. 100-101,102-103...12e-12f
# CASELACE(0x0100, 0x12e), // 256 1 47
# CASELACE(0x0132, 0x136), // 306 1 5
# CASELACE(0x0139, 0x147), // 313 1 15
# CASELACE(0x014a, 0x176), // 330 1 45
# CASELACE(0x0179, 0x17d), // 377 1 5
# CASELACE(0x01a0, 0x1a4), // 416 1 5 O WITH HORN - P WITH HOOK
# CASELACE(0x01b3, 0x1b5), // 435 1 3
# CASELACE(0x01cd, 0x1db), // 461 1 15

sub map_len {
    my $m = shift;
    return $m->[1] - $m->[0] + 1;
}

sub map_lower {
    my $m = shift;
    return $m->[2] - $m->[0];
}

# triple of base, last, to.
# offset is to - base, length is last - base + 1
sub add_map {
    my ( $map, $base, $to ) = @_;

    my $diff  = $to - $base;
    my $odiff = $map->[-1] ? $map->[-1][2] - $map->[-1][0] : 0;

    # if it's the next cp and has the same offset
    if ( $map->[-1] and $map->[-1][1] == $base - 1 && $diff == $odiff ) {
        ++$map->[-1][1];
        if ($verbose) {
            warn sprintf( "bump map [%04X, %04X, %04X]\n", @{ $map->[-1] } );
        }
    }
    else {
        push @$map, [ $base, $base, $to ];
        if ($verbose) {
            warn sprintf( "new map [%04X, %04X, %04X]\n", @{ $map->[-1] } );
        }
    }
}

# pair of base, last with lower offset of 1
sub add_lace {
    my ( $lace, $base, $to ) = @_;

    # if it's the next cp
    if (   $lace->[-1]
        && $lace->[-1][1] == $to - 2 )
    {
        $lace->[-1][1] = $to;
        if ($verbose) {
            warn sprintf( "bump lace %04X, %04X\n", @{ $lace->[-1] } );
        }
    }
    else {
        push @$lace, [ $base, $to ];
        if ($verbose) {
            warn sprintf( "new lace %04X, %04X\n", @{ $lace->[-1] } );
        }
    }
}

# FIXME: check the reverse if deviating. upper(03BC) => 039C, not B5
sub add_pair {
    my ( $pair, $base, $to ) = @_;
    push @$pair, [ $base, $to ];
}

# exclude a pair of first, last.
# also observe the existing lhs cp and rhs lower mappings
sub add_excl {
    my ( $excl, $base ) = @_;

    if ( exists $lower{$base} or exists $upper{$base} ) {
        warn( sprintf( "skip excl %04X\n", $base ) ) if $verbose;
        return;
    }

    # if it's the next cp
    if ( $excl->[-1] && $excl->[-1][1] == $base - 1 ) {
        ++$excl->[-1][1];    # extend range
        warn( sprintf( "bump excl [%04X, %04X]\n", @{ $excl->[-1] } ) )
          if $verbose;
    }
    else {
        push @$excl, [ $base, $base ];    # new range
        warn( sprintf( "new excl [%04X, %04X]\n", @{ $excl->[-1] } ) )
          if $verbose;
    }
}

if ( $v and !-e $cf ) {
    my $url = "https://www.unicode.org/Public/$v.0.0/ucd/CaseFolding.txt";
    `wget -q $url -O $cf` and die "$PACKAGE: failed to download $url: $!";
}
open my $CF, "<", $cf or croak "$PACKAGE: $cf can't be read $!";

# first scan to catch all rhs lower mappings
while ( my $l = <$CF> ) {
    chomp $l;
    if ( !$v and $. == 1 ) {
        $l =~ /CaseFolding-(\d+).0.0.txt/;
        $v = $1;
    }
    next if $l =~ /^\s*#/;
    next if $l =~ /^\s*$/;
    my ( $cp, $status, $mapping, $name ) = split /;\s*/, $l;
    my @cp = getHexArray($cp);
    die "first column multiple codepoints" if @cp != 1;
    $cp = $cp[0];
    my @mapping = getHexArray($mapping);

    if ( @mapping == 1 ) {
        $lower{ $mapping[0] } = $cp;
    }
    $upper{$cp} = $mapping[0];
}

seek $CF, 0, 0;

# 2nd scan, now process CP's in order.
while ( my $l = <$CF> ) {
    chomp $l;
    next if $l =~ /^\s*#/;
    next if $l =~ /^\s*$/;
    my ( $cp, $status, $mapping, $name ) = split /;\s*/, $l;
    my @cp = getHexArray($cp);
    die "first column multiple codepoints" if @cp != 1;
    $cp = $cp[0];
    my @mapping = getHexArray($mapping);

    if ( !@map ) {
        add_map( \@map, $cp, $mapping[0] );    # 'A' -> 'a'
    }
    else {
        # check status. only if C
        if ( $cp - $prev == 1 ) {              # next cp
            if ( ( $status eq 'C' or $status eq 'S' ) and scalar @mapping == 1 )
            {
                $lc = $mapping[0];
                if ( $lc - $cp == 1 ) {

                    # check if we can convert the previous pair to a lace
                    if (    @pair
                        and $pair[-1][0] == $cp - 1
                        and $pair[-1][1] == $lc - 1 )
                    {
                        warn(
                            sprintf(
                                "convert old pair to lace %04X %04X\n",
                                $cp, $lc
                            )
                        ) if $verbose;
                        pop @pair;
                        add_lace( \@lace, $cp - 1, $lc - 1 );
                        add_lace( \@lace, $cp,     $lc );
                    }
                    else {
                        #warn "add_lace $cp $lc\n" if $verbose;
                        add_lace( \@lace, $cp, $lc );
                    }
                }
                else {
                    # check if we can convert the previous pair to a map
                    if (    @pair
                        and $pair[-1][0] == $cp - 1
                        and $pair[-1][1] == $lc - 1 )
                    {
                        if ($verbose) {
                            warn
                              sprintf( "convert old pair to map %04X, %04X\n",
                                $cp, $lc );
                        }
                        pop @pair;
                        add_map( \@map, $cp - 1, $lc - 1 );
                        add_map( \@map, $cp,     $lc );
                    }
                    else {
                        #warn "add_map $cp $lc\n" if $verbose;
                        add_map( \@map, $cp, $lc );
                    }
                }
            }
        }
        else {    # not next cp, a hole or lace
            if ( $status eq 'T' ) {
                ;
            }
            elsif ( ( $status eq 'C' or $status eq 'S' )
                and scalar @mapping == 1 )
            {
                my $to = $mapping[0];
                if ( $to - $cp == 1 ) {

                    #warn "add_lace $cp $to\n" if $verbose;
                    add_lace( \@lace, $cp, $to );
                }
                else {    # add a hole
                    warn( sprintf( "add_pair %04X, %04X\n", $cp, $to ) )
                      if $verbose;
                    add_pair( \@pair, $cp, $to );
                }
            }
        }
    }
    $prev = $cp;
}
close $CF;

my $ucd_version = "$v.0.0";
my @h_args      = ( $Unicode::Towctrans::VERSION, $cmdline_args, $ucd_version );

if ( !-w $out ) {
    chmod 0644, $out;
}
open FH, ">:utf8", $out or croak "$PACKAGE: $out can't be written $!";
printf FH <<'EOF', @h_args;
/* ex: set ro ft=c: -*- buffer-read-only: t -*-
 *
 * !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
 * This file is auto-generated by Unicode::Towctrans %s
 * gen_wctrans%s
 * for Unicode %s
 * Any changes here will be lost!
 */
EOF

if ( $musl or $safec ) {
    print FH "#define HAVE_LOCALE_TR\n";
}

printf FH <<'EOF', ( $ucd_version, $v );
/*
Copyright (c) 2005-2014 Rich Felker, et al.
Copyright (c) 2018,2020,2026 Reini Urban

--------------------------------------------------------------
This code is licensed under the following standard MIT license
--------------------------------------------------------------

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
----------------------------------------------------------------------
*/

#include <assert.h>
#include <stdint.h>
#include <wctype.h>
#ifdef HAVE_LOCALE_TR
#include <locale.h>
#include <string.h>
#endif

/* map from upper until upper, to lower */
#define CASEMAP(u1, u2, l) {(u1), (l) - (u1), (u2) - (u1) + 1}
/* map from upper until lower, with dist 1 */
#define CASELACE(u1, u2) CASEMAP((u1), (u2), (u1) + 1)

/* for Unicode %s */
#define TOWCTRANS_UNICODE_VERSION %u

static const struct {
    uint16_t upper; /* base */
    int8_t lower;   /* distance from upper to lower. 1 with LACE */
    uint8_t len;    /* how many */
} casemaps[] = {
    /* from, until, to */
EOF

my $CASEL = <<'EOF';
static const struct {
    uint32_t upper; /* base */
    int lower;      /* distance from upper to lower */
    uint16_t len;   /* how many */
} casemapsl[] = {
    /* from, until, to */
EOF

# take next of @map and @lace
my ( $i, $m ) = each @map;
my ( $j, $l ) = each @lace;
my $has_long;

while ( defined($i) or defined($j) ) {

    # short or long
    if ( defined($i) and ( !defined($j) or $m->[0] < $l->[0] ) ) {
        if ( !$has_long and $m->[2] > 0xffff ) {
            $has_long++;
            print FH "    {0, 0, 0}};\n";
            print FH $CASEL;
            print FH $_ for @CASEL;
        }
        my $spc = $has_long ? $m->[0] < 0xffff ? " "x4 : " " : " "; # x8
        my $cmt = sprintf "%s/* '%c'->'%c'..'%c' {, %d, %u} */", $spc,
            $m->[0], $m->[1], $m->[2], map_lower($m), map_len($m);

        # on overflow push to @CASEL instead
        my $s = sprintf "    CASEMAP(0x%04x, 0x%04x, 0x%04x),%s\n", $m->[0],
          $m->[1],
          $m->[2], $cmt;
        my $lower = map_lower($m);
        my $len   = map_len($m);
        if ( $lower > 128 or $lower < -128 or $len > 255 ) {
            $spc = " ";
            my $cmt = sprintf "%s/* '%c'->'%c'..'%c' {, %d, %u} */", $spc,
                $m->[0], $m->[1], $m->[2], map_lower($m), map_len($m);
            my $s0 = sprintf "    CASEMAP(0x%05x, 0x%05x, 0x%05x),%s\n",  $m->[0],
              $m->[1], $m->[2], $cmt;
            warn "defer overflow $s0 to casemapl\n" if $verbose;
            push @CASEL, $s0;
        }
        else {
            printf FH $s;
        }
        ( $i, $m ) = each @map;
    }
    elsif ( defined($j) ) {
        if ( !$has_long and $l->[1] > 0xffff ) {
            $has_long++;
            print FH "    {0, 0, 0}};\n";
            print FH $CASEL;
            print FH $_ for @CASEL;
        }
        my $cmt =
            sprintf( "%s/* '%c'->'%c' {, %d, %u} */", $has_long ? " "x9 : " "x8,
                     $l->[0], $l->[1] - 1, map_lower($m), map_len($m) );
        printf FH "    CASELACE(0x%04x, 0x%04x),%s\n", $l->[0], $l->[1] - 1,
          $cmt;
        ( $j, $l ) = each @lace;
    }
}
print FH "    {0, 0, 0}};\n";
print FH "static const unsigned short pairs[][2] = {\n"
  . "    /* upper, lower */\n";

# also fixup pairs to enable reverse lookup
my $n_pairl;
for my $p (@pair) {
    my $cmt = sprintf( " /* '%c' -> '%c' */", $p->[0], $p->[1] );
    if ( $p->[0] > 0xffff ) {
        if ($n_pairl) {
            $n_pairl++;
        }
        else {
            $n_pairl++;
            print FH "    {0, 0}};\n";
            print FH "#define HAVE_PAIRL\n";
            print FH "static const unsigned int pairl[][2] = {\n"
              . "    /* upper, lower */\n";
        }
    }
    printf FH "    {0x%04x, 0x%04x},%s\n", $p->[0], $p->[1], $cmt;
}
print FH "    {0, 0}};\n";
printf FH "#define PAIRL_SZ %u\n", $n_pairl if $n_pairl;
print FH "\n";
print FH "uint32_t _towcase(uint32_t wc, int lower) {
    int i;
    int lmul;  /* 1 for lower, -1 for upper */
    int lmask; /* 0 for lower, -1/0xffff for upper */
    /* !iswalpha(wc) is broken on most locales, at least with glibc. */
";
if ($with_iswalpha) {    # if we have a working iswalpha (not with glibc)
    print FH "    if (!iswalpha(wc) || ";
}
else {
    # print larger exclusion ranges. iswalpha is useless with glibc
    print FH "    if (";
}

# generate the excl ranges here, because it is more stable
my %alpha;
$alpha{$_}++ for keys %lower;
$alpha{$_}++ for keys %upper;
my @sorted = sort { $a <=> $b } keys %alpha;
my $first  = $sorted[0];
my $last   = $sorted[-1];
@excl = ( [ 0, $first - 1 ] );

for ( $first + 1 .. $last - 1 ) {
    if ( !exists $alpha{$_} ) {
        add_excl( \@excl, $_ );
    }
}

# ternary_tree(1,6) => [4, [2, 1, 3], [6, 5]]
#sub ternary_tree {
#    my ($lo, $hi) = @_;
#    return undef if $lo > $hi;
#    my $mid = int(($lo + $hi + 1) / 2);
#    my @node = ($mid, ternary_tree($lo, $mid - 1), ternary_tree($mid + 1, $hi));
#    pop @node while @node && !defined $node[-1];
#    return @node == 1 ? $node[0] : \@node;
#}
#sub binary_search_indices {
#    my ($n) = @_;
#    return () if $n <= 1;
#
#    my @indices;
#    my @queue = ([1, $n - 1]);
#
#    while (@queue) {
#        my ($lo, $hi) = @{ shift @queue };
#        my $mid = int(($lo + $hi) / 2);
#        push @indices, $mid;
#
#        push @queue, [$lo, $mid - 1] if $lo < $mid;
#        push @queue, [$mid + 1, $hi] if $mid < $hi;
#    }
#
#    return @indices;
#}

# first purge all too small ranges
my @new_excl = ( $excl[0] );
for my $i ( 1 .. $#excl ) {
    my $e = $excl[$i];
    my $skip;
    if ( $e->[1] - $e->[0] >= MIN_ECXL ) {

        # cross-check
        for ( $e->[0] .. $e->[1] ) {
            if ( exists $lower{$_} or exists $upper{$_} ) {
                warn( sprintf( "wrong excl %04X skipped", $_ ) );
                $skip = 1;
                last;
            }
        }
        push @new_excl, $e unless $skip;
    }
}

# sort ranges by size. GH #2
printf FH "wc <= 0x%x                           /* %u */\n", $excl[0]->[1], $excl[0]->[1];
$j = 0;
shift @new_excl;
my @ex = sort { $b->[1] - $b->[0] <=> $a->[1] - $a->[0] } @new_excl;
for my $e (@ex) {
    my $s = sprintf("        || wc - 0x%x <= 0x%x - 0x%x", $e->[0], $e->[1], $e->[0]);
    my $spc = 45-length($s) > 0 ? " "x(45-length($s)) : " ";
    printf FH "%s%s/* %u */\n", $s, $spc, $e->[1] - $e->[0] + 1;
}
print FH "    )\n";
print FH "        return wc;\n";
print FH <<'EOF';

#ifdef HAVE_LOCALE_TR
    /* check for the 2 turkish mappings if we have a turkish locale. */
    if ((lower && (wc == 0x49 || wc == 0x130)) ||
        (!lower && (wc == 0x69 || wc == 0x131))) {
        const char *loc = setlocale(LC_CTYPE, NULL);
        if (loc && (!strncmp(loc, "tr", 2) || !strncmp(loc, "az", 2))) {
            if (lower) {
                if (wc == 0x49)
                    return 0x131;
                else
                    return 0x69;
            } else {
                if (wc == 0x69)
                    return 0x130;
                else
                    return 0x49;
            }
        }
    }
#endif

    lmul = 2 * lower - 1; /* 1 for lower, -1 for upper */
    lmask = lower - 1;    /* 0 for lower, -1/0xffff for upper */
    /* TODO: binary search the ranges if lower. GH #4 */
    for (i = 0; casemaps[i].len; i++) {
        int base = casemaps[i].upper + (lmask & casemaps[i].lower);
        assert(i > 0 ? casemaps[i].upper >= casemaps[i - 1].upper : 1);
        if (wc - base < casemaps[i].len) {
            if (casemaps[i].lower == 1)
                return wc + lower - ((wc - casemaps[i].upper) & 1);
            /* The only reverse fixup needed. Tested from Unicode 5 to 18. */
            if (wc == 0xA64B)
                return 0xA64A;
            else
                return wc + lmul * casemaps[i].lower;
        }
        if (lower && casemaps[i].upper > wc)
            break;
    }
    /* TODO: binary search the pairs (lower only?). GH #3 */
    for (i = 0; pairs[i][1 - lower]; i++) {
        assert(i > 0 ? pairs[i][0] >= pairs[i - 1][0] : 1);
        if (pairs[i][1 - lower] == wc)
            return pairs[i][lower];
        if (lower && pairs[i][0] > wc)
            break;
    }
#ifdef HAVE_PAIRL
#if PAIRL_SZ == 1
    if (pairl[0][1 - lower] == wc)
        return pairs[0][lower];
#else
    /* TODO: binary search the pairs. GH #3 */
    for (i = 0; pairl[i][1 - lower]; i++) {
        assert(i > 0 ? pairl[i][0] >= pairl[i - 1][0] : 1);
        if (pairl[i][1 - lower] == wc)
            return pairs[i][lower];
        if (lower && pairl[i][0] > wc)
            break;
    }
#endif
#endif
    /* TODO: binary search the ranges. (lower only?) GH #4 */
    for (i = 0; casemapsl[i].len; i++) {
        unsigned long base = casemapsl[i].upper + (lmask & casemapsl[i].lower);
        assert(i > 0 ? casemapsl[i].upper >= casemapsl[i - 1].upper : 1);
        if (wc - base < casemapsl[i].len) {
            if (casemapsl[i].lower == 1)
                return wc + lower - ((wc - casemapsl[i].upper) & 1);
            return wc + lmul * casemapsl[i].lower;
        }
        if (lower && casemaps[i].upper > wc)
            break;
    }
    return wc;
}
EOF

if ($musl) {

    print FH <<'EOF';

wint_t towupper(wint_t wc)
{
	return (unsigned)wc < 128 ? toupper(wc) : __towcase(wc, 0);
}

wint_t towlower(wint_t wc)
{
	return (unsigned)wc < 128 ? tolower(wc) : __towcase(wc, 1);
}

wint_t __towupper_l(wint_t c, locale_t l)
{
	return towupper(c);
}

wint_t __towlower_l(wint_t c, locale_t l)
{
	return towlower(c);
}

weak_alias(__towupper_l, towupper_l);
weak_alias(__towlower_l, towlower_l);
EOF
}
elsif ($safec) {

    print FH <<'EOF';

EXPORT uint32_t _towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : _towcase(wc, 0);
}
#ifndef HAVE_TOWUPPER
EXPORT uint32_t towupper(uint32_t wc) {
    return wc < 128 ? (uint32_t)toupper(wc) : _towcase(wc, 0);
}
#endif

#ifndef HAVE_TOWLOWER
EXPORT uint32_t towlower(uint32_t wc) {
    return (unsigned)wc < 128 ? (wint_t)tolower(wc) : _towcase(wc, 1);
}
#endif
EOF
}

close FH;
chmod 0444, $out;

__END__

=head1 NAME

Unicode::Towctrans - Generate small casefolding tables

=head1 SYNOPSIS

    gen_wctrans
    gen_wctrans --safec
    gen_wctrans --musl
    gen_wctrans -v 15
    gen_wctrans -v 15 --cf CaseFolding.txt.15 --out towctrans-15.h

=head1 DESCRIPTION

F<gen_wctrans> generates a F<towctrans.h> header file, which is used by C<musl>
and C<safeclib> to generate small and efficient case folding tables, to
build the libc C<towupper()> and C<towlower()> functions and its secure
variants C<towupper_s()> and C<towlower_s()>.

If the code may run on a system with the turkish or azeri locale, you
need to define C<-DHAVE_LOCALE_TR> to check for the special turkish i
locale and mappings at run-time.

If you know that your C<iswalpha()> works correctly (only with musl),
then use C<--with_iswalpha> to get a lightly faster function. E.g. for
benchmarking.

Planned also for the multi-byte folding tables for C<wcsfc_s()> for
safeclib. As the single-byte C<towupper> and C<towlower> conversions
are meaningless for many multi-byte unicode mappings, those with
status B<F> - folding. Use a proper string foldcasing function instead.

=head1 PERFORMANCE

Currently it is still a bit un-optimized, but small and fast enough
compared to the other implementations. And esp. correct compared to glibc,
which ignores characters from other locales.

    make -C examples
    ./bench
          my:        160 [us]
    musl-new:        352 [us]
    musl-old:        286 [us]
       glibc:        197 [us]

     wc -c towctrans-*.o
       5072 towctrans-my.o
       7096 towctrans-musl-new.o
       3408 towctrans-musl-old.o
      97432 towctrans-glibc.o

=head1 INSTALLATION

Perl 5.12 or later is required.

This module does not need to be installed. running gen_wctrans is enough.
However for full testing and global installation run this:

   perl Makefile.PL
   make
   make test
   make test-all
   sudo make install

=head1 DEPENDENCIES

This module requires a CaseFolding.txt file from Unicode Character
Database, which is automatically downloaded via wget if missing.

=head1 AUTHOR

Reini Urban <rurban@cpan.org>

Copyright(C) 2026 Reini Urban. All rights reserved

=head1 COPYRIGHT AND LICENSE

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

The generated files are MIT licensed. See the generated files headers.

=head1 SEE ALSO

=over 4

=item L<https://www.unicode.org/reports/tr44/#Casemapping>

=item L<https://git.musl-libc.org/cgit/musl/tree/src/ctype/towctrans.c>

=item L<https://git.musl-libc.org/cgit/musl/tree/src/ctype/towctrans.c?id=e8aba58ab19a18f83d7f78e80d5e4f51e7e4e8a9>

=item L<https://github.com/rurban/safeclib/blob/master/src/extwchar/towctrans.c>

=item L<https://sourceware.org/git/?p=glibc.git;a=tree;f=wctype;;hb=HEAD>

=back

=cut

# Local Variables:
# perl-indent-level: 4
# End:
