#!/usr/bin/env perl
use strict;
use warnings;
use Test::More;
use Lugh;

# The whole file depends on a local model file; when it is absent, skip every
# test instead of failing.
my $model_file = 'models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf';
plan skip_all => "No test model at $model_file - run download script"
    unless -e $model_file;

# Fixed plan: the count must match the number of assertions below.
plan tests => 32;

# Load the model and build the tokenizer that every later check goes through.
my $model = Lugh::Model->new(model => $model_file);
ok($model, 'Model loaded');

my $tokenizer = Lugh::Tokenizer->new(model => $model);
ok($tokenizer, 'Tokenizer created');
isa_ok($tokenizer, 'Lugh::Tokenizer');

# Vocabulary size. cmp_ok() is used for the range check so that a failure
# reports the actual value, which a bare ok($n > 0) would not.
my $vocab_size = $tokenizer->n_vocab;
cmp_ok($vocab_size, '>', 0, 'Vocabulary size is positive');
is($vocab_size, 32000, 'LLaMA vocab size is 32000');

# Special token IDs. The expected values (BOS = 1, EOS = 2) are pinned for the
# model file this test runs against.
my $bos_id = $tokenizer->bos_id;
my $eos_id = $tokenizer->eos_id;
ok(defined $bos_id, 'BOS token ID defined');
ok(defined $eos_id, 'EOS token ID defined');
is($bos_id, 1, 'BOS token ID is 1');
is($eos_id, 2, 'EOS token ID is 2');

# Default encode(): the result is expected to be non-empty and to start with
# the BOS token. Count checks use cmp_ok() so failures show the actual count.
my @tokens = $tokenizer->encode("Hello");
cmp_ok(scalar(@tokens), '>', 0, 'encode() returns tokens');
is($tokens[0], $bos_id, 'First token is BOS by default');

# add_bos => 0: same text, but the leading BOS should be absent, making the
# result exactly one token shorter than the default encoding above.
my @tokens_no_bos = $tokenizer->encode("Hello", add_bos => 0);
cmp_ok(scalar(@tokens_no_bos), '>', 0, 'encode() without BOS returns tokens');
isnt($tokens_no_bos[0], $bos_id, 'First token is not BOS when add_bos => 0');
is(scalar(@tokens_no_bos), scalar(@tokens) - 1, 'Without BOS has one less token');

# Empty input: with BOS enabled by default, at least the BOS token is expected.
my @empty_tokens = $tokenizer->encode("");
cmp_ok(scalar(@empty_tokens), '>', 0, 'Empty string returns at least BOS token');
is($empty_tokens[0], $bos_id, 'Empty string starts with BOS');

# A multi-word phrase should yield BOS plus at least one more token.
my @capital_tokens = $tokenizer->encode("The capital of France is");
cmp_ok(scalar(@capital_tokens), '>', 1, 'Phrase encodes to multiple tokens');
is($capital_tokens[0], $bos_id, 'Phrase starts with BOS');

# A lone BOS token is expected to decode to the empty string.
my $decoded = $tokenizer->decode([$bos_id]);
is($decoded, "", 'BOS token decodes to empty (special token skipped)');

# Token ID used for the two call-style checks below; named once instead of
# repeating the literal. The original comment labels it the "Paris" token.
my $paris_id = 3681;

# decode() called with an array reference of token IDs.
my $text1 = $tokenizer->decode([$paris_id]);
cmp_ok(length($text1), '>', 0, 'decode() with array ref returns text');

# decode() called with a plain list of IDs (a single ID here); the result
# must match the array-ref call above.
my $text2 = $tokenizer->decode($paris_id);
cmp_ok(length($text2), '>', 0, 'decode() with single token returns text');
is($text1, $text2, 'decode() works same with array ref and list');

# Round trip: encode a string, decode the resulting IDs, and confirm the
# words survive. Only substrings are matched, not the exact original text.
my $original = "Hello, world!";
my $decoded_full = $tokenizer->decode([ $tokenizer->encode($original) ]);
like($decoded_full, qr/Hello/, 'Round-trip preserves main content');
like($decoded_full, qr/world/, 'Round-trip preserves all words');

# Same round trip with the leading BOS token suppressed at encode time.
my @ids_without_bos = $tokenizer->encode($original, add_bos => 0);
my $decoded_no_bos  = $tokenizer->decode(\@ids_without_bos);
like($decoded_no_bos, qr/Hello/, 'Round-trip without BOS preserves content');

# Encode several differently shaped inputs and check that each produces more
# than a minimum number of tokens. Each row is [ input, minimum, test name ].
# cmp_ok() reports the actual token count when a case fails.
# Note: the 'ASCII text encodes' case covers plain ASCII only; no multi-byte
# UTF-8 input is exercised here.
my @encode_cases = (
    [ "Hello, how are you?",              3, 'Punctuation encoded' ],
    [ "The year is 2024",                 1, 'Numbers encoded' ],
    [ "ABC123",                           0, 'ASCII text encodes' ],
    [ "First sentence. Second sentence.", 5, 'Multiple sentences encode' ],
);

for my $case (@encode_cases) {
    my ($text, $min_tokens, $name) = @{$case};
    my @ids = $tokenizer->encode($text);
    cmp_ok(scalar(@ids), '>', $min_tokens, $name);
}

# Decode a short sequence that starts with BOS. $bos_id is used rather than a
# literal 1 so this stays consistent with the other BOS checks in this file.
my @sample_tokens = ($bos_id, 450, 7483); # BOS, The, capital
my $sample_text = $tokenizer->decode(\@sample_tokens);
cmp_ok(length($sample_text), '>', 0, 'Multiple token decode produces text');

# Smoke test: decoding a run of low token IDs must return a defined value and
# not crash. This only checks that a value comes back; it does not verify the
# order or content of the decoded text.
my @ordered = (1 .. 5);
my $ordered_text = $tokenizer->decode(\@ordered);
ok(defined $ordered_text, 'decode() handles sequential token IDs');

# Determinism: encoding the same phrase twice must give identical token
# lists. No specific token IDs are asserted here, only run-to-run equality.
my $repeat_phrase = "The capital of France is";
my @first_pass    = $tokenizer->encode($repeat_phrase);
my @second_pass   = $tokenizer->encode($repeat_phrase);
is_deeply(\@first_pass, \@second_pass, 'Same input produces same tokens (deterministic)');
