Files
2026-04-19 16:16:13 +02:00

71 lines
2.2 KiB
Python

"""Tests for the tokeniser — compound token rule, punctuation stripping, length filter."""
import pytest
from festinger.tokenizer import tokenize, tokenize_all
def test_simple_tokens():
    """Plain lowercase words that meet the length minimum come back as tokens."""
    # A 4-character word such as "repo" would fall under the 5-character
    # minimum, so the input uses the longer "repository" instead.
    result = tokenize("gnommoweb is a repository")
    for expected in ("gnommoweb", "repository"):
        assert expected in result
    # "repo" (4 chars, below the threshold) must never show up as a token.
    assert "repo" not in result
def test_compound_token_rule():
    """Adjacent capitalised words are expected to merge into one token.

    The merged token is lowercase and underscore-joined; a capitalised word
    standing alone is expected to come back simply lowercased.
    """
    result = tokenize("Glitch University runs on Docker")
    for expected in ("glitch_university", "docker"):
        assert expected in result
    # The words absorbed into the compound must not also be emitted on
    # their own.
    for absorbed in ("glitch", "university"):
        assert absorbed not in result
def test_multi_word_compound():
    """A run of three capitalised words is expected to form a single compound."""
    result = tokenize("New York City is a place")
    expected_compound = "new_york_city"
    assert expected_compound in result
def test_lowercase_breaks_compound_run():
    """Lowercase neighbours bound a compound run without joining it."""
    # The leading "the" is not part of the run, so "Glitch University" is
    # still expected to merge by itself.
    result = tokenize("the Glitch University system")
    for expected in ("glitch_university", "system"):
        assert expected in result
    assert "glitch" not in result
def test_length_filter():
    """Words shorter than 5 characters are expected to be dropped."""
    result = tokenize("cat dog bird eagle")
    # "eagle" is exactly 5 characters and must survive the filter.
    assert "eagle" in result
    # Everything shorter must be gone.
    for too_short in ("bird", "cat", "dog"):
        assert too_short not in result
def test_punctuation_stripped():
    """Punctuation is removed from tokens and terminates a compound run."""
    # Trailing punctuation (period, colon) ends the current compound run:
    # "FastAPI." closes its run immediately, and "Docker:" both opens and
    # closes a fresh one, so each is expected to be flushed as a solo token.
    result = tokenize("gnommoweb, FastAPI. Docker:")
    for expected in ("gnommoweb", "fastapi", "docker"):
        assert expected in result
    # Capitalised words on either side of a sentence boundary must not be
    # merged into one compound.
    assert "fastapi_docker" not in result
def test_deduplication():
    """A word repeated in the input is expected to yield only one token."""
    result = tokenize("gnommoweb gnommoweb gnommoweb")
    occurrences = result.count("gnommoweb")
    assert occurrences == 1
def test_empty_string():
    """Empty input is expected to produce an empty list."""
    result = tokenize("")
    assert result == []
def test_tokenize_all_no_length_filter():
    """``tokenize_all`` is expected to keep short tokens."""
    # Every word here is under 5 characters, yet each checked one must be
    # present in the output.
    result = tokenize_all("is a part of")
    for short_word in ("is", "of", "part"):
        assert short_word in result