"""Tests for the tokeniser — compound token rule, punctuation stripping, length filter.""" import pytest from festinger.tokenizer import tokenize, tokenize_all def test_simple_tokens(): # "repo" is 4 chars — filtered by the ≥5 rule. Use a longer word. tokens = tokenize("gnommoweb is a repository") assert "gnommoweb" in tokens assert "repository" in tokens assert "repo" not in tokens # 4 chars — below threshold def test_compound_token_rule(): tokens = tokenize("Glitch University runs on Docker") assert "glitch_university" in tokens assert "docker" in tokens # Individual parts should NOT appear as separate tokens assert "glitch" not in tokens assert "university" not in tokens def test_multi_word_compound(): tokens = tokenize("New York City is a place") assert "new_york_city" in tokens def test_lowercase_breaks_compound_run(): # "the" breaks the run — "Glitch University" still merges tokens = tokenize("the Glitch University system") assert "glitch_university" in tokens assert "system" in tokens assert "glitch" not in tokens def test_length_filter(): # Tokens < 5 chars are dropped tokens = tokenize("cat dog bird eagle") assert "eagle" in tokens assert "bird" not in tokens assert "cat" not in tokens assert "dog" not in tokens def test_punctuation_stripped(): # Trailing punctuation (period, colon) breaks the compound run. # "FastAPI." ends a run immediately; "Docker:" starts and ends a fresh run. tokens = tokenize("gnommoweb, FastAPI. Docker:") assert "gnommoweb" in tokens assert "fastapi" in tokens # from "FastAPI." — flushed as solo compound assert "docker" in tokens # from "Docker:" — flushed as solo compound # Must NOT merge across sentence boundaries assert "fastapi_docker" not in tokens def test_deduplication(): tokens = tokenize("gnommoweb gnommoweb gnommoweb") assert tokens.count("gnommoweb") == 1 def test_empty_string(): assert tokenize("") == [] def test_tokenize_all_no_length_filter(): # tokenize_all keeps short tokens tokens = tokenize_all("is a part of") assert "is" in tokens assert "of" in tokens assert "part" in tokens