Adding Festinger with wordnet
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
"""Tests for the tokeniser — compound token rule, punctuation stripping, length filter."""
|
||||
import pytest
|
||||
from festinger.tokenizer import tokenize, tokenize_all
|
||||
|
||||
|
||||
def test_simple_tokens():
    """Plain lowercase words at or above the length threshold are returned."""
    # "repository" is used instead of "repo": at 4 chars "repo" would be
    # dropped by the >=5 character rule.
    result = tokenize("gnommoweb is a repository")

    for kept in ("gnommoweb", "repository"):
        assert kept in result

    # 4 chars — below threshold, so it must never surface as a token.
    assert "repo" not in result
|
||||
|
||||
|
||||
def test_compound_token_rule():
    """Two adjacent capitalised words are emitted as one underscore-joined token."""
    result = tokenize("Glitch University runs on Docker")

    assert "glitch_university" in result
    assert "docker" in result

    # The pieces of a compound must not also be emitted on their own.
    for fragment in ("glitch", "university"):
        assert fragment not in result
|
||||
|
||||
|
||||
def test_multi_word_compound():
    """A run of three capitalised words yields a single three-part compound."""
    expected = "new_york_city"
    result = tokenize("New York City is a place")
    assert expected in result
|
||||
|
||||
|
||||
def test_lowercase_breaks_compound_run():
    """A leading lowercase word does not stop the following capitalised pair merging."""
    # The lowercase "the" sits outside the run; "Glitch University" still
    # merges into one compound.
    result = tokenize("the Glitch University system")

    for present in ("glitch_university", "system"):
        assert present in result

    assert "glitch" not in result
|
||||
|
||||
|
||||
def test_length_filter():
    """Tokens shorter than 5 characters are dropped; 5-character tokens survive."""
    result = tokenize("cat dog bird eagle")

    # "eagle" is exactly 5 chars, so it is kept.
    assert "eagle" in result

    # Everything under 5 chars is filtered out.
    for too_short in ("bird", "cat", "dog"):
        assert too_short not in result
|
||||
|
||||
|
||||
def test_punctuation_stripped():
    """Punctuation is removed from tokens and ends any compound run in progress."""
    # A trailing period or colon terminates the compound run:
    # "FastAPI." closes its run at once, and "Docker:" opens and closes a
    # new one, so each is flushed as a solo compound.
    result = tokenize("gnommoweb, FastAPI. Docker:")

    for stripped in ("gnommoweb", "fastapi", "docker"):
        assert stripped in result

    # Words on either side of a sentence boundary must NOT be merged.
    assert "fastapi_docker" not in result
|
||||
|
||||
|
||||
def test_deduplication():
    """A word repeated in the input appears only once in the output."""
    result = tokenize("gnommoweb gnommoweb gnommoweb")
    occurrences = result.count("gnommoweb")
    assert occurrences == 1
|
||||
|
||||
|
||||
def test_empty_string():
    """Tokenising the empty string yields an empty list."""
    result = tokenize("")
    assert result == []
|
||||
|
||||
|
||||
def test_tokenize_all_no_length_filter():
    """tokenize_all retains short tokens."""
    result = tokenize_all("is a part of")

    # Every one of these is under 5 chars and must still be present.
    for short_word in ("is", "of", "part"):
        assert short_word in result
|
||||
Reference in New Issue
Block a user