Improve cue scanner and remove false positives

This commit is contained in:
2026-04-20 17:48:12 +02:00
parent b859aff3c5
commit 402e10901a
2 changed files with 269 additions and 7 deletions
+91 -1
View File
@@ -7,6 +7,11 @@ bypasses the saliency write threshold and goes directly to the write queue.
The `of {Z}` modifier after an ISA pattern names the dimension explicitly.
Without it, ISA defaults to dimension 'type'; ISPART defaults to 'membership'.
Noise filters applied to every extracted subject/parent:
- Must be at least MIN_TOKEN_LEN characters
- Must not be in the stopword list (pronouns, articles, common adj/adv/aux verbs)
- Must not be a bare number
"""
from __future__ import annotations
@@ -15,6 +20,86 @@ from dataclasses import dataclass
from typing import Optional
# ---------------------------------------------------------------------------
# Noise filters
# ---------------------------------------------------------------------------
MIN_TOKEN_LEN = 3  # "it", "be", "do", "a", "an" all filtered by length alone

# Words that are structural glue in English and never meaningful KG concepts.
# Covers: pronouns, articles, demonstratives, common aux/modal verbs,
# common adjectives, common adverbs, prepositions, conjunctions.
# Each word is listed once, under the first category it fits. All entries are
# lowercase, so membership tests only match lowercase tokens.
_STOPWORDS: frozenset[str] = frozenset({
    # Personal pronouns
    "i", "me", "my", "myself", "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # Demonstrative / relative / interrogative pronouns
    "this", "that", "these", "those",
    "which", "who", "whom", "whose", "what", "where", "when", "how", "why",
    "whoever", "whatever", "whichever",
    # Articles
    "a", "an", "the",
    # Auxiliary / modal verbs
    "be", "been", "being", "am", "are", "was", "were",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    "will", "would", "shall", "should",
    "may", "might", "must", "can", "could",
    "ought", "need", "dare", "used",
    # Common linking / existential
    "there", "here",
    # Prepositions / conjunctions / particles
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    "into", "onto", "upon", "about", "above", "below", "between",
    "through", "during", "before", "after", "over", "under", "within",
    "without", "against", "along", "across", "behind", "beyond",
    "and", "or", "but", "nor", "so", "yet", "both", "either", "neither",
    "not", "no",
    "if", "then", "else", "although", "though", "while", "whereas",
    "because", "since", "unless", "until", "than",
    # Common adjectives that are never useful KG concepts
    "new", "old", "good", "bad", "big", "small", "large", "little",
    "long", "short", "high", "low", "right", "left", "next", "last",
    "first", "second", "other", "same", "different", "few", "many",
    "much", "more", "most", "less", "least", "some", "any", "all",
    "every", "each", "own", "such", "only", "just", "very",
    "too", "also", "again", "once", "now", "still", "already",
    "always", "never", "often", "well", "back", "even", "way", "out",
    # Common verbs that appear as bare tokens
    "get", "got", "let", "put", "set", "go", "goes", "went",
    "make", "made", "take", "took", "come", "came", "know", "knew",
    "think", "thought", "see", "saw", "look", "use",
    "want", "try", "ask", "work", "seem", "feel", "call", "keep",
    "give", "show", "run", "move", "live", "stand", "turn", "start",
    "play", "follow", "create", "include", "continue", "add", "become",
    # Boolean / value words
    "true", "false", "none", "null", "yes",
})
def _is_meaningful(token: str) -> bool:
    """
    Return True only if *token* could be a useful KG concept:
    - at least MIN_TOKEN_LEN characters long
    - not in _STOPWORDS (exact, case-sensitive match)
    - not a bare integer or float

    Purely alphabetic tokens are never classified as numbers: ``float()``
    also parses the words "nan", "inf" and "infinity", and those are treated
    here as ordinary words rather than numeric literals.
    """
    if len(token) < MIN_TOKEN_LEN or token in _STOPWORDS:
        return False
    if token.isalpha():
        # A word made only of letters cannot be a numeric literal.
        return True
    try:
        float(token)
    except ValueError:
        return True
    return False  # bare number
@dataclass
class CueTriple:
subject: str # canonical token (lowercase, compound rule applied)
@@ -127,7 +212,7 @@ def scan_cues(text: str) -> list[CueTriple]:
# Extend parent into compound if followed by more capital words
parent = _extend_compound(text, m.end("parent"), raw_parent)
if not subj or not parent:
if not _is_meaningful(subj) or not _is_meaningful(parent):
continue
# Check for "of {Z}" dimension modifier immediately after the match
@@ -143,6 +228,11 @@ def scan_cues(text: str) -> list[CueTriple]:
else:
dimension = _infer_ispart_dimension(m.re.pattern)
# Reject if dimension was extracted but is a stopword (fall back to default)
if dimension not in ("type", "membership", "runs-on", "owned-by", "geography", "tech"):
if not _is_meaningful(dimension):
dimension = "type" if is_isa else "membership"
key = (subj, parent, dimension, is_isa)
if key not in seen:
seen.add(key)
+178 -6
View File
@@ -770,6 +770,81 @@ async def iknowthat(request: Request) -> dict:
}
# ---------------------------------------------------------------------------
# /models — LLM model management
# ---------------------------------------------------------------------------
@app.get("/models")
async def list_models(request: Request) -> dict:
    """List every row of the ``models`` table, ordered by id.

    Only id, provider, model_name and created_at are selected, so the stored
    api_key is not part of the response.
    """
    sql = "SELECT id, provider, model_name, created_at FROM models ORDER BY id"
    pool = request.app.state.pool
    async with pool.acquire() as conn:
        records = await conn.fetch(sql)

    models = []
    for rec in records:
        models.append({
            "id": rec["id"],
            "provider": rec["provider"],
            "model_name": rec["model_name"],
            # Serialised via isoformat() so the value is JSON-friendly.
            "created_at": rec["created_at"].isoformat(),
        })
    return {"models": models}
@app.post("/models")
async def create_model(request: Request) -> dict:
    """Register a new LLM model from a JSON request body.

    The body must be an object with string fields ``provider`` ("claude" or
    "openai"), ``model_name`` and ``api_key``; surrounding whitespace is
    stripped from each.

    Returns ``{"status": "ok", "id": <new id>}`` on success, or
    ``{"error": <message>}`` when a field is missing, empty, not a string,
    or the provider is not one of the accepted values.
    """
    pool = request.app.state.pool
    data = await request.json()

    def field(name: str) -> str:
        # Missing, null or non-string values (and a non-object body) count as
        # empty, so they reach the "required" error below instead of raising
        # AttributeError on .get() / .strip().
        value = data.get(name) if isinstance(data, dict) else None
        return value.strip() if isinstance(value, str) else ""

    provider = field("provider")
    model_name = field("model_name")
    api_key = field("api_key")
    if not provider or not model_name or not api_key:
        return {"error": "provider, model_name, and api_key are required"}
    if provider not in ("claude", "openai"):
        return {"error": "provider must be 'claude' or 'openai'"}

    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "INSERT INTO models (provider, model_name, api_key) VALUES ($1,$2,$3) RETURNING id",
            provider, model_name, api_key,
        )
    # The api_key is deliberately left out of the log line.
    log.info("model created id=%d provider=%s model=%s", row["id"], provider, model_name)
    return {"status": "ok", "id": row["id"]}
@app.delete("/models/{model_id}")
async def delete_model(model_id: int, request: Request) -> dict:
    """Delete the ``models`` row whose id equals *model_id*.

    Returns ``{"status": "ok", "deleted": model_id}`` when a row was removed
    and ``{"error": ...}`` when no row matched.
    """
    async with request.app.state.pool.acquire() as conn:
        status = await conn.execute("DELETE FROM models WHERE id=$1", model_id)

    # The last whitespace-separated token of the status string is read as the
    # affected-row count; an empty status counts as zero rows.
    removed = 0
    if status:
        removed = int(status.split()[-1])
    if removed == 0:
        return {"error": f"model {model_id} not found"}

    log.info("model deleted id=%d", model_id)
    return {"status": "ok", "deleted": model_id}
@app.get("/config")
async def get_all_config(request: Request) -> dict:
    """Return every config row as ``{"config": {key: value, ...}}``."""
    async with request.app.state.pool.acquire() as conn:
        records = await conn.fetch("SELECT key, value, updated_at FROM config ORDER BY key")

    # updated_at is selected by the query but not included in the response.
    settings = {}
    for rec in records:
        settings[rec["key"]] = rec["value"]
    return {"config": settings}
@app.post("/config")
async def update_config(request: Request) -> dict:
    """Set the value of an existing config key from a JSON request body.

    The body must be an object with a non-empty string ``key``; ``value`` is
    converted with ``str()`` and stripped.

    Returns ``{"status": "ok", "key": ..., "value": ...}`` when a row was
    updated, or ``{"error": ...}`` when the key is missing/invalid or no
    config row has that key.
    """
    pool = request.app.state.pool
    data = await request.json()
    if not isinstance(data, dict):
        return {"error": "key is required"}

    raw_key = data.get("key", "")
    # A null or non-string key is treated as missing rather than crashing
    # on .strip().
    key = raw_key.strip() if isinstance(raw_key, str) else ""
    value = str(data.get("value", "")).strip()
    if not key:
        return {"error": "key is required"}

    async with pool.acquire() as conn:
        result = await conn.execute(
            "UPDATE config SET value=$1, updated_at=now() WHERE key=$2",
            value, key,
        )
    # The statement only updates existing rows, so a zero row count means the
    # key does not exist. Report that instead of claiming success (same
    # status-string parsing as delete_model).
    updated = int(result.split()[-1]) if result else 0
    if not updated:
        return {"error": f"config key {key!r} not found"}

    log.info("config updated key=%s value=%s", key, value)
    return {"status": "ok", "key": key, "value": value}
# ---------------------------------------------------------------------------
# /resolve/run — manually trigger resolution job
# ---------------------------------------------------------------------------
@@ -897,15 +972,19 @@ async def kg_log(request: Request, limit: int = 100, offset: int = 0, op: str =
count_query = "SELECT COUNT(*) FROM kg_write_log {where}"
if op:
where = "WHERE op = $3"
async with pool.acquire() as conn:
rows = await conn.fetch(query.format(where=where), limit, offset, op)
total = await conn.fetchval(count_query.format(where=where), op)
rows = await conn.fetch(
query.format(where="WHERE op = $3"),
limit, offset, op,
)
total = await conn.fetchval(
"SELECT COUNT(*) FROM kg_write_log WHERE op = $1",
op,
)
else:
where = ""
async with pool.acquire() as conn:
rows = await conn.fetch(query.format(where=where), limit, offset)
total = await conn.fetchval(count_query.format(where=where))
rows = await conn.fetch(query.format(where=""), limit, offset)
total = await conn.fetchval("SELECT COUNT(*) FROM kg_write_log")
def fmt(r):
return {
@@ -1642,6 +1721,34 @@ ADMIN_HTML = """<!DOCTYPE html>
<div class="stat"><div class="stat-label">Last resolution</div><div class="stat-value" style="font-size:0.85em" id="s-lastrun">…</div></div>
</div>
<h2>Resolution model</h2>
<div id="models-section">
<table id="models-table" style="margin-bottom:0.8em">
<thead><tr><th>ID</th><th>Provider</th><th>Model name</th><th>resolve?</th><th>write?</th><th></th></tr></thead>
<tbody id="models-tbody"><tr><td colspan="6">Loading…</td></tr></tbody>
</table>
<details style="margin-bottom:1em">
<summary style="cursor:pointer;font-size:0.9em;color:#555">Add model…</summary>
<div style="margin-top:0.6em;display:flex;gap:0.7em;flex-wrap:wrap;align-items:flex-end">
<label style="font-size:0.85em">Provider
<select id="m-provider" style="font-family:monospace;padding:4px 8px;display:block;margin-top:2px">
<option value="claude">claude</option>
<option value="openai">openai</option>
</select>
</label>
<label style="font-size:0.85em">Model name
<input id="m-name" type="text" value="claude-opus-4-6"
style="font-family:monospace;padding:5px 8px;border:1px solid #ccc;border-radius:3px;display:block;margin-top:2px;width:200px">
</label>
<label style="font-size:0.85em">API key
<input id="m-key" type="password" placeholder="sk-ant-…"
style="font-family:monospace;padding:5px 8px;border:1px solid #ccc;border-radius:3px;display:block;margin-top:2px;width:260px">
</label>
<button onclick="addModel(this)" style="height:32px">Add</button>
</div>
</details>
</div>
<h2>Actions</h2>
<div class="actions">
<button class="primary" onclick="runResolution(this)">Run conflict resolution now</button>
@@ -1694,6 +1801,70 @@ ADMIN_HTML = """<!DOCTYPE html>
: 'never';
}}
let _cfg = {{}};
async function loadModels() {{
const [mr, cr] = await Promise.all([fetch('/models'), fetch('/config')]);
const md = await mr.json();
_cfg = (await cr.json()).config;
const resolveId = _cfg['resolve_model_id'] || '';
const writeId = _cfg['write_model_id'] || '';
const tbody = document.getElementById('models-tbody');
if (!md.models.length) {{
tbody.innerHTML = '<tr><td colspan="6" style="color:#999">No models yet — add one below.</td></tr>';
return;
}}
tbody.innerHTML = md.models.map(m => `
<tr>
<td>${{m.id}}</td>
<td>${{m.provider}}</td>
<td>${{m.model_name}}</td>
<td><button onclick="setConfig('resolve_model_id','${{m.id}}')" style="padding:2px 8px;font-size:0.8em;${{resolveId==String(m.id)?'background:#2a7a2a;color:#fff;border-color:#2a7a2a':''}}">${{resolveId==String(m.id)?'✓ active':'set'}}</button></td>
<td><button onclick="setConfig('write_model_id','${{m.id}}')" style="padding:2px 8px;font-size:0.8em;${{writeId==String(m.id)?'background:#2a7a2a;color:#fff;border-color:#2a7a2a':''}}">${{writeId==String(m.id)?'✓ active':'set'}}</button></td>
<td><button onclick="deleteModel(${{m.id}},this)" style="padding:2px 8px;font-size:0.8em;color:#b00;border-color:#b00">✕</button></td>
</tr>`).join('');
}}
async function addModel(btn) {{
const provider = document.getElementById('m-provider').value;
const model_name = document.getElementById('m-name').value.trim();
const api_key = document.getElementById('m-key').value.trim();
if (!model_name || !api_key) {{ alert('Model name and API key are required.'); return; }}
btn.disabled = true;
try {{
const r = await fetch('/models', {{method:'POST', headers:{{'Content-Type':'application/json'}},
body: JSON.stringify({{provider, model_name, api_key}})}});
const d = await r.json();
if (d.error) {{ showResult('Error: ' + d.error, false); return; }}
showResult('Model added (id=' + d.id + '). You can now set it as the resolve model.', true);
document.getElementById('m-key').value = '';
await loadModels();
}} catch(e) {{ showResult('Error: ' + e.message, false); }}
finally {{ btn.disabled = false; }}
}}
async function deleteModel(id, btn) {{
if (!confirm('Delete model ' + id + '?')) return;
btn.disabled = true;
try {{
const r = await fetch('/models/' + id, {{method:'DELETE'}});
const d = await r.json();
if (d.error) {{ showResult('Error: ' + d.error, false); return; }}
await loadModels();
}} catch(e) {{ showResult('Error: ' + e.message, false); }}
finally {{ btn.disabled = false; }}
}}
async function setConfig(key, value) {{
const r = await fetch('/config', {{method:'POST', headers:{{'Content-Type':'application/json'}},
body: JSON.stringify({{key, value}})}});
const d = await r.json();
if (d.error) {{ showResult('Error: ' + d.error, false); return; }}
showResult('Config updated: ' + key + ' = ' + value, true);
await loadModels();
}}
async function loadConflicts() {{
const r = await fetch('/conflicts');
const d = await r.json();
@@ -1819,6 +1990,7 @@ ADMIN_HTML = """<!DOCTYPE html>
loadStats();
loadConflicts();
loadLog(0);
loadModels();
</script>
</body>
</html>