From 7f7425da46a982a6fb1339323a648151d9aafd64 Mon Sep 17 00:00:00 2001
From: jenstandstad <jens.tandstad@gmail.com>
Date: Mon, 12 Jan 2026 11:49:46 +0100
Subject: [PATCH] Add configurable slides path and malformed marker detection

- project.json now supports an optional "slides" field giving the path to
  slides.json relative to the project directory (default: "slides.json")
- Slide images are loaded from the same directory as slides.json
  (previously always <project>/media/slides)
- Validation detects malformed [S<n>] markers (missing closing ], stray
  spaces inside the brackets)
- Reports the line number of each malformed marker

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 gnommo/cli.py         | 12 ++++++------
 gnommo/models.py      |  2 ++
 gnommo/parser.py      | 40 +++++++++++++++++++++++++++++++++-------
 gnommo/renderer.py    |  6 +++---
 gnommo/transformer.py |  5 +++++
 gnommo/validator.py   | 20 ++++++++++++++++----
 6 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/gnommo/cli.py b/gnommo/cli.py
index b68caa2..130e0db 100644
--- a/gnommo/cli.py
+++ b/gnommo/cli.py
@@ -112,13 +112,13 @@ def cmd_validate(project_path: Path) -> int:
     print(f"Validating project: {project_path}")
 
     # Parse all files
-    _, markers = parse_manuscript(project_path)
+    _, markers, malformed = parse_manuscript(project_path)
     config = parse_project_config(project_path)
-    slides = parse_slides(project_path)
+    slides = parse_slides(project_path, config)
     videos = parse_videos(project_path)
 
     # Validate
-    validate_project(project_path, markers, config, slides, videos)
+    validate_project(project_path, markers, config, slides, videos, malformed)
 
     print("Validation passed.")
     return 0
@@ -132,9 +132,9 @@ def cmd_render(project_path: Path, output_path: Path, verbose: bool, dry_run: bo
 
     # Stage 1: Extract
     print("Stage 1/4: Parsing input files...")
-    _, markers = parse_manuscript(project_path)
+    _, markers, malformed = parse_manuscript(project_path)
     config = parse_project_config(project_path)
-    slides = parse_slides(project_path)
+    slides = parse_slides(project_path, config)
     videos = parse_videos(project_path)
     transcript = parse_transcript(project_path)
 
@@ -145,7 +145,7 @@ def cmd_render(project_path: Path, output_path: Path, verbose: bool, dry_run: bo
 
     # Stage 2: Validate
     print("Stage 2/4: Validating...")
-    validate_project(project_path, markers, config, slides, videos)
+    validate_project(project_path, markers, config, slides, videos, malformed)
     print("  - Validation passed")
     print()
 
diff --git a/gnommo/models.py b/gnommo/models.py
index eb238ee..fc7061a 100644
--- a/gnommo/models.py
+++ b/gnommo/models.py
@@ -21,6 +21,7 @@ class ProjectConfig:
     talking_head: TalkingHeadConfig
     default_slide_type: str
     background_video: str
+    slides_path: str = "slides.json"  # path to slides.json relative to project
     audio_source: Optional[str] = None  # defaults to talking head
 
 
@@ -75,6 +76,7 @@ class RenderPlan:
     slide_events: list[SlideEvent]
     total_duration: float
     slides: dict[str, SlideDefinition]
+    slides_dir: Path = None  # directory containing slide images
 
 
 # Slide layout configurations (hardcoded for POC)
diff --git a/gnommo/parser.py b/gnommo/parser.py
index 253640b..9b9fe71 100644
--- a/gnommo/parser.py
+++ b/gnommo/parser.py
@@ -16,12 +16,12 @@ from .models import (
 )
 
 
-def parse_manuscript(project_path: Path) -> tuple[str, list[str]]:
+def parse_manuscript(project_path: Path) -> tuple[str, list[str], list[tuple[int, str]]]:
     """
     Parse manuscript.txt and extract text content and slide markers.
 
     Returns:
-        Tuple of (full text, list of marker IDs found)
+        Tuple of (full text, list of marker IDs found, list of malformed markers as (line_num, text))
     """
     manuscript_path = project_path / "manuscript.txt"
 
@@ -30,10 +30,32 @@ def parse_manuscript(project_path: Path) -> tuple[str, list[str]]:
 
     text = manuscript_path.read_text(encoding="utf-8")
 
-    # Extract all slide markers like [S1], [S2], etc.
+    # Extract all valid slide markers like [S1], [S2], etc.
     markers = re.findall(r"\[([A-Za-z0-9_]+)\]", text)
 
-    return text, markers
+    # Find malformed markers (missing brackets, extra spaces, etc.)
+    malformed: list[tuple[int, str]] = []
+    lines = text.split("\n")
+
+    for line_num, line in enumerate(lines, start=1):
+        # Pattern for potential markers that are malformed:
+        # - Missing closing bracket: [S1 or [S12 (not followed by ])
+        # - Extra spaces: [S 1] or [S1 ] or [ S1]
+
+        # Find unclosed brackets: [S followed by digits, then space/newline/EOF (not ])
+        # Match [S1, [S12, [S123 etc that are NOT followed by ]
+        for match in re.finditer(r"\[S\d+", line):
+            start, end = match.span()
+            # Check if there's a ] immediately after
+            if end >= len(line) or line[end] != "]":
+                malformed.append((line_num, match.group()))
+
+        # Find markers with internal/trailing spaces like [S 1] or [S1 ] or [ S1]
+        spaced = re.findall(r"\[\s+S\d+\s*\]|\[S\d+\s+\]|\[S\s+\d+\]", line)
+        for match in spaced:
+            malformed.append((line_num, match))
+
+    return text, markers, malformed
 
 
 def parse_transcript(project_path: Path) -> list[TimedWord]:
@@ -108,6 +130,7 @@ def parse_project_config(project_path: Path) -> ProjectConfig:
         talking_head=talking_head,
         default_slide_type=data.get("defaultSlideType", "square"),
         background_video=data.get("background_video", ""),
+        slides_path=data.get("slides", "slides.json"),
         audio_source=data.get("audio_source"),
     )
 
@@ -123,12 +146,15 @@ def _parse_dimension(value: Any) -> int:
     return 200  # default
 
 
-def parse_slides(project_path: Path) -> dict[str, SlideDefinition]:
+def parse_slides(project_path: Path, config: ProjectConfig = None) -> dict[str, SlideDefinition]:
     """Parse slides.json into slide definitions."""
-    slides_path = project_path / "slides.json"
+    if config and config.slides_path:
+        slides_path = project_path / config.slides_path
+    else:
+        slides_path = project_path / "slides.json"
 
     if not slides_path.exists():
-        raise ParseError("slides.json not found", slides_path)
+        raise ParseError(f"slides file not found: {slides_path}", slides_path)
 
     try:
         data = json.loads(slides_path.read_text(encoding="utf-8"))
diff --git a/gnommo/renderer.py b/gnommo/renderer.py
index 165336f..a690a0b 100644
--- a/gnommo/renderer.py
+++ b/gnommo/renderer.py
@@ -59,13 +59,13 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
         bg_path = project_path / plan.config.background_video
         cmd.extend(["-i", str(bg_path)])
 
-    # Input: slide images
-    slides_path = project_path / "media" / "slides"
+    # Input: slide images (from slides_dir, same directory as slides.json)
+    slides_dir = plan.slides_dir.resolve() if plan.slides_dir else project_path / "media" / "slides"
     slide_inputs: list[str] = []  # Track which slides we've added
 
     for event in plan.slide_events:
         if event.slide_id not in slide_inputs:
-            image_path = slides_path / event.slide_def.image
+            image_path = slides_dir / event.slide_def.image
             cmd.extend(["-i", str(image_path)])
             slide_inputs.append(event.slide_id)
 
diff --git a/gnommo/transformer.py b/gnommo/transformer.py
index 3ecc609..149b167 100644
--- a/gnommo/transformer.py
+++ b/gnommo/transformer.py
@@ -37,6 +37,10 @@ def build_render_plan(
     # Build slide events from transcript markers
     slide_events = _extract_slide_events(transcript, slides, total_duration)
 
+    # Derive slides directory from slides_path
+    slides_json_path = project_path / config.slides_path
+    slides_dir = slides_json_path.parent
+
     return RenderPlan(
         project_path=project_path,
         config=config,
@@ -44,6 +48,7 @@ def build_render_plan(
         slide_events=slide_events,
         total_duration=total_duration,
         slides=slides,
+        slides_dir=slides_dir,
     )
 
 
diff --git a/gnommo/validator.py b/gnommo/validator.py
index 8d5063c..b8e8a87 100644
--- a/gnommo/validator.py
+++ b/gnommo/validator.py
@@ -12,6 +12,7 @@ def validate_project(
     config: ProjectConfig,
     slides: dict[str, SlideDefinition],
     videos: dict[str, VideoSource],
+    malformed_markers: list[tuple[int, str]] = None,
 ) -> None:
     """
     Validate all parsed project data. Raises ValidationError if any issues found.
@@ -22,9 +23,19 @@ def validate_project(
     - All video files exist on disk
     - Background video exists (if specified)
     - Slide types are valid
+    - No malformed markers in manuscript
     """
     issues: list[ValidationIssue] = []
 
+    # Check for malformed markers first (these are likely typos)
+    if malformed_markers:
+        for line_num, marker_text in malformed_markers:
+            issues.append(ValidationIssue(
+                f"Malformed marker: {marker_text}",
+                project_path / "manuscript.txt",
+                line_num
+            ))
+
     # Check all manuscript markers have corresponding slides
     for marker in manuscript_markers:
         if marker not in slides:
@@ -34,15 +45,16 @@ def validate_project(
             ))
 
     # Check all slide images exist
-    media_path = project_path / "media"
-    slides_path = media_path / "slides"
+    # Slides are in the same directory as the slides.json file
+    slides_json_path = project_path / config.slides_path
+    slides_dir = slides_json_path.parent
 
     for slide_id, slide_def in slides.items():
-        image_path = slides_path / slide_def.image
+        image_path = slides_dir / slide_def.image
         if not image_path.exists():
             issues.append(ValidationIssue(
                 f"Slide image not found: {slide_def.image}",
-                project_path / "slides.json"
+                slides_json_path
             ))
 
         # Check slide type is valid