From cf40a19b4eed86adf1168cc20c73297b5398089c Mon Sep 17 00:00:00 2001
From: jenstandstad <jens.tandstad@gmail.com>
Date: Wed, 13 May 2026 08:13:20 +0200
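Subject: [PATCH] Project shorthand markers into videos.json before render

- cli: add _project_markers_to_videos, which writes the cutout/layer implied
  by shorthand markers into videos.json before parse_videos reads it; drop
  the post-render _writeback_video_metadata call.
- transformer: hoist the shorthand prefix table to module level and read
  cutout and layer from videos.json instead of per-marker overrides.
- renderer: apply each video source's volume to the tvaud/outroaud audio
  filter chains.
- parser/example: remove the gnommo_scratch setting.
- Add all.sh to force-render video1-video3.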

---
 all.sh                |   9 ++++
 example/project.json  |   3 +-
 gnommo/cli.py         |  60 +++++++++++++++++++++-
 gnommo/parser.py      |   1 -
 gnommo/renderer.py    |  10 +++-
 gnommo/transformer.py | 113 ++++++++++++++++++------------------------
 6 files changed, 125 insertions(+), 71 deletions(-)
 create mode 100755 all.sh

diff --git a/all.sh b/all.sh
new file mode 100755
index 0000000..1d49795
--- /dev/null
+++ b/all.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Run ./gnommo.sh (relative to cwd) for video1-video3; video4-video6 are disabled.
+./gnommo.sh -p video1 all --force --prod
+./gnommo.sh -p video2 all --force --prod
+./gnommo.sh -p video3 all --force --prod
+#./gnommo.sh -p video4 all --force
+#./gnommo.sh -p video5 all --force
+#./gnommo.sh -p video6 all --force
+
diff --git a/example/project.json b/example/project.json
index 4cc3f45..79b79fb 100644
--- a/example/project.json
+++ b/example/project.json
@@ -4,8 +4,7 @@
   "description": "In this video, I demonstrate the Gnommo video editing pipeline - a code-first approach to creating presenter-mode videos from Keynote presentations.",
   "footer": "Subscribe for more tutorials!\nTwitter: @example",
   "resolution": [1920, 1080],
-  "fps": 30,
-  "gnommo_scratch": null,           
+  "fps": 30, 
   "defaultSlideType": "fullscreen",
   "keynote_file": "media/example.key",        
   "transcript": "media/videos/talking_head.transcript.json",
diff --git a/gnommo/cli.py b/gnommo/cli.py
index f5e1310..32cb1ec 100644
--- a/gnommo/cli.py
+++ b/gnommo/cli.py
@@ -2394,6 +2394,57 @@ def _parse_slide_range(slides_arg: str) -> tuple[str, Optional[str]]:
     return start_slide, end_slide
 
 
+def _project_markers_to_videos(
+    markers: list[str], videos_json_path: Path, config
+) -> None:
+    """ETL: project shorthand marker semantics into videos.json.
+
+    Scans the manuscript marker list for shorthand prefixes (vft:, vfb:, vst:,
+    vsb:, vf2t:, vf2b: and their pause variants) and writes the implied cutout
+    and layer values into videos.json.  It is called before parse_videos reads
+    that file, so the render pass needs no shorthand logic of its own.
+
+    The LAST shorthand reference to a given video_id wins.  Markers naming a
+    video that is absent from videos.json are ignored, and the file is only
+    rewritten when a stored value actually changes.  ``config`` is unused.
+    """
+    if not videos_json_path.exists():
+        return
+
+    from .transformer import _SHORTHAND_PREFIXES  # prefix → (cutout, layer)
+
+    # Build projection: video_id → {cutout, layer}; later markers overwrite.
+    projection: dict[str, dict] = {}
+    for marker in markers:
+        for prefix, implied in _SHORTHAND_PREFIXES.items():
+            if marker.startswith(prefix):
+                cutout, layer = implied[:2]
+                projection[marker[len(prefix):]] = {"cutout": cutout, "layer": layer}
+                break
+
+    if not projection:
+        return
+
+    with open(videos_json_path, "r", encoding="utf-8") as f:
+        raw = json.load(f)
+
+    # Record only the entries whose stored values change, so the log is accurate.
+    updated: list[str] = []
+    for video_id, fields in projection.items():
+        entry = raw[video_id] if video_id in raw else None
+        if not isinstance(entry, dict):
+            continue  # unknown video, or an entry that is not a JSON object
+        stale = {k: v for k, v in fields.items() if entry.get(k) != v}
+        if stale:
+            entry.update(stale)
+            updated.append(video_id)
+
+    if updated:
+        with open(videos_json_path, "w", encoding="utf-8") as f:
+            json.dump(raw, f, indent=2, ensure_ascii=False)
+        print(f"  Projected marker semantics → videos.json: {', '.join(updated)}")
+
+
 def _writeback_video_metadata(plan, project_path, config) -> None:
     """Write back cutout/layer derived from shorthand markers to videos.json.
 
@@ -2586,6 +2637,12 @@ def cmd_render(
         save_citations(citations, citations_path)
     config = parse_project_config(project_path)
 
+    # ETL: project shorthand marker semantics (cutout/layer) into videos.json
+    # before parse_videos reads it, so the render pass is purely data-driven.
+    _project_markers_to_videos(
+        markers, project_path / config.videos_path, config
+    )
+
     # Override resolution for preview modes
     if res != "full":
         cfg = RES_CONFIGS[res]
@@ -2732,8 +2789,7 @@ def cmd_render(
     if plan.time_offset > 0:
         print(f"  Time offset: {plan.time_offset:.1f}s (partial render)")
 
-    # Persist shorthand-derived cutout/layer back to videos.json (idempotent)
-    _writeback_video_metadata(plan, project_path, config)
+
 
     # Print detailed render plan with alignment info
     _print_render_plan_details(plan, marker_timings, slides)
diff --git a/gnommo/parser.py b/gnommo/parser.py
index fea68b5..5648926 100644
--- a/gnommo/parser.py
+++ b/gnommo/parser.py
@@ -260,7 +260,6 @@ def parse_project_config(project_path: Path) -> ProjectConfig:
         audio_path=data.get("audio", "audio.json"),
         audio_source=data.get("audio_source"),
         main_video=data.get("main_video"),
-        gnommo_scratch=data.get("gnommo_scratch"),
         process_cache=data.get("process_cache"),
         default_begin=float(data.get("default_begin", 0.0)),
         default_end_trim=float(data.get("default_end_trim", 0.0)),
diff --git a/gnommo/renderer.py b/gnommo/renderer.py
index 8c5bc9f..e2bc2f0 100644
--- a/gnommo/renderer.py
+++ b/gnommo/renderer.py
@@ -1254,10 +1254,13 @@ def build_filter_complex(
             delay_ms = int(event.start_time * 1000)
             label = f"tvaud{i}"
 
+            vol = event.video_source.volume
+            vol_filter = f",volume={vol:.2f}" if vol != 1.0 else ""
             filters.append(
                 f"[{video_idx}:a]atrim=0:{duration:.3f},"
                 f"asetpts=PTS-STARTPTS,"
-                f"adelay={delay_ms}|{delay_ms}[{label}]"
+                f"adelay={delay_ms}|{delay_ms}"
+                f"{vol_filter}[{label}]"
             )
             audio_labels_to_mix.append(f"[{label}]")
 
@@ -1273,10 +1276,13 @@ def build_filter_complex(
             delay_ms = int(event.start_time * 1000)
             label = f"outroaud{i}"
 
+            vol = event.video_source.volume
+            vol_filter = f",volume={vol:.2f}" if vol != 1.0 else ""
             filters.append(
                 f"[{video_idx}:a]atrim=0:{duration:.3f},"
                 f"asetpts=PTS-STARTPTS,"
-                f"adelay={delay_ms}|{delay_ms}[{label}]"
+                f"adelay={delay_ms}|{delay_ms}"
+                f"{vol_filter}[{label}]"
             )
             audio_labels_to_mix.append(f"[{label}]")
 
diff --git a/gnommo/transformer.py b/gnommo/transformer.py
index d276062..bd66176 100644
--- a/gnommo/transformer.py
+++ b/gnommo/transformer.py
@@ -28,6 +28,26 @@ from .transcriber import TranscribedWord
 # Audio trigger offset: play sound this many seconds before the marker
 AUDIO_OFFSET_SECONDS = 1.0
 
+# Shorthand marker prefix → (cutout_name, layer).
+# These are the ETL source-of-truth: when a manuscript contains [vft:X],
+# that projects cutout="fullscreen" and layer="above" into videos.json for X.
+# The pause variants (vftp: etc.) map to the same (cutout, layer) pairs;
+# pause_narration is per-event, derived from the prefix, never stored in videos.json.
+_SHORTHAND_PREFIXES: dict[str, tuple] = {
+    "vft:":  ("fullscreen", "above"),
+    "vfb:":  ("fullscreen", "below"),
+    "vf2t:": ("fullscreen2", "above"),
+    "vf2b:": ("fullscreen2", "below"),
+    "vst:":  ("square", "above"),
+    "vsb:":  ("square", "below"),
+    "vftp:":  ("fullscreen", "above"),
+    "vfbp:":  ("fullscreen", "below"),
+    "vf2tp:": ("fullscreen2", "above"),
+    "vf2bp:": ("fullscreen2", "below"),
+    "vstp:":  ("square", "above"),
+    "vsbp:":  ("square", "below"),
+}
+
 
 @dataclass
 class MarkerTiming:
@@ -961,26 +981,14 @@ def _extract_video_events(
         ]
     )
 
-    # Mapping from shorthand marker prefix → (implied_cutout_name, implied_layer)
-    # These are the defaults; videos.json values act as a base but the marker wins.
-    _SHORTHAND: dict[str, tuple[str, str]] = {
-        "vft:": ("fullscreen", "above"),
-        "vfb:": ("fullscreen", "below"),
-        "vf2t:": ("fullscreen2", "above"),
-        "vf2b:": ("fullscreen2", "below"),
-        "vst:": ("square", "above"),
-        "vsb:": ("square", "below"),
-        "vftp:": ("fullscreen", "above", "pause_narration"),
-        "vfbp:": ("fullscreen", "below", "pause_narration"),
-        "vf2tp:": ("fullscreen2", "above", "pause_narration"),
-        "vf2bp:": ("fullscreen2", "below", "pause_narration"),
-        "vstp:": ("square", "above", "pause_narration"),
-        "vsbp:": ("square", "below", "pause_narration"),
-    }
+    # Pause-variant prefixes — the only thing the render pass still needs from
+    # shorthand markers at event-build time (pause_narration is per-event, not stored in videos.json).
+    _PAUSE_PREFIXES = {"vftp:", "vfbp:", "vf2tp:", "vf2bp:", "vstp:", "vsbp:"}
 
-    # Collect video markers: (time, video_id, event_type, cutout_name_override, layer_override)
-    # event_type is "video" (ends at next slide) or "narration" (runs to end)
-    video_markers: list[tuple[float, str, str, str | None, str | None]] = []
+    # Collect video markers as (timestamp, video_id, event_type, pause_narration).
+    # event_type is "video" (ends at next slide) or "narration" (runs to end).
+    # cutout and layer are read from videos.json (projected there by _project_markers_to_videos)
+    video_markers: list[tuple[float, str, str, bool]] = []
 
     for timing in marker_timings:
         if timing.timestamp < 0:
@@ -988,26 +996,26 @@ def _extract_video_events(
 
         mid = timing.marker_id
 
-        # --- shorthand markers: vft/vfb/vst/vsb ---
-        shorthand_match = next((p for p in _SHORTHAND if mid.startswith(p)), None)
+        # --- shorthand markers (vft:/vfb:/vst:/vsb: and pause variants) ---
+        shorthand_match = next((p for p in _SHORTHAND_PREFIXES if mid.startswith(p)), None)
         if shorthand_match:
-            video_id = mid[len(shorthand_match) :]
+            video_id = mid[len(shorthand_match):]
             if video_id not in videos:
                 warnings.append(
                     f"[{mid}] references unknown video '{video_id}' — skipped. "
                     f"Add it to videos.json or remove the marker."
                 )
                 continue
-            implied_cutout, implied_layer = _SHORTHAND[shorthand_match]
-            if implied_cutout not in cutouts:
+            # Validate that the video's cutout (projected into videos.json by the ETL) is set and defined
+            video_source = videos[video_id]
+            if not video_source.cutout or video_source.cutout not in cutouts:
                 warnings.append(
-                    f"[{mid}] requires cutout '{implied_cutout}' which is not defined in project config — skipped. "
-                    f"Available cutouts: {list(cutouts.keys())}"
+                    f"[{mid}] video '{video_id}' has no valid cutout in videos.json — "
+                    f"run render once to project values, or set cutout manually."
                 )
                 continue
-            video_markers.append(
-                (timing.timestamp, video_id, "video", implied_cutout, implied_layer)
-            )
+            pause_narration = shorthand_match in _PAUSE_PREFIXES
+            video_markers.append((timing.timestamp, video_id, "video", pause_narration))
             continue
 
         # --- legacy [video:xxx] ---
@@ -1015,23 +1023,16 @@ def _extract_video_events(
             video_id = mid[6:]
             if video_id not in videos:
                 warnings.append(
-                    f"[video:{video_id}] references unknown video '{video_id}' — skipped. "
-                    f"Add it to videos.json or remove the marker."
+                    f"[video:{video_id}] references unknown video '{video_id}' — skipped."
                 )
                 continue
             video_source = videos[video_id]
-            if not video_source.cutout:
+            if not video_source.cutout or video_source.cutout not in cutouts:
                 warnings.append(
-                    f"[video:{video_id}] has no 'cutout' set in videos.json — skipped."
+                    f"[video:{video_id}] has no valid cutout in videos.json — skipped."
                 )
                 continue
-            if video_source.cutout not in cutouts:
-                warnings.append(
-                    f"[video:{video_id}] cutout '{video_source.cutout}' is not defined in project config — skipped. "
-                    f"Available: {list(cutouts.keys())}"
-                )
-                continue
-            video_markers.append((timing.timestamp, video_id, "video", None, None))
+            video_markers.append((timing.timestamp, video_id, "video", False))
             continue
 
         # --- [narration:xxx] ---
@@ -1039,41 +1040,25 @@ def _extract_video_events(
             video_id = mid[10:]
             if video_id not in videos:
                 warnings.append(
-                    f"[narration:{video_id}] references unknown video '{video_id}' — skipped. "
-                    f"Add it to videos.json or remove the marker."
+                    f"[narration:{video_id}] references unknown video '{video_id}' — skipped."
                 )
                 continue
             video_source = videos[video_id]
-            if not video_source.cutout:
+            if not video_source.cutout or video_source.cutout not in cutouts:
                 warnings.append(
-                    f"[narration:{video_id}] has no 'cutout' set in videos.json — skipped."
+                    f"[narration:{video_id}] has no valid cutout in videos.json — skipped."
                 )
                 continue
-            if video_source.cutout not in cutouts:
-                warnings.append(
-                    f"[narration:{video_id}] cutout '{video_source.cutout}' is not defined in project config — skipped. "
-                    f"Available: {list(cutouts.keys())}"
-                )
-                continue
-            video_markers.append((timing.timestamp, video_id, "narration", None, None))
+            video_markers.append((timing.timestamp, video_id, "narration", False))
 
     events: list[VideoEvent] = []
-    for (
-        start_time,
-        video_id,
-        marker_type,
-        cutout_override,
-        layer_override,
-    ) in video_markers:
+    for (start_time, video_id, marker_type, pause_narration) in video_markers:
         video_source = videos[video_id]
 
-        # Resolve cutout: marker override > videos.json cutout
-        # (validation already ensured cutout exists — this is a safety assertion)
-        cutout_name = cutout_override or video_source.cutout
+        # Read cutout and layer directly from videos.json (projected by ETL)
+        cutout_name = video_source.cutout
         cutout = cutouts[cutout_name]
-
-        # Resolve layer: marker override > videos.json layer
-        layer = layer_override if layer_override is not None else video_source.layer
+        layer = video_source.layer
 
         end_on = video_source.end_on
         if end_on == "take" and video_source.take is not None: