From feb4df050676c792e752d092e44dc5c55b009b7e Mon Sep 17 00:00:00 2001
From: jenstandstad <jens.tandstad@gmail.com>
Date: Mon, 11 May 2026 21:45:30 +0200
Subject: [PATCH] Add res preset to preprocess; refine trim and marker alignment

---
 gnommo/cli.py          |  50 +++++++++++++----
 gnommo/preprocessor.py |  14 +++++
 gnommo/transformer.py  | 121 ++++++++++++++++++++++++++++++-----------
 3 files changed, 142 insertions(+), 43 deletions(-)
diff --git a/gnommo/cli.py b/gnommo/cli.py
index b0124fc..56674b0 100644
--- a/gnommo/cli.py
+++ b/gnommo/cli.py
@@ -234,7 +234,7 @@ Examples:
                 args.res,
             )
         elif action == "trim":
-            return cmd_trim(project_path, args.verbose, args.force, args.threshold)
+            return cmd_trim(project_path, args.verbose, args.force, args.threshold, args.res)
         elif action == "transcode":
             return cmd_transcode(
                 project_path,
@@ -1197,7 +1197,7 @@ def cmd_preprocess(
     """
     from concurrent.futures import ThreadPoolExecutor, as_completed
     from .parser import parse_project_config, parse_videos
-    from .preprocessor import preprocess_video
+    from .preprocessor import preprocess_video, RES_CONFIGS
     from .models import VideoSource as _VideoSource
 
     mode_str = f" ({res.upper()})" if res != "full" else ""
@@ -1278,7 +1278,15 @@ def cmd_preprocess(
         if using_compressed and segment_id.endswith("_compressed"):
             segment_id = segment_id[: -len("_compressed")]
 
-        output_file = f"processed/{segment_id}_processed.mov"
+        # For non-full res, write into the res subdir so stitch --res low finds the
+        # files at narration/low/processed/ (narration.json still records the plain
+        # "processed/..." path; stitch shifts the base dir itself).
+        _res_cfg = RES_CONFIGS.get(res) if res != "full" else None
+        if _res_cfg:
+            _, _, _subdir = _res_cfg
+            output_file = f"{_subdir}/processed/{segment_id}_processed.mov"
+        else:
+            output_file = f"processed/{segment_id}_processed.mov"
         output_path = narration_dir / output_file
 
         if output_path.exists() and not force:
@@ -1343,6 +1351,7 @@ def cmd_preprocess(
                 verbose=False,
                 force=force,
                 custom_gnommo_scratch=gnommo_scratch,
+                res=res,
             )
             return task
 
@@ -1371,6 +1380,7 @@ def cmd_preprocess(
                 verbose,
                 force,
                 gnommo_scratch,
+                res=res,
             )
             output_path = narration_dir / segment_source.output_file
             if output_path.exists():
@@ -1396,8 +1406,8 @@ def cmd_preprocess(
         for key in _PRESERVE_KEYS:
             if key in existing_entry:
                 entry[key] = existing_entry[key]
-        # Point source_file to the processed output
-        entry["source_file"] = segment_source.output_file
+        # Always record the plain path; stitch shifts the base dir for low/tiny.
+        entry["source_file"] = f"processed/{segment_id}_processed.mov"
         entry.setdefault("use_audio_channels", "auto")
         entry.setdefault("defer_loudnorm", True)
         existing_narration[segment_id] = entry
@@ -1437,7 +1447,7 @@ def cmd_preprocess(
                 continue
             print(f"    Processing: {video_id}")
             preprocess_video(
-                videos_dir, video_id, video_source, verbose, force, gnommo_scratch
+                videos_dir, video_id, video_source, verbose, force, gnommo_scratch, res=res
             )
 
     print("\nPreprocessing complete.")
@@ -1454,6 +1464,7 @@ def cmd_trim(
     verbose: bool,
     force: bool = False,
     threshold_db: float = -40.0,
+    res: str = "full",
 ) -> int:
     """
     Auto-detect silence bounds for all narration segments and write skip/take
@@ -1482,6 +1493,22 @@ def cmd_trim(
         print("  Run 'gnommo -p <project> import' first.")
         return 1
 
+    # Build a lookup of raw source files by segment ID. Raw files give cleaner
+    # silence detection — loudnorm can introduce early peaks in processed audio.
+    _video_exts = {".mov", ".mp4", ".avi", ".mkv", ".m4v"}
+    raw_dir = narration_dir / "raw_mov"
+    compressed_dir = narration_dir / "raw_mp4"
+
+    raw_lookup: dict[str, Path] = {}
+    for search_dir in (raw_dir, compressed_dir):
+        if search_dir.exists():
+            for f in search_dir.iterdir():
+                if f.is_file() and f.suffix.lower() in _video_exts and not f.name.startswith("."):
+                    stem = f.stem
+                    if stem.endswith("_compressed"):
+                        stem = stem[: -len("_compressed")]
+                    raw_lookup[stem] = f
+
     narration_json_path = narration_dir / "narration.json"
     raw_data: dict = _read_json(narration_json_path)
 
@@ -1495,14 +1522,15 @@ def cmd_trim(
             print(f"  {seg_id}: already trimmed, skipping (use --force to redo)")
             continue
 
-        # Always analyse the raw source file — it's always present and has the
-        # same audio as any processed version (processing is video-only).
-        source_path = narration_dir / seg.source_file
+        # Prefer raw file; fall back to processed if raw not available.
+        source_path = raw_lookup.get(seg_id)
+        if source_path is None:
+            source_path = narration_dir / seg.source_file
         if not source_path.exists():
-            print(f"  {seg_id}: source file not found ({seg.source_file}), skipping")
+            print(f"  {seg_id}: source file not found, skipping")
             continue
 
-        print(f"  {seg_id}: analysing...", end="", flush=True)
+        print(f"  {seg_id}: analysing {source_path.parent.name}/{source_path.name}...", end="", flush=True)
         first_sound, last_sound = detect_silence_bounds(
             source_path, noise_threshold_db=threshold_db, verbose=verbose
         )
diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py
index 92e09bb..544c5fe 100644
--- a/gnommo/preprocessor.py
+++ b/gnommo/preprocessor.py
@@ -550,6 +550,7 @@ def preprocess_video(
     verbose: bool = False,
     force: bool = False,
     custom_gnommo_scratch: Optional[Path] = None,
+    res: str = "full",
 ) -> Path:
     """
     Apply preprocessing filters to a video source.
@@ -562,6 +563,7 @@ def preprocess_video(
         video_id: ID of the video being processed
         video_source: VideoSource with source_file, filter, and output_file
         custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD)
+        res: Resolution preset — when not "full" and listed in RES_CONFIGS, source is downscaled before filtering
 
     Returns:
         Path to the final preprocessed output file.
@@ -586,6 +588,18 @@ def preprocess_video(
             filter_type=None,
         )
 
+    # For non-full res, downscale the raw source first so all subsequent
+    # filters (chroma key, color grade, etc.) operate on the small file.
+    if res != "full":
+        cfg = RES_CONFIGS.get(res)
+        if cfg:
+            width, height, _ = cfg
+            print(f"        Downscaling source to {width}x{height} ({res})...")
+            raw_low_dir = gnommo_scratch / f"raw_{res}"
+            current_input = create_downscaled_video(
+                current_input, raw_low_dir, width, height, force
+            )
+
     # Resolve channel setting (auto-detect if needed) and sanity check
     channel = video_source.use_audio_channels
     if channel == "auto":
diff --git a/gnommo/transformer.py b/gnommo/transformer.py
index 5b84fbd..d80db6b 100644
--- a/gnommo/transformer.py
+++ b/gnommo/transformer.py
@@ -182,14 +182,17 @@ def _extract_marker_contexts(
     slides: dict = None,
     videos: dict = None,
     audio: dict = None,
-) -> list[tuple[str, str, bool]]:
+) -> list[tuple[str, str, bool, str]]:
     """
     Extract known markers and the text immediately following them from manuscript.
 
     Unknown markers are filtered out and stripped from following text.
     Note: [cite:...] markers are already stripped at parse time.
 
-    Returns list of (marker_id, following_text, is_borrowed) tuples for known markers only.
+    Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples.
+    anchor_type is "before" (default — place before the matched phrase) or
+    "after" (place at the end of the matched phrase — used for markers that
+    trail a narration block and have no following text of their own).
     """
     slides = slides or {}
     videos = videos or {}
@@ -227,7 +230,7 @@ def _extract_marker_contexts(
     for i, (marker_id, following_text) in enumerate(raw_contexts):
         if following_text:
             words = following_text.split()[:10]
-            contexts.append((marker_id, " ".join(words), False))
+            contexts.append((marker_id, " ".join(words), False, "before"))
         else:
             borrowed = False
             for j in range(i + 1, len(raw_contexts)):
@@ -236,11 +239,24 @@ def _extract_marker_contexts(
                     if next_marker_id in (slides or {}):
                         break
                     words = next_text.split()[:10]
-                    contexts.append((marker_id, " ".join(words), True))
+                    contexts.append((marker_id, " ".join(words), True, "before"))
                     borrowed = True
                     break
             if not borrowed:
-                contexts.append((marker_id, "", False))
+                # Nothing borrowed (slide boundary hit, or no later marker has
+                # text) — look backward for the tail of the preceding narration
+                # and anchor to the END of those words instead of extrapolating.
+                preceding_text = ""
+                for k in range(i - 1, -1, -1):
+                    if raw_contexts[k][1]:
+                        preceding_text = raw_contexts[k][1]
+                        break
+                if preceding_text:
+                    words = preceding_text.split()
+                    tail = " ".join(words[-6:])
+                    contexts.append((marker_id, tail, False, "after"))
+                else:
+                    contexts.append((marker_id, "", False, "before"))
 
     return contexts
 
@@ -250,13 +266,18 @@ def _fuzzy_match_ratio(
     transcription: list[TranscribedWord],
     start_idx: int,
     window_size: int = 10,
+    pre_filler: int = 30,
+    inter_filler: int = 3,
 ) -> tuple[float, int, int]:
     """
     Calculate how many words from phrase match the transcription at start_idx.
 
-    Words are matched sequentially: each phrase word must appear at or after
-    the position of the previous match. This prevents false matches where
-    phrase words appear out of order or far into the window.
+    Words are matched sequentially. Two separate filler tolerances:
+    - pre_filler:   max words skipped before the first MATCHED phrase word (absorbs ad-libs)
+    - inter_filler: max words between consecutive phrase words (keeps the
+                    match tight so common words don't stretch the window far
+                    into later text, which would push last_idx past subsequent
+                    markers' positions)
 
     Returns (ratio, first_match_offset, last_match_end_offset) where offsets
     are relative to start_idx. last_match_end_offset points past the last
@@ -265,14 +286,13 @@ def _fuzzy_match_ratio(
     if not phrase_words:
         return 0.0, 0, 0
 
-    words_to_check = min(len(phrase_words), window_size)
-    # +30 filler allowance: absorbs ad-libbed words spoken before or between
-    # the manuscript cue words without breaking the match ratio.
-    transcript_end = min(start_idx + words_to_check + 30, len(transcription))
-
     if start_idx >= len(transcription):
         return 0.0, 0, 0
 
+    words_to_check = min(len(phrase_words), window_size)
+    # Window spans pre_filler + phrase words + one inter_filler of slack (total, not per gap)
+    transcript_end = min(start_idx + pre_filler + words_to_check + inter_filler, len(transcription))
+
     transcript_words = [
         _normalize_token(transcription[j].word)
         for j in range(start_idx, transcript_end)
@@ -290,7 +310,14 @@ def _fuzzy_match_ratio(
             continue
         words_checked += 1
 
-        for j in range(t_pos, len(transcript_words)):
+        # First phrase word may be preceded by a long ad-lib; subsequent words
+        # should appear within a few positions of each other.
+        if matches == 0:
+            search_end = min(t_pos + pre_filler + 1, len(transcript_words))
+        else:
+            search_end = min(t_pos + inter_filler + 1, len(transcript_words))
+
+        for j in range(t_pos, search_end):
             t_word = transcript_words[j]
             matched = False
             if normalized == t_word:
@@ -344,7 +371,11 @@ def _find_phrase_timestamp(
             best_first_offset = first_offset
             best_end_offset = end_offset
 
-        if ratio >= 0.95:
+        # Sequential alignment: stop at the first position that clears the
+        # threshold. Continuing to scan the full transcript risks jumping
+        # to a higher-ratio match much later and skipping over subsequent
+        # markers' positions entirely.
+        if best_ratio >= fuzzy_threshold:
             break
 
     if best_ratio >= fuzzy_threshold and best_idx >= 0:
@@ -393,8 +424,8 @@ def align_markers_to_transcription(
     last_idx = 0
     last_end_time = 0.0
 
-    for marker_id, following_text, is_borrowed in contexts:
-        if not following_text.strip():
+    for marker_id, anchor_text, is_borrowed, anchor_type in contexts:
+        if not anchor_text.strip():
             marker_time = last_end_time + 1.0
             timings.append(
                 MarkerTiming(
@@ -408,34 +439,50 @@ def align_markers_to_transcription(
             continue
 
         idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp(
-            following_text,
+            anchor_text,
             transcription,
             start_from=last_idx,
             fuzzy_threshold=fuzzy_threshold,
         )
 
         if idx >= 0:
-            adjusted_time = max(0.0, timestamp - 0.5)
-            timings.append(
-                MarkerTiming(
-                    marker_id=marker_id,
-                    timestamp=adjusted_time,
-                    context=following_text[:50],
-                    confidence=confidence,
+            if anchor_type == "after":
+                # Marker trails a narration block — place it at the END of the
+                # matched phrase (when those words finish being spoken).
+                end_idx = min(match_end_idx - 1, len(transcription) - 1)
+                marker_time = transcription[end_idx].end if transcription else 0.0
+                timings.append(
+                    MarkerTiming(
+                        marker_id=marker_id,
+                        timestamp=marker_time,
+                        context=f"(end of: {anchor_text[:40]})",
+                        confidence=confidence,
+                    )
                 )
-            )
-            if not is_borrowed:
                 last_idx = match_end_idx
-                if last_idx > 0 and last_idx <= len(transcription):
-                    last_end_time = transcription[last_idx - 1].end
-                else:
-                    last_end_time = transcription[-1].end if transcription else 0.0
+                last_end_time = marker_time
+            else:
+                adjusted_time = max(0.0, timestamp - 0.5)
+                timings.append(
+                    MarkerTiming(
+                        marker_id=marker_id,
+                        timestamp=adjusted_time,
+                        context=anchor_text[:50],
+                        confidence=confidence,
+                    )
+                )
+                if not is_borrowed:
+                    last_idx = match_end_idx
+                    if last_idx > 0 and last_idx <= len(transcription):
+                        last_end_time = transcription[last_idx - 1].end
+                    else:
+                        last_end_time = transcription[-1].end if transcription else 0.0
         else:
             timings.append(
                 MarkerTiming(
                     marker_id=marker_id,
                     timestamp=-1.0,
-                    context=following_text[:50],
+                    context=anchor_text[:50],
                     confidence=0.0,
                 )
             )
@@ -696,6 +743,16 @@ def build_render_plan(
     # Save narration end time (before outro)
     narration_end_time = total_duration
 
+    # Resolve any outro videos missing from videos.json via shared_assets.
+    if config.outro:
+        missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos]
+        if missing_outro_ids:
+            found = resolve_missing_videos(missing_outro_ids, project_path, config)
+            videos.update(found)
+        still_missing = [vid_id for vid_id in config.outro if vid_id not in videos]
+        for vid_id in still_missing:
+            print(f"  WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True)
+
     # Build outro events (plays after narration ends)
     outro_events = _extract_outro_events(
         config.outro,