Add res-aware preprocess/trim, stricter marker alignment, and outro video fallback

2026-05-11 21:45:30 +02:00
parent b9376cd650
commit feb4df0506
3 changed files with 142 additions and 43 deletions
@@ -234,7 +234,7 @@ Examples:
                args.res,
            )
        elif action == "trim":
-            return cmd_trim(project_path, args.verbose, args.force, args.threshold)
+            return cmd_trim(project_path, args.verbose, args.force, args.threshold, args.res)
        elif action == "transcode":
            return cmd_transcode(
                project_path,
@@ -1197,7 +1197,7 @@ def cmd_preprocess(
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from .parser import parse_project_config, parse_videos
-    from .preprocessor import preprocess_video
+    from .preprocessor import preprocess_video, RES_CONFIGS
    from .models import VideoSource as _VideoSource

    mode_str = f" ({res.upper()})" if res != "full" else ""
@@ -1278,6 +1278,14 @@ def cmd_preprocess(
        if using_compressed and segment_id.endswith("_compressed"):
            segment_id = segment_id[: -len("_compressed")]

+        # For non-full res, write into the res subdir so stitch --res low finds the
+        # files at narration/low/processed/ (narration.json still records the plain
+        # "processed/..." path; stitch shifts the base dir itself).
+        _res_cfg = RES_CONFIGS.get(res) if res != "full" else None
+        if _res_cfg:
+            _, _, _subdir = _res_cfg
+            output_file = f"{_subdir}/processed/{segment_id}_processed.mov"
+        else:
            output_file = f"processed/{segment_id}_processed.mov"
        output_path = narration_dir / output_file

@@ -1343,6 +1351,7 @@ def cmd_preprocess(
                verbose=False,
                force=force,
                custom_gnommo_scratch=gnommo_scratch,
+                res=res,
            )
            return task

@@ -1371,6 +1380,7 @@ def cmd_preprocess(
                verbose,
                force,
                gnommo_scratch,
+                res=res,
            )
            output_path = narration_dir / segment_source.output_file
            if output_path.exists():
@@ -1396,8 +1406,8 @@ def cmd_preprocess(
        for key in _PRESERVE_KEYS:
            if key in existing_entry:
                entry[key] = existing_entry[key]
-        # Point source_file to the processed output
-        entry["source_file"] = segment_source.output_file
+        # Always record the plain path; stitch shifts the base dir for low/tiny.
+        entry["source_file"] = f"processed/{segment_id}_processed.mov"
        entry.setdefault("use_audio_channels", "auto")
        entry.setdefault("defer_loudnorm", True)
        existing_narration[segment_id] = entry
@@ -1437,7 +1447,7 @@ def cmd_preprocess(
                continue
            print(f"    Processing: {video_id}")
            preprocess_video(
-                videos_dir, video_id, video_source, verbose, force, gnommo_scratch
+                videos_dir, video_id, video_source, verbose, force, gnommo_scratch, res=res
            )

    print("\nPreprocessing complete.")
@@ -1454,6 +1464,7 @@ def cmd_trim(
    verbose: bool,
    force: bool = False,
    threshold_db: float = -40.0,
+    res: str = "full",
 ) -> int:
    """
    Auto-detect silence bounds for all narration segments and write skip/take
@@ -1482,6 +1493,22 @@ def cmd_trim(
        print("  Run 'gnommo -p <project> import' first.")
        return 1

+    # Map segment ID -> raw source file (a raw_mp4 entry replaces raw_mov's).
+    # Raw files give cleaner silence detection — loudnorm can add early peaks.
+    _video_exts = {".mov", ".mp4", ".avi", ".mkv", ".m4v"}
+    raw_dir = narration_dir / "raw_mov"
+    compressed_dir = narration_dir / "raw_mp4"
+
+    raw_lookup: dict[str, Path] = {}
+    for search_dir in (raw_dir, compressed_dir):
+        if search_dir.exists():
+            for f in search_dir.iterdir():
+                if f.is_file() and f.suffix.lower() in _video_exts and not f.name.startswith("."):
+                    stem = f.stem
+                    if stem.endswith("_compressed"):
+                        stem = stem[: -len("_compressed")]
+                    raw_lookup[stem] = f
+
    narration_json_path = narration_dir / "narration.json"
    raw_data: dict = _read_json(narration_json_path)

@@ -1495,14 +1522,15 @@ def cmd_trim(
            print(f"  {seg_id}: already trimmed, skipping (use --force to redo)")
            continue

-        # Always analyse the raw source file — it's always present and has the
-        # same audio as any processed version (processing is video-only).
+        # Prefer raw file; fall back to processed if raw not available.
+        source_path = raw_lookup.get(seg_id)
+        if source_path is None:
            source_path = narration_dir / seg.source_file
        if not source_path.exists():
-            print(f"  {seg_id}: source file not found ({seg.source_file}), skipping")
+            print(f"  {seg_id}: source file not found, skipping")
            continue

-        print(f"  {seg_id}: analysing...", end="", flush=True)
+        print(f"  {seg_id}: analysing {source_path.parent.name}/{source_path.name}...", end="", flush=True)
        first_sound, last_sound = detect_silence_bounds(
            source_path, noise_threshold_db=threshold_db, verbose=verbose
        )
@@ -550,6 +550,7 @@ def preprocess_video(
    verbose: bool = False,
    force: bool = False,
    custom_gnommo_scratch: Optional[Path] = None,
+    res: str = "full",
 ) -> Path:
    """
    Apply preprocessing filters to a video source.
@@ -562,6 +563,7 @@ def preprocess_video(
        video_id: ID of the video being processed
        video_source: VideoSource with source_file, filter, and output_file
        custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD)
+        res: Resolution preset — when not "full", source is downscaled before filtering

    Returns:
        Path to the final preprocessed output file.
@@ -586,6 +588,18 @@ def preprocess_video(
            filter_type=None,
        )

+    # For non-full res, downscale the raw source first so all subsequent
+    # filters (chroma key, color grade, etc.) operate on the small file.
+    if res != "full":
+        cfg = RES_CONFIGS.get(res)
+        if cfg:
+            width, height, _ = cfg
+            print(f"        Downscaling source to {width}x{height} ({res})...")
+            raw_low_dir = gnommo_scratch / f"raw_{res}"
+            current_input = create_downscaled_video(
+                current_input, raw_low_dir, width, height, force
+            )
+
    # Resolve channel setting (auto-detect if needed) and sanity check
    channel = video_source.use_audio_channels
    if channel == "auto":
@@ -182,14 +182,17 @@ def _extract_marker_contexts(
    slides: dict = None,
    videos: dict = None,
    audio: dict = None,
-) -> list[tuple[str, str, bool]]:
+) -> list[tuple[str, str, bool, str]]:
    """
    Extract known markers and the text immediately following them from manuscript.

    Unknown markers are filtered out and stripped from following text.
    Note: [cite:...] markers are already stripped at parse time.

-    Returns list of (marker_id, following_text, is_borrowed) tuples for known markers only.
+    Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples.
+    anchor_type is "before" (default — place before the matched phrase) or
+    "after" (place at the end of the matched phrase — used for markers that
+    trail a narration block and have no following text of their own).
    """
    slides = slides or {}
    videos = videos or {}
@@ -227,7 +230,7 @@ def _extract_marker_contexts(
    for i, (marker_id, following_text) in enumerate(raw_contexts):
        if following_text:
            words = following_text.split()[:10]
-            contexts.append((marker_id, " ".join(words), False))
+            contexts.append((marker_id, " ".join(words), False, "before"))
        else:
            borrowed = False
            for j in range(i + 1, len(raw_contexts)):
@@ -236,11 +239,24 @@ def _extract_marker_contexts(
                    if next_marker_id in (slides or {}):
                        break
                    words = next_text.split()[:10]
-                    contexts.append((marker_id, " ".join(words), True))
+                    contexts.append((marker_id, " ".join(words), True, "before"))
                    borrowed = True
                    break
            if not borrowed:
-                contexts.append((marker_id, "", False))
+                # No own text and none borrowable (no later text, or blocked by a
+                # slide boundary) — look backward for the tail of the preceding
+                # narration block and anchor to its END instead of extrapolating.
+                preceding_text = ""
+                for k in range(i - 1, -1, -1):
+                    if raw_contexts[k][1]:
+                        preceding_text = raw_contexts[k][1]
+                        break
+                if preceding_text:
+                    words = preceding_text.split()
+                    tail = " ".join(words[-6:])
+                    contexts.append((marker_id, tail, False, "after"))
+                else:
+                    contexts.append((marker_id, "", False, "before"))

    return contexts

@@ -250,13 +266,18 @@ def _fuzzy_match_ratio(
    transcription: list[TranscribedWord],
    start_idx: int,
    window_size: int = 10,
+    pre_filler: int = 30,
+    inter_filler: int = 3,
 ) -> tuple[float, int, int]:
    """
    Calculate how many words from phrase match the transcription at start_idx.

-    Words are matched sequentially: each phrase word must appear at or after
-    the position of the previous match. This prevents false matches where
-    phrase words appear out of order or far into the window.
+    Words are matched sequentially. Two separate filler tolerances:
+    - pre_filler:   max words before the FIRST phrase word (absorbs ad-libs)
+    - inter_filler: max words between consecutive phrase words (keeps the
+                    match tight so common words don't stretch the window far
+                    into later text, which would push last_idx past subsequent
+                    markers' positions)

    Returns (ratio, first_match_offset, last_match_end_offset) where offsets
    are relative to start_idx. last_match_end_offset points past the last
@@ -265,14 +286,13 @@ def _fuzzy_match_ratio(
    if not phrase_words:
        return 0.0, 0, 0

-    words_to_check = min(len(phrase_words), window_size)
-    # +30 filler allowance: absorbs ad-libbed words spoken before or between
-    # the manuscript cue words without breaking the match ratio.
-    transcript_end = min(start_idx + words_to_check + 30, len(transcription))
-
    if start_idx >= len(transcription):
        return 0.0, 0, 0

+    words_to_check = min(len(phrase_words), window_size)
+    # Window only needs to cover pre_filler + phrase words + inter_filler slack
+    transcript_end = min(start_idx + pre_filler + words_to_check + inter_filler, len(transcription))
+
    transcript_words = [
        _normalize_token(transcription[j].word)
        for j in range(start_idx, transcript_end)
@@ -290,7 +310,14 @@ def _fuzzy_match_ratio(
            continue
        words_checked += 1

-        for j in range(t_pos, len(transcript_words)):
+        # First phrase word may be preceded by a long ad-lib; subsequent words
+        # should appear within a few positions of each other.
+        if matches == 0:
+            search_end = min(t_pos + pre_filler + 1, len(transcript_words))
+        else:
+            search_end = min(t_pos + inter_filler + 1, len(transcript_words))
+
+        for j in range(t_pos, search_end):
            t_word = transcript_words[j]
            matched = False
            if normalized == t_word:
@@ -344,7 +371,11 @@ def _find_phrase_timestamp(
            best_first_offset = first_offset
            best_end_offset = end_offset

-        if ratio >= 0.95:
+        # Sequential alignment: stop at the first position that clears the
+        # threshold. Continuing to scan the full transcript risks jumping
+        # to a higher-ratio match much later and skipping over subsequent
+        # markers' positions entirely.
+        if best_ratio >= fuzzy_threshold:
            break

    if best_ratio >= fuzzy_threshold and best_idx >= 0:
@@ -393,8 +424,8 @@ def align_markers_to_transcription(
    last_idx = 0
    last_end_time = 0.0

-    for marker_id, following_text, is_borrowed in contexts:
-        if not following_text.strip():
+    for marker_id, anchor_text, is_borrowed, anchor_type in contexts:
+        if not anchor_text.strip():
            marker_time = last_end_time + 1.0
            timings.append(
                MarkerTiming(
@@ -408,19 +439,35 @@ def align_markers_to_transcription(
            continue

        idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp(
-            following_text,
+            anchor_text,
            transcription,
            start_from=last_idx,
            fuzzy_threshold=fuzzy_threshold,
        )

        if idx >= 0:
+            if anchor_type == "after":
+                # Marker trails a narration block — place it at the END of the
+                # matched phrase (when those words finish being spoken).
+                end_idx = min(match_end_idx - 1, len(transcription) - 1)
+                marker_time = transcription[end_idx].end if transcription else 0.0
+                timings.append(
+                    MarkerTiming(
+                        marker_id=marker_id,
+                        timestamp=marker_time,
+                        context=f"(end of: {anchor_text[:40]})",
+                        confidence=confidence,
+                    )
+                )
+                last_idx = match_end_idx
+                last_end_time = marker_time
+            else:
                adjusted_time = max(0.0, timestamp - 0.5)
                timings.append(
                    MarkerTiming(
                        marker_id=marker_id,
                        timestamp=adjusted_time,
-                    context=following_text[:50],
+                        context=anchor_text[:50],
                        confidence=confidence,
                    )
                )
@@ -435,7 +482,7 @@ def align_markers_to_transcription(
                MarkerTiming(
                    marker_id=marker_id,
                    timestamp=-1.0,
-                    context=following_text[:50],
+                    context=anchor_text[:50],
                    confidence=0.0,
                )
            )
@@ -696,6 +743,16 @@ def build_render_plan(
    # Save narration end time (before outro)
    narration_end_time = total_duration

+    # Resolve any outro videos missing from videos.json via shared_assets.
+    if config.outro:
+        missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos]
+        if missing_outro_ids:
+            found = resolve_missing_videos(missing_outro_ids, project_path, config)
+            videos.update(found)
+        still_missing = [vid_id for vid_id in config.outro if vid_id not in videos]
+        for vid_id in still_missing:
+            print(f"  WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True)
+
    # Build outro events (plays after narration ends)
    outro_events = _extract_outro_events(
        config.outro,