# gnommo/gnommo/transformer.py

"""Transform stage: resolve timings and build render plan."""

import re
import string
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

from .models import (
    AudioDefinition,
    AudioEvent,
    CameraEvent,
    CameraState,
    CutoutDefinition,
    CAMERA_PRESETS,
    NarrationPause,
    OutroEvent,
    ProjectConfig,
    RenderPlan,
    SlideDefinition,
    SlideEvent,
    VideoEvent,
    VideoSource,
)
from .parser import get_video_duration, resolve_missing_videos
from .transcriber import TranscribedWord

# Audio trigger offset: play sound this many seconds before the marker.
# NOTE(review): not referenced in the part of this module visible here;
# presumably applied where audio events are built — confirm.
AUDIO_OFFSET_SECONDS = 1.0


@dataclass
class MarkerTiming:
    """Alignment result for a single manuscript marker.

    Pairs a marker id with the time it was resolved to, the text that was
    used to anchor it, and a score describing how good the match was.
    Instances are mutable: later stages adjust ``timestamp`` in place.
    """

    # Marker text exactly as it appears between the brackets (e.g. "S1").
    marker_id: str
    # Resolved time; a negative value (-1) means the marker was not found.
    timestamp: float
    # Anchor text for the marker, or a parenthesised placeholder when the
    # marker had no text of its own.
    context: str
    # Match quality, from 0 (no match) to 1 (certain).
    confidence: float


def _normalize_text(text: str) -> str:
    """Normalize text for matching (lowercase, expand contractions, remove punctuation)."""
    text = text.lower()
    # Expand common contractions before removing punctuation
    # This ensures "I'm" matches "I am" in transcripts
    contractions = {
        "i'm": "i am",
        "you're": "you are",
        "we're": "we are",
        "they're": "they are",
        "he's": "he is",
        "she's": "she is",
        "it's": "it is",
        "that's": "that is",
        "what's": "what is",
        "there's": "there is",
        "here's": "here is",
        "who's": "who is",
        "how's": "how is",
        "let's": "let us",
        "i've": "i have",
        "you've": "you have",
        "we've": "we have",
        "they've": "they have",
        "i'd": "i would",
        "you'd": "you would",
        "he'd": "he would",
        "she'd": "she would",
        "we'd": "we would",
        "they'd": "they would",
        "i'll": "i will",
        "you'll": "you will",
        "he'll": "he will",
        "she'll": "she will",
        "we'll": "we will",
        "they'll": "they will",
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "cannot",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not",
    }
    for contraction, expansion in contractions.items():
        text = re.sub(r"\b" + re.escape(contraction) + r"\b", expansion, text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def _normalize_token(word: str) -> str:
    """Normalize a single word token for comparison.

    Strips leading/trailing punctuation and lowercases. Interior characters
    (e.g. apostrophes in contractions) are preserved so "don't" stays "don't".
    Applied to both transcript tokens and phrase words at comparison time.
    """
    return word.lower().strip(string.punctuation)


def _is_known_marker(
    marker_id: str, slides: dict = None, videos: dict = None, audio: dict = None
) -> bool:
    """
    Check if a marker is a known type that should be processed.

    Known markers:
    - Slide markers (S1, S2, etc.) - must be in slides dict
    - video:xxx - video triggers
    - narration:xxx - narration triggers
    - Camera presets (Zoom1, TiltLeft, etc.)
    - Audio markers (A1, A2, etc.)

    Unknown markers are ignored (not part of the render plan).
    """
    slides = slides or {}
    videos = videos or {}
    audio = audio or {}

    # Slide markers
    if marker_id in slides:
        return True

    # Video/narration triggers (all supported prefixes)
    _VIDEO_PREFIXES = (
        "video:",
        "narration:",
        "vft:",
        "vfb:",
        "vf2t:",
        "vf2b:",
        "vst:",
        "vsb:",
        "vftp:",
        "vfbp:",
        "vf2tp:",
        "vf2bp:",
        "vstp:",
        "vsbp:",
    )
    if any(marker_id.startswith(p) for p in _VIDEO_PREFIXES):
        return True

    # Camera presets
    if marker_id in CAMERA_PRESETS:
        return True

    # Audio markers (A followed by id, e.g., Awoosh) or audio: prefix (e.g., audio:woosh)
    if marker_id.startswith("A") and len(marker_id) > 1:
        audio_id = marker_id[1:]
        if audio_id in audio or audio_id.isdigit():
            return True
    if marker_id.startswith("audio:") and audio is not None:
        audio_id = marker_id[6:]
        if audio_id in audio:
            return True

    return False


def _strip_unknown_markers(
    text: str, slides: dict = None, videos: dict = None, audio: dict = None
) -> str:
    """
    Remove all [...] markers from context text — none are pronounced aloud.

    Note: [cite:...] markers are already stripped at parse time by parse_manuscript().
    """
    return re.sub(r"\[([^\]]+)\]", "", text)


def _extract_marker_contexts(
    manuscript_text: str,
    slides: dict = None,
    videos: dict = None,
    audio: dict = None,
) -> list[tuple[str, str, bool, str]]:
    """
    Pair every known marker in the manuscript with a short anchor phrase.

    Works in two passes. The first collects, for each known marker, all text
    up to the next known marker (unknown markers in between are skipped).
    The second turns that into an anchor:

    - marker has text of its own: its first 10 words, anchor_type "before";
    - marker has no text: borrow the first 10 words of the next marker that
      has text (is_borrowed=True), unless that next marker is a slide;
    - borrowing blocked or impossible: use the last 6 words of the nearest
      earlier marker's text with anchor_type "after", meaning the marker
      belongs at the END of that phrase;
    - nothing available at all: empty anchor, anchor_type "before".

    Note: [cite:...] markers are already stripped at parse time.

    Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples
    in manuscript order.
    """
    slides = slides or {}
    videos = videos or {}
    audio = audio or {}

    def known(candidate: str) -> bool:
        return _is_known_marker(candidate, slides, videos, audio)

    def squeeze(value: str) -> str:
        return " ".join(value.split())

    # Splitting on a capturing group alternates plain text (even indices)
    # with marker ids (odd indices).
    segments = re.split(r"\[([^\]]+)\]", manuscript_text)
    total = len(segments)

    # Pass 1: (marker_id, text owned by that marker)
    marker_texts: list[tuple[str, str]] = []
    for pos in range(1, total, 2):
        marker_id = segments[pos]
        if not known(marker_id):
            continue

        collected = []
        cursor = pos + 1  # index of the text segment right after the marker
        while cursor < total:
            piece = segments[cursor].strip()
            if piece:
                collected.append(piece)
            upcoming = cursor + 1  # index of the next marker, if any
            if upcoming >= total or known(segments[upcoming]):
                break
            cursor = upcoming + 1  # hop over the unknown marker

        owned = squeeze(" ".join(collected))
        owned = squeeze(_strip_unknown_markers(owned, slides, videos, audio))
        marker_texts.append((marker_id, owned))

    # Pass 2: choose an anchor for each marker
    marker_count = len(marker_texts)
    results: list[tuple[str, str, bool, str]] = []
    for pos, (marker_id, owned) in enumerate(marker_texts):
        if owned:
            results.append((marker_id, " ".join(owned.split()[:10]), False, "before"))
            continue

        # First later marker that carries any text is the only borrowing
        # candidate; a slide marker there is a boundary we must not cross.
        donor = next(
            (
                marker_texts[k]
                for k in range(pos + 1, marker_count)
                if marker_texts[k][1]
            ),
            None,
        )
        if donor is not None and donor[0] not in slides:
            results.append((marker_id, " ".join(donor[1].split()[:10]), True, "before"))
            continue

        # Fall back to the tail of the closest preceding narration text.
        earlier = next(
            (
                marker_texts[k][1]
                for k in range(pos - 1, -1, -1)
                if marker_texts[k][1]
            ),
            "",
        )
        if earlier:
            results.append((marker_id, " ".join(earlier.split()[-6:]), False, "after"))
        else:
            results.append((marker_id, "", False, "before"))

    return results


def _fuzzy_match_ratio(
    phrase_words: list[str],
    transcription: list[TranscribedWord],
    start_idx: int,
    window_size: int = 10,
    pre_filler: int = 30,
    inter_filler: int = 3,
) -> tuple[float, int, int]:
    """
    Score how well the phrase lines up with the transcript from start_idx.

    Phrase words are consumed in order; each must be found at or after the
    position following the previous hit. Two gap limits apply:
    - pre_filler:   how many transcript words may precede the FIRST hit
                    (room for ad-libs before the cue)
    - inter_filler: how many transcript words may sit between later hits
                    (kept small so common words cannot drag the match far
                    downstream, past the next markers)

    A phrase word counts as a hit on exact equality, or, when both words
    have at least 4 characters, when one contains the other. Phrase words
    shorter than 2 characters after normalization are ignored entirely.
    At most window_size phrase words are considered.

    Returns (ratio, first_match_offset, last_match_end_offset). The ratio is
    hits / considered words; both offsets are relative to start_idx and the
    last one points one past the final hit.
    """
    if not phrase_words or start_idx >= len(transcription):
        return 0.0, 0, 0

    limit = min(len(phrase_words), window_size)
    # The slice of transcript worth looking at: leading filler + the phrase
    # itself + a little slack.
    window_stop = min(
        start_idx + pre_filler + limit + inter_filler, len(transcription)
    )
    window = [
        _normalize_token(transcription[k].word)
        for k in range(start_idx, window_stop)
    ]

    hits = 0
    considered = 0
    cursor = 0
    first_hit = 0
    past_last_hit = 0

    for raw in phrase_words[:limit]:
        target = _normalize_token(raw)
        if len(target) < 2:
            continue
        considered += 1

        # Generous reach until something has matched, tight afterwards.
        reach = pre_filler if hits == 0 else inter_filler
        stop = min(cursor + reach + 1, len(window))

        for pos in range(cursor, stop):
            heard = window[pos]
            exact = target == heard
            contained = (
                len(target) >= 4
                and len(heard) >= 4
                and (target in heard or heard in target)
            )
            if not (exact or contained):
                continue

            if hits == 0:
                first_hit = pos
            hits += 1
            past_last_hit = cursor = pos + 1
            break

    ratio = hits / considered if considered > 0 else 0.0
    return ratio, first_hit, past_last_hit


def _find_phrase_timestamp(
    phrase: str,
    transcription: list[TranscribedWord],
    start_from: int = 0,
    fuzzy_threshold: float = 0.5,
) -> tuple[int, float, float, int]:
    """
    Locate *phrase* in the transcript at or after index start_from.

    Candidate start positions are tried in order and the scan stops at the
    first one whose match ratio reaches fuzzy_threshold. It deliberately
    does not look for a better match further on, because that could leap
    over the positions of markers that follow.

    Returns (word_index, timestamp, confidence, match_end_idx), where
    word_index is the first matched transcript word, timestamp is that
    word's ``start``, confidence is the match ratio and match_end_idx is one
    past the last matched word. Returns (-1, -1.0, 0.0, -1) when the phrase
    has no usable words or no position reaches the threshold.
    """
    not_found = (-1, -1.0, 0.0, -1)

    tokens = []
    for raw in phrase.split():
        token = _normalize_token(raw)
        if token:
            tokens.append(token)
    if not tokens:
        return not_found

    top_ratio = 0.0
    top_start = -1
    top_first = 0
    top_end = 0

    for start in range(start_from, len(transcription)):
        ratio, first_offset, end_offset = _fuzzy_match_ratio(
            tokens, transcription, start
        )
        if ratio > top_ratio:
            top_ratio, top_start = ratio, start
            top_first, top_end = first_offset, end_offset

        # Good enough — take it and stop scanning.
        if top_ratio >= fuzzy_threshold:
            break

    if not (top_ratio >= fuzzy_threshold and top_start >= 0):
        return not_found

    first_word = top_start + top_first
    return (
        first_word,
        transcription[first_word].start,
        top_ratio,
        top_start + top_end,
    )


def align_markers_to_transcription(
    manuscript_text: str,
    transcription: list[TranscribedWord],
    slides: dict = None,
    videos: dict = None,
    audio: dict = None,
    fuzzy_threshold: float = 0.6,
) -> list[MarkerTiming]:
    """
    Give every known manuscript marker a timestamp from the transcript.

    Each marker is reduced to an anchor phrase (see _extract_marker_contexts)
    which is then searched for in the transcript with fuzzy matching. Markers
    are processed in manuscript order and each search begins where the
    previous non-borrowed match ended.

    Outcomes per marker:
    - no anchor text: placed 1.0 after the previous end time, context
      "(after previous)", confidence 1.0;
    - "after" anchor matched: placed at the ``end`` of the last matched word;
    - "before" anchor matched: placed 0.5 ahead of the first matched word
      (never below 0.0); a borrowed anchor does not advance the search
      position, so the marker that owns the text can still match it;
    - no match: timestamp -1.0, confidence 0.0.

    Repeated slide markers are then collapsed to their first occurrence. If
    that first occurrence was only a placeholder and a later one is not, the
    later one's timestamp, context and confidence are copied into it.

    Args:
        manuscript_text: Full manuscript with [S1], [video:xxx], etc.
        transcription:   Word-level timestamps from Whisper
        slides:          Slide definitions (to identify valid slide markers)
        videos:          Video definitions (to identify valid video markers)
        audio:           Audio definitions (to identify valid audio markers)
        fuzzy_threshold: Minimum match ratio (default 0.6 = 60% of words must match)

    Returns:
        List of MarkerTiming with timestamps and confidence (known markers only)
    """
    placeholder = "(after previous)"
    contexts = _extract_marker_contexts(manuscript_text, slides, videos, audio)

    aligned: list[MarkerTiming] = []
    search_from = 0
    prev_end_time = 0.0

    for marker_id, anchor_text, is_borrowed, anchor_type in contexts:
        # Nothing to search for: chain the marker onto the previous end time.
        if not anchor_text.strip():
            prev_end_time = prev_end_time + 1.0
            aligned.append(
                MarkerTiming(
                    marker_id=marker_id,
                    timestamp=prev_end_time,
                    context=placeholder,
                    confidence=1.0,
                )
            )
            continue

        word_idx, start_time, score, end_idx = _find_phrase_timestamp(
            anchor_text,
            transcription,
            start_from=search_from,
            fuzzy_threshold=fuzzy_threshold,
        )

        if word_idx < 0:
            aligned.append(
                MarkerTiming(
                    marker_id=marker_id,
                    timestamp=-1.0,
                    context=anchor_text[:50],
                    confidence=0.0,
                )
            )
            continue

        if anchor_type == "after":
            # Trailing marker: fire when the matched words finish.
            final_word = min(end_idx - 1, len(transcription) - 1)
            finish = transcription[final_word].end if transcription else 0.0
            aligned.append(
                MarkerTiming(
                    marker_id=marker_id,
                    timestamp=finish,
                    context=f"(end of: {anchor_text[:40]})",
                    confidence=score,
                )
            )
            search_from = end_idx
            prev_end_time = finish
            continue

        aligned.append(
            MarkerTiming(
                marker_id=marker_id,
                timestamp=max(0.0, start_time - 0.5),
                context=anchor_text[:50],
                confidence=score,
            )
        )
        if is_borrowed:
            # The text belongs to a later marker; leave the cursor alone.
            continue

        search_from = end_idx
        if 0 < search_from <= len(transcription):
            prev_end_time = transcription[search_from - 1].end
        else:
            prev_end_time = transcription[-1].end if transcription else 0.0

    # Collapse duplicate slide markers. A manuscript often has [SN] on its
    # own (a visual cue) followed later by [SN] with narration; the first
    # keeps its position in the list but inherits the real alignment.
    slide_ids = set(slides or {})
    first_pos: dict[str, int] = {}  # slide id -> index in `unique`
    unique: list[MarkerTiming] = []
    for entry in aligned:
        repeat = entry.marker_id in slide_ids and entry.marker_id in first_pos
        if not repeat:
            if entry.marker_id in slide_ids:
                first_pos[entry.marker_id] = len(unique)
            unique.append(entry)
            continue

        slot = first_pos[entry.marker_id]
        kept = unique[slot]
        if kept.context == placeholder and entry.context != placeholder:
            unique[slot] = MarkerTiming(
                marker_id=kept.marker_id,
                timestamp=entry.timestamp,
                context=entry.context,
                confidence=entry.confidence,
            )

    return unique


def build_render_plan(
    project_path: Path,
    config: ProjectConfig,
    slides: dict[str, SlideDefinition],
    videos: dict[str, VideoSource],
    videos_dir: Path,
    manuscript_text: str,
    transcription: list[TranscribedWord],
    audio: Optional[dict[str, AudioDefinition]] = None,
    audio_dir: Optional[Path] = None,
    slide_range: Optional[tuple[str, Optional[str]]] = None,
) -> tuple[RenderPlan, list[MarkerTiming]]:
    """
    Build a complete render plan from manuscript and transcription.

    This performs on-the-fly alignment of manuscript markers to transcription
    timestamps, then derives slide, video, audio, camera and outro events
    from the aligned markers and packs them into a RenderPlan.

    Args:
        project_path: Project root; used to locate ``shared_assets`` (in the
            project or its parent), the slides directory and cached files.
        config: Project configuration (main_video, cutouts, outro,
            slides_path are read here).
        slides: Slide definitions keyed by slide marker id.
        videos: Video definitions keyed by video id. NOTE: this dict is
            updated in place with any definitions resolved from shared
            assets, and the same object is stored on the returned plan.
        videos_dir: Directory holding the project's video files.
        manuscript_text: The manuscript.txt content (source of truth for markers)
        transcription: Word-level timestamps from whisper transcription
        audio: Audio definitions keyed by id; defaults to an empty dict.
        audio_dir: Directory holding audio files; defaults to project_path.
        slide_range: Optional tuple of (start_slide, end_slide) for partial
            rendering. end_slide may be None to render to the end.

    Returns:
        Tuple of (RenderPlan, list of MarkerTiming for display)

    Raises:
        ValueError: If the main video is unset or not present in ``videos``,
            or if a slide named in ``slide_range`` was not aligned.
    """
    audio = audio or {}
    audio_dir = audio_dir or project_path

    # Find the main narration video first (need skip value for timing adjustment)
    narration_video_id = config.main_video
    if isinstance(narration_video_id, list):
        narration_video_id = narration_video_id[0] if narration_video_id else None
    if not (narration_video_id and narration_video_id in videos):
        raise ValueError(
            f"Main video '{narration_video_id}' not specified or not found in videos. "
            f"Available: {list(videos.keys())}"
        )
    narration_video = videos[narration_video_id]

    # Align markers to transcription timestamps
    marker_timings = align_markers_to_transcription(
        manuscript_text, transcription, slides=slides, videos=videos, audio=audio
    )

    # Apply skip offset: if narration video has skip, subtract it from all timestamps
    # This accounts for the fact that the video will start at skip seconds, not 0
    narration_skip = narration_video.skip
    if narration_skip > 0:
        for timing in marker_timings:
            if timing.timestamp >= 0:
                timing.timestamp = max(0.0, timing.timestamp - narration_skip)

    # Build marker -> timestamp lookup (unaligned markers are left out; when
    # an id occurs more than once the last aligned occurrence wins)
    marker_times: dict[str, float] = {}
    for timing in marker_timings:
        if timing.timestamp >= 0:
            marker_times[timing.marker_id] = timing.timestamp

    # Find shared_assets directory: inside the project first, then beside it
    shared_assets_dir = None
    if (project_path / "shared_assets").exists():
        shared_assets_dir = project_path / "shared_assets"
    elif (project_path.parent / "shared_assets").exists():
        shared_assets_dir = project_path.parent / "shared_assets"

    cutout = config.cutouts[narration_video.cutout]

    # Track which files are loaded from external cache
    cached_files: set[str] = set()

    narration_videos: list[tuple[str, VideoSource, CutoutDefinition]] = []
    video_path, is_cached = _resolve_video_path(
        videos_dir, narration_video, shared_assets_dir, project_path
    )
    if is_cached:
        cached_files.add(narration_video_id)
    full_duration = get_video_duration(video_path)
    # Adjust duration for skip (content starts at skip, so effective duration is less)
    effective_duration = full_duration - narration_skip
    narration_videos.append((narration_video_id, narration_video, cutout))

    # Resolve slide range to time range
    time_offset = 0.0
    render_end_time = effective_duration
    if slide_range:
        start_slide, end_slide = slide_range
        if start_slide not in marker_times:
            raise ValueError(
                f"Start slide '{start_slide}' not found in aligned markers"
            )
        time_offset = marker_times[start_slide]
        if end_slide:
            if end_slide not in marker_times:
                raise ValueError(
                    f"End slide '{end_slide}' not found in aligned markers"
                )
            render_end_time = marker_times[end_slide]

    # The same window is handed to every extractor; None means "everything".
    time_range = (time_offset, render_end_time) if slide_range else None

    # Build events from aligned markers
    slide_events = _extract_slide_events(
        marker_timings,
        slides,
        effective_duration,
        time_range=time_range,
    )

    # Before extracting video events, resolve any referenced videos that are missing
    # from the project's videos.json by looking them up in shared_assets/videos.json.
    _VIDEO_MARKER_PREFIXES = (
        "video:", "narration:", "vft:", "vfb:", "vf2t:", "vf2b:", "vst:", "vsb:",
        "vftp:", "vfbp:", "vf2tp:", "vf2bp:", "vstp:", "vsbp:",
    )
    missing_video_ids = [
        timing.marker_id[len(prefix):]
        for timing in marker_timings
        if timing.timestamp >= 0
        for prefix in _VIDEO_MARKER_PREFIXES
        if timing.marker_id.startswith(prefix)
        and timing.marker_id[len(prefix):] not in videos
    ]
    if missing_video_ids:
        found = resolve_missing_videos(missing_video_ids, project_path, config)
        videos.update(found)

    video_events, video_warnings = _extract_video_events(
        marker_timings,
        videos,
        config.cutouts,
        slides,
        effective_duration,
        time_range=time_range,
    )
    if video_warnings:
        import sys
        print("\nWarnings:", file=sys.stderr)
        for w in video_warnings:
            print(f"  ⚠ {w}", file=sys.stderr)
        print("", file=sys.stderr)

    # Track cached files for triggered videos
    for event in video_events:
        _, is_cached = _resolve_video_path(
            videos_dir, event.video_source, shared_assets_dir, project_path
        )
        if is_cached:
            cached_files.add(event.video_id)

    audio_events = _extract_audio_events(
        marker_timings,
        audio,
        time_range=time_range,
    )

    camera_events, initial_camera_state = _extract_camera_events(
        marker_timings,
        time_range=time_range,
    )

    # Apply time offset to all events (for partial rendering)
    if time_offset > 0:
        for event in slide_events:
            event.start_time -= time_offset
            event.end_time -= time_offset
        for event in video_events:
            event.start_time -= time_offset
            event.end_time -= time_offset
        for event in audio_events:
            event.start_time = max(0, event.start_time - time_offset)
        for event in camera_events:
            event.time -= time_offset

    total_duration = render_end_time - time_offset

    # Handle narration pauses (videos that pause the narration track)
    narration_pauses: list[NarrationPause] = []
    pause_video_events = [e for e in video_events if e.video_source.pause_narration]

    if pause_video_events:
        # Sort pause events by their narration time
        pause_video_events.sort(key=lambda e: e.start_time)

        # Snapshot the narration-relative start times NOW. The loop below
        # pushes later events (including later pause events) forward, so
        # reading event.start_time inside the loop would yield a time that
        # already contains earlier pauses and the pause records would
        # double-count them.
        pause_source_times = [e.start_time for e in pause_video_events]

        cumulative_offset = 0.0
        for event, source_time in zip(pause_video_events, pause_source_times):
            pause_duration = event.video_source.pause_narration
            # Where this pause currently sits on the (already shifted)
            # output timeline; everything strictly after it moves.
            shift_after = event.start_time

            narration_pauses.append(
                NarrationPause(
                    output_time=source_time + cumulative_offset,
                    narration_time=source_time,
                    duration=pause_duration,
                    video_id=event.video_id,
                )
            )

            # Offset all events that come AFTER this pause
            for slide_event in slide_events:
                if slide_event.start_time > shift_after:
                    slide_event.start_time += pause_duration
                if slide_event.end_time > shift_after:
                    slide_event.end_time += pause_duration

            for vid_event in video_events:
                if vid_event.start_time > shift_after:
                    vid_event.start_time += pause_duration
                if vid_event.end_time > shift_after:
                    vid_event.end_time += pause_duration

            for aud_event in audio_events:
                if aud_event.start_time > shift_after:
                    aud_event.start_time += pause_duration

            for cam_event in camera_events:
                if cam_event.time > shift_after:
                    cam_event.time += pause_duration

            cumulative_offset += pause_duration

        # Update total duration
        total_duration += cumulative_offset

    # Save narration end time (before outro)
    narration_end_time = total_duration

    # Resolve any outro videos missing from videos.json via shared_assets.
    if config.outro:
        missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos]
        if missing_outro_ids:
            found = resolve_missing_videos(missing_outro_ids, project_path, config)
            videos.update(found)
        still_missing = [vid_id for vid_id in config.outro if vid_id not in videos]
        for vid_id in still_missing:
            print(f"  WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True)

    # Build outro events (plays after narration ends)
    outro_events = _extract_outro_events(
        config.outro,
        videos,
        config.cutouts,
        total_duration,
        videos_dir,
        shared_assets_dir,
        project_path,
        cached_files,
    )

    # Update total duration to include outro
    if outro_events:
        total_duration = outro_events[-1].end_time

    # Derive slides directory — lowercase path for case-sensitive filesystems (WSL/Linux).
    slides_json_path = project_path / config.slides_path.lower()
    slides_dir = slides_json_path.parent

    plan = RenderPlan(
        project_path=project_path,
        config=config,
        slide_events=slide_events,
        total_duration=total_duration,
        slides=slides,
        videos=videos,
        video_events=video_events,
        narration_videos=narration_videos,
        slides_dir=slides_dir,
        videos_dir=videos_dir,
        audio_events=audio_events,
        audio=audio,
        audio_dir=audio_dir,
        camera_events=camera_events,
        time_offset=time_offset,
        initial_camera_state=initial_camera_state,
        input_seek_time=time_offset,
        shared_assets_dir=shared_assets_dir,
        narration_pauses=narration_pauses,
        outro_events=outro_events,
        narration_end_time=narration_end_time,
        cached_files=cached_files,
    )

    return plan, marker_timings


def _resolve_video_path(
    videos_dir: Path,
    video_source: VideoSource,
    shared_assets_dir: Path = None,
    project_path: Path = None,
) -> tuple[Path, bool]:
    """Pick the file on disk to use for a video source.

    The base directory is ``shared_assets_dir`` for shared sources (when one
    was supplied) and ``videos_dir`` otherwise. Candidates are tried in this
    order: the source's ``output_file``, the same path with a ``.mov``
    suffix, and finally its ``source_file``. The first two are used only if
    they exist; the last is returned without an existence check. When
    ``project_path`` is given every candidate goes through
    ``resolve_with_cache``.

    Returns:
        Tuple of (resolved_path, is_cached) where is_cached=True if
        the file was found in the external cache.
    """
    from .cache import resolve_with_cache

    use_shared = video_source.is_shared and shared_assets_dir
    root = shared_assets_dir if use_shared else videos_dir

    def probe(candidate):
        # (path, is_cached) when the candidate exists, otherwise None.
        if project_path:
            located, from_cache = resolve_with_cache(candidate, project_path)
            return (located, from_cache) if located.exists() else None
        return (candidate, False) if candidate.exists() else None

    if video_source.output_file:
        rendered = root / video_source.output_file
        hit = probe(rendered)
        if hit is None:
            hit = probe(rendered.with_suffix(".mov"))
        if hit is not None:
            return hit

    original = root / video_source.source_file
    if not project_path:
        return original, False
    return resolve_with_cache(original, project_path)


def _extract_slide_events(
    marker_timings: list[MarkerTiming],
    slides: dict[str, SlideDefinition],
    total_duration: float,
    time_range: Optional[tuple[float, float]] = None,
) -> list[SlideEvent]:
    """Turn aligned slide markers into a list of SlideEvents.

    Slides are taken in manuscript order. The first slide always starts at
    0.0 (it is the opening state); every other slide starts at its own
    marker time. Each slide ends where the next one begins, and the last one
    ends at ``total_duration``.

    Slides that could not be aligned (timestamp < 0) are not dropped: each
    run of consecutive unaligned slides is spaced evenly between the aligned
    slide before it (or 0.0) and the aligned slide after it (or
    ``total_duration``).

    When ``time_range`` is given, slides entirely outside it are skipped and
    the remaining ones are clipped to it.
    """
    range_start, range_end = time_range if time_range else (0.0, float("inf"))

    # Slide markers only, aligned or not, in manuscript order.
    ordered = [
        (timing.timestamp, timing.marker_id)
        for timing in marker_timings
        if timing.marker_id in slides
    ]
    if not ordered:
        return []

    times = [stamp for stamp, _ in ordered]
    slide_ids = [slide_id for _, slide_id in ordered]
    count = len(times)

    # Fill in the gaps left by unaligned slides.
    pos = 0
    while pos < count:
        if not (times[pos] < 0):
            pos += 1
            continue

        gap_start = pos
        while pos < count and times[pos] < 0:
            pos += 1
        gap_end = pos  # exclusive

        left = times[gap_start - 1] if gap_start > 0 else 0.0
        right = times[gap_end] if gap_end < count else total_duration

        gap_len = gap_end - gap_start
        for step in range(gap_len):
            frac = (step + 1) / (gap_len + 1)
            times[gap_start + step] = left + (right - left) * frac

    events: list[SlideEvent] = []
    for index, slide_id in enumerate(slide_ids):
        begins = 0.0 if index == 0 else times[index]
        ends = times[index + 1] if index + 1 < count else total_duration

        # Entirely outside the requested window.
        if ends <= range_start or begins >= range_end:
            continue

        events.append(
            SlideEvent(
                slide_id=slide_id,
                start_time=max(begins, range_start),
                end_time=min(ends, range_end),
                slide_def=slides[slide_id],
            )
        )

    return events


def _extract_video_events(
    marker_timings: list[MarkerTiming],
    videos: dict[str, VideoSource],
    cutouts: dict[str, CutoutDefinition],
    slides: dict[str, SlideDefinition],
    total_duration: float,
    time_range: Optional[tuple[float, float]] = None,
) -> tuple[list[VideoEvent], list[str]]:
    """
    Extract video events from aligned marker timings.

    - Shorthand markers ([vft:xxx], [vfb:xxx], [vf2t:xxx], [vf2b:xxx],
      [vst:xxx], [vsb:xxx] and their "...p" pause-narration variants) imply
      a cutout and a layer that win over the values in videos.json; by
      default they end at the next SLIDE marker
    - [video:xxx] events end at the next SLIDE marker by default
    - [narration:xxx] events run until end by default
    - A video's ``end_on`` ("take", "end", "next_slide") overrides the
      marker's default end rule

    Markers without an aligned timestamp (timestamp < 0) are ignored. When
    ``time_range`` is given, only events that START inside [start, end) are
    kept, and their end time is clipped to the range end.

    Returns (events, warnings). Invalid markers are skipped and reported in warnings.
    """
    warnings: list[str] = []
    range_start, range_end = time_range if time_range else (0.0, float("inf"))

    # Aligned slide timestamps, ascending: used to find "the next slide".
    slide_times: list[float] = sorted(
        t.timestamp
        for t in marker_timings
        if t.marker_id in slides and t.timestamp >= 0
    )

    # Mapping from shorthand marker prefix → (implied_cutout_name, implied_layer[, flag])
    # These are the defaults; videos.json values act as a base but the marker wins.
    # The "...p" variants carry a third "pause_narration" element. It is not
    # consumed by this function, which only needs the cutout and the layer.
    _SHORTHAND: dict[str, tuple[str, ...]] = {
        "vft:": ("fullscreen", "above"),
        "vfb:": ("fullscreen", "below"),
        "vf2t:": ("fullscreen2", "above"),
        "vf2b:": ("fullscreen2", "below"),
        "vst:": ("square", "above"),
        "vsb:": ("square", "below"),
        "vftp:": ("fullscreen", "above", "pause_narration"),
        "vfbp:": ("fullscreen", "below", "pause_narration"),
        "vf2tp:": ("fullscreen2", "above", "pause_narration"),
        "vf2bp:": ("fullscreen2", "below", "pause_narration"),
        "vstp:": ("square", "above", "pause_narration"),
        "vsbp:": ("square", "below", "pause_narration"),
    }

    # Markers whose cutout/layer come from videos.json: prefix → event type.
    _LEGACY: dict[str, str] = {
        "video:": "video",
        "narration:": "narration",
    }

    # Collect video markers: (time, video_id, event_type, cutout_name_override, layer_override)
    # event_type is "video" (ends at next slide) or "narration" (runs to end)
    video_markers: list[tuple[float, str, str, Optional[str], Optional[str]]] = []

    for timing in marker_timings:
        if timing.timestamp < 0:
            continue

        mid = timing.marker_id

        # --- shorthand markers: vft/vfb/vf2t/vf2b/vst/vsb (+ "p" variants) ---
        shorthand_match = next((p for p in _SHORTHAND if mid.startswith(p)), None)
        if shorthand_match:
            video_id = mid[len(shorthand_match) :]
            if video_id not in videos:
                warnings.append(
                    f"[{mid}] references unknown video '{video_id}' — skipped. "
                    f"Add it to videos.json or remove the marker."
                )
                continue
            # Entries have two or three elements; only the first two matter
            # here. Unpacking the whole tuple into two names would raise
            # ValueError for every pause-narration variant.
            implied_cutout, implied_layer = _SHORTHAND[shorthand_match][:2]
            if implied_cutout not in cutouts:
                warnings.append(
                    f"[{mid}] requires cutout '{implied_cutout}' which is not defined in project config — skipped. "
                    f"Available cutouts: {list(cutouts.keys())}"
                )
                continue
            video_markers.append(
                (timing.timestamp, video_id, "video", implied_cutout, implied_layer)
            )
            continue

        # --- legacy [video:xxx] and [narration:xxx] ---
        legacy_prefix = next((p for p in _LEGACY if mid.startswith(p)), None)
        if legacy_prefix is None:
            # Not a video marker at all (slide, audio, camera, ...).
            continue

        video_id = mid[len(legacy_prefix) :]
        label = f"[{legacy_prefix}{video_id}]"
        if video_id not in videos:
            warnings.append(
                f"{label} references unknown video '{video_id}' — skipped. "
                f"Add it to videos.json or remove the marker."
            )
            continue
        video_source = videos[video_id]
        if not video_source.cutout:
            warnings.append(f"{label} has no 'cutout' set in videos.json — skipped.")
            continue
        if video_source.cutout not in cutouts:
            warnings.append(
                f"{label} cutout '{video_source.cutout}' is not defined in project config — skipped. "
                f"Available: {list(cutouts.keys())}"
            )
            continue
        video_markers.append(
            (timing.timestamp, video_id, _LEGACY[legacy_prefix], None, None)
        )

    events: list[VideoEvent] = []
    for (
        start_time,
        video_id,
        marker_type,
        cutout_override,
        layer_override,
    ) in video_markers:
        video_source = videos[video_id]

        # Resolve cutout: marker override > videos.json cutout
        # (validation above already ensured the chosen cutout exists)
        cutout_name = cutout_override or video_source.cutout
        cutout = cutouts[cutout_name]

        # Resolve layer: marker override > videos.json layer
        layer = layer_override if layer_override is not None else video_source.layer

        end_on = video_source.end_on
        if end_on == "take" and video_source.take is not None:
            end_time = start_time + video_source.take
        elif end_on == "end":
            end_time = total_duration
        elif end_on == "next_slide" or (end_on is None and marker_type == "video"):
            # End at the first slide marker strictly after the start, or at
            # the end of the video when no later slide exists.
            end_time = next(
                (slide_time for slide_time in slide_times if slide_time > start_time),
                total_duration,
            )
        else:
            # end_on is None and marker_type == "narration" (or "take" with
            # no take value, or an unrecognised end_on): runs to end
            end_time = total_duration

        # Filter by time range: events starting outside the range are dropped
        if start_time < range_start or start_time >= range_end:
            continue
        end_time = min(end_time, range_end)

        events.append(
            VideoEvent(
                video_id=video_id,
                start_time=start_time,
                end_time=end_time,
                video_source=video_source,
                cutout=cutout,
                cutout_name=cutout_name,
                layer=layer,
            )
        )

    return events, warnings


def _extract_audio_events(
    marker_timings: list[MarkerTiming],
    audio: dict[str, AudioDefinition],
    time_range: Optional[tuple[float, float]] = None,
) -> list[AudioEvent]:
    """
    Build the audio cues triggered by aligned markers.

    A marker names an audio clip either as ``A<id>`` or as ``audio:<id>``.
    Markers that are unaligned (timestamp < 0), that name a clip missing from
    ``audio``, or whose timestamp lies outside ``time_range`` ([start, end))
    produce nothing. Each remaining marker yields an AudioEvent that starts
    AUDIO_OFFSET_SECONDS before the marker, clamped so it never starts
    before 0.
    """
    window_start, window_end = time_range if time_range else (0.0, float("inf"))

    def _audio_id_for(marker: str) -> Optional[str]:
        # "A<id>" needs at least one character after the "A"; a bare "A"
        # falls through to the "audio:" check like any other marker.
        if marker.startswith("A") and len(marker) > 1:
            return marker[1:]
        if marker.startswith("audio:"):
            return marker[6:]
        return None

    cues: list[AudioEvent] = []
    for entry in marker_timings:
        stamp = entry.timestamp
        if stamp < 0:
            continue

        clip_id = _audio_id_for(entry.marker_id)
        if clip_id is None or clip_id not in audio:
            continue

        # The range test uses the marker time, not the offset start time.
        if stamp < window_start or stamp >= window_end:
            continue

        cues.append(
            AudioEvent(
                audio_id=clip_id,
                start_time=max(0, stamp - AUDIO_OFFSET_SECONDS),
                audio_def=audio[clip_id],
            )
        )

    return cues


def _extract_camera_events(
    marker_timings: list[MarkerTiming],
    time_range: Optional[tuple[float, float]] = None,
) -> tuple[list[CameraEvent], CameraState]:
    """
    Extract camera events from aligned marker timings.

    Camera state is cumulative: each marker found in CAMERA_PRESETS updates
    the running state, which later markers build on.

    - "Reset" / "NoTilt" return to the default CameraState()
    - "Zoom*" markers replace only the zoom
    - "Tilt*" markers replace only the rotation
    - "Pan*" markers replace only pan_x / pan_y
    - any other preset replaces the whole state

    Unaligned markers (timestamp < 0) and markers that are not camera presets
    are ignored. Events are emitted only for markers inside ``time_range``
    ([start, end)), but markers before the range still contribute to the
    running state.

    Returns (events, initial_state), where initial_state is the accumulated
    camera state in effect when the range starts.
    """
    range_start, range_end = time_range if time_range else (0.0, float("inf"))

    events: list[CameraEvent] = []
    current_state = CameraState()
    initial_state = CameraState()
    found_range_start = False

    for timing in marker_timings:
        if timing.timestamp < 0:
            continue

        marker_id = timing.marker_id
        if marker_id not in CAMERA_PRESETS:
            continue

        preset = CAMERA_PRESETS[marker_id]

        # Determine new state based on marker type
        if marker_id in ("Reset", "NoTilt"):
            new_state = CameraState()
        elif marker_id.startswith("Zoom"):
            new_state = CameraState(
                zoom=preset.zoom,
                rotation=current_state.rotation,
                pan_x=current_state.pan_x,
                pan_y=current_state.pan_y,
                focal_x=current_state.focal_x,
                focal_y=current_state.focal_y,
            )
        elif marker_id.startswith("Tilt"):
            new_state = CameraState(
                zoom=current_state.zoom,
                rotation=preset.rotation,
                pan_x=current_state.pan_x,
                pan_y=current_state.pan_y,
                focal_x=current_state.focal_x,
                focal_y=current_state.focal_y,
            )
        elif marker_id.startswith("Pan"):
            new_state = CameraState(
                zoom=current_state.zoom,
                rotation=current_state.rotation,
                pan_x=preset.pan_x,
                pan_y=preset.pan_y,
                focal_x=current_state.focal_x,
                focal_y=current_state.focal_y,
            )
        else:
            new_state = preset

        # Capture state at range start: the state accumulated BEFORE the first
        # marker at or after range_start is what the range opens with.
        if not found_range_start and timing.timestamp >= range_start:
            initial_state = current_state
            found_range_start = True

        # Only emit events within range
        if range_start <= timing.timestamp < range_end:
            events.append(
                CameraEvent(
                    time=timing.timestamp,
                    target_state=new_state,
                    duration=0.2,
                    easing="ease-out",
                )
            )

        current_state = new_state

    if not found_range_start:
        # Every camera marker (if any) precedes the range, so the range opens
        # with whatever those markers accumulated. With no camera markers at
        # all this is still the default state. Falling back to a fresh
        # CameraState() here would silently drop an earlier zoom/tilt/pan.
        initial_state = current_state

    return events, initial_state


def _extract_outro_events(
    outro_video_ids: list[str],
    videos: dict[str, VideoSource],
    cutouts: dict[str, CutoutDefinition],
    narration_end_time: float,
    videos_dir: Path,
    shared_assets_dir: Path = None,
    project_path: Path = None,
    cached_files: set = None,
) -> list[OutroEvent]:
    """
    Lay out the outro videos back to back after the narration.

    The first outro starts at ``narration_end_time`` and each following one
    starts where the previous ended. A video plays for its ``take`` when one
    is set, otherwise for the duration probed from its file (10.0 seconds if
    the file does not exist). IDs missing from ``videos`` are skipped
    silently. When ``cached_files`` is provided, the IDs of videos whose
    path was resolved as cached are added to it.

    A video's cutout is looked up in ``cutouts``; when it has none, or the
    name is unknown, the event carries ``cutout=None`` (fullscreen).
    """
    timeline: list[OutroEvent] = []
    cursor = narration_end_time

    for outro_id in (vid for vid in outro_video_ids if vid in videos):
        source = videos[outro_id]

        resolved_path, from_cache = _resolve_video_path(
            videos_dir, source, shared_assets_dir, project_path
        )
        if cached_files is not None and from_cache:
            cached_files.add(outro_id)

        # The file is probed even when a take is set; a missing file falls
        # back to a fixed 10-second length.
        if resolved_path.exists():
            source_length = get_video_duration(resolved_path)
        else:
            source_length = 10.0

        take = source.take
        # Clamp so a negative take/duration can never move the timeline backwards.
        play_length = max(0, source_length if take is None else take)

        # Resolve cutout (None = fullscreen)
        cutout_name = source.cutout
        frame = (
            cutouts[cutout_name]
            if cutout_name and cutout_name in cutouts
            else None
        )

        finish = cursor + play_length
        timeline.append(
            OutroEvent(
                video_id=outro_id,
                start_time=cursor,
                end_time=finish,
                video_source=source,
                cutout=frame,
            )
        )
        cursor = finish

    return timeline