From fdd275ac0ed9b29f862acce02c1923794c03aa6a Mon Sep 17 00:00:00 2001 From: jenstandstad Date: Fri, 6 Feb 2026 17:56:05 +0100 Subject: [PATCH] Adding changes version 1 --- docs/partial-rendering-spec.md | 317 ++++ docs/virtual-camera-effects.md | 265 +++ example/citations.json | 10 + example/manuscript.txt | 20 +- example/media/slides/example/slides.json | 26 + .../intermediate/segments/concat_list.txt | 2 + .../media/videos/talking_head.transcript.json | 497 ++++++ example/media/videos/videos.json | 39 + example/project.json | 38 +- example/slides.json | 10 - example/transcript.csv | 8 - example/videos.json | 6 - gnommo.sh | 145 +- gnommo/aligner.py | 199 --- gnommo/cli.py | 1046 ++++++++++-- gnommo/description.py | 359 ++++ gnommo/errors.py | 18 +- gnommo/extract_keynote_notes.js | 74 + gnommo/extract_presenter_notes.py | 94 + gnommo/models.py | 402 ++++- gnommo/parser.py | 274 ++- gnommo/preprocessor.py | 1509 ++++++++++++++++- gnommo/renderer.py | 911 +++++++++- gnommo/transcriber.py | 22 +- gnommo/transformer.py | 986 ++++++++++- gnommo/validator.py | 195 ++- main.py | 6 + notes.json | 0 requirements.txt | 2 + tasks.md | 476 ++++++ 30 files changed, 7068 insertions(+), 888 deletions(-) create mode 100644 docs/partial-rendering-spec.md create mode 100644 docs/virtual-camera-effects.md create mode 100644 example/citations.json create mode 100644 example/media/slides/example/slides.json create mode 100644 example/media/videos/intermediate/segments/concat_list.txt create mode 100644 example/media/videos/talking_head.transcript.json create mode 100644 example/media/videos/videos.json delete mode 100644 example/slides.json delete mode 100644 example/transcript.csv delete mode 100644 example/videos.json delete mode 100644 gnommo/aligner.py create mode 100644 gnommo/description.py create mode 100644 gnommo/extract_keynote_notes.js create mode 100644 gnommo/extract_presenter_notes.py create mode 100644 main.py create mode 100644 notes.json create mode 100644 
requirements.txt create mode 100644 tasks.md diff --git a/docs/partial-rendering-spec.md b/docs/partial-rendering-spec.md new file mode 100644 index 0000000..2b1fc25 --- /dev/null +++ b/docs/partial-rendering-spec.md @@ -0,0 +1,317 @@ +# Partial Rendering Specification + +## Overview + +Enable rendering of specific sections of a video (e.g., slides 1-10, then 10-20) instead of the full video. This is useful for: +- Faster iteration during development +- Re-rendering specific sections after fixes +- Parallel rendering of segments that can be concatenated later + +## Scope (v1) + +**In scope:** +- Camera state tracking (cumulative state must be computed from t=0) +- Time offset adjustment for all events +- Slide range filtering +- Input video seeking + +**Out of scope (v1):** +- Audio events crossing range boundaries +- Triggered video duration edge cases +- Events are assumed to begin at their marker timestamp and never "carry over" + +## Current Architecture Analysis + +### 1. Camera State Management + +**Current behavior** (`transformer.py:250-332`): +- Camera state is **cumulative** across the transcript +- `_extract_camera_events()` walks through ALL markers sequentially +- Each marker type (Zoom/Tilt/Pan) only modifies its property while preserving others +- Example: `[Zoom2]` then `[TiltLeft]` = both zoom AND tilt active + +**Problem for partial rendering**: +If we start rendering at slide 10, we need the camera state AS IT WOULD BE after processing slides 1-9. + +**Solution**: +Separate "state computation" from "event generation": +1. Always walk through ALL transcript markers to compute cumulative state +2. Track the "initial state" at the start of the render range +3. Only emit CameraEvents for markers WITHIN the render range +4. First event in partial render must transition FROM the computed initial state + +### 2. 
Time Signature Adjustment + +**Current behavior**: +All timing uses absolute timestamps from `transcript.csv`: +- `SlideEvent.start_time/end_time` +- `VideoEvent.start_time/end_time` +- `AudioEvent.start_time` +- `CameraEvent.time` +- FFmpeg expressions: `enable=between(t, start, end)` +- Camera animation: `if(between(t, 1.000, 1.200), ...)` + +**Problem for partial rendering**: +If slide 10 starts at t=10.0s and we render from there, FFmpeg expects t=0 at the start of output. + +**Solution**: +Apply a `time_offset` to all events after extraction: +``` +new_time = original_time - time_offset +``` +Where `time_offset` = start time of first slide/event in range. + +### 3. Input Video Seeking + +**Current behavior**: +- Always-visible videos (talking head) start from the beginning +- FFmpeg processes entire input duration + +**Problem for partial rendering**: +Need to seek into source videos to the correct position. + +**Solution**: +Add `-ss SECONDS` (SECONDS = `input_seek_time`) before input files for always-visible videos: +``` +ffmpeg -ss 10.0 -i talking_head.mov ... +``` + +--- + +## Proposed API + +### Command Line Interface + +```bash +# Render full video (current behavior) +gnommo render example/project.json output.mp4 + +# Render specific slide range +gnommo render example/project.json output.mp4 --slides S1:S10 +gnommo render example/project.json output.mp4 --slides S10:S20 +gnommo render example/project.json output.mp4 --slides S5: # S5 to end + +# Render specific time range (alternative) +gnommo render example/project.json output.mp4 --time 0:60 +gnommo render example/project.json output.mp4 --time 60:120 +``` + +### Internal API + +New parameters for `build_render_plan()`: +```python +def build_render_plan( + ... + slide_range: Optional[tuple[str, Optional[str]]] = None, # (start_slide, end_slide) + # OR + time_range: Optional[tuple[float, Optional[float]]] = None, # (start_time, end_time) +) -> RenderPlan: +``` + +New fields on `RenderPlan`: +```python +@dataclass +class RenderPlan: + ... 
+ time_offset: float = 0.0 # Offset to subtract from all timestamps + initial_camera_state: CameraState = field(default_factory=CameraState) # State at render start + input_seek_time: float = 0.0 # Seek position for input videos +``` + +--- + +## Implementation Details + +### Phase 1: Compute Full State, Filter Events + +Modify `_extract_camera_events()` to accept a time range: + +```python +def _extract_camera_events( + transcript: list[TimedWord], + time_range: Optional[tuple[float, float]] = None, # (start, end) +) -> tuple[list[CameraEvent], CameraState]: + """ + Returns: + - List of CameraEvents within time_range + - Initial CameraState at start of time_range + """ + events: list[CameraEvent] = [] + current_state = CameraState() + initial_state = CameraState() + start_time, end_time = time_range or (0.0, float('inf')) + + found_start = False + + for timed_word in transcript: + if not timed_word.is_marker: + continue + + marker_id = timed_word.marker_id + if not marker_id or marker_id not in CAMERA_PRESETS: + continue + + # Always update current_state (full walk) + preset = CAMERA_PRESETS[marker_id] + new_state = _apply_preset(current_state, marker_id, preset) + + # Capture state just before we enter the render range + if not found_start and timed_word.time >= start_time: + initial_state = current_state # State BEFORE this marker + found_start = True + + # Only emit events within range + if start_time <= timed_word.time < end_time: + events.append(CameraEvent( + time=timed_word.time, + target_state=new_state, + duration=0.2, + easing="ease-out", + )) + + current_state = new_state + + return events, (initial_state if found_start else current_state) # no camera marker at/after start: carry in the state accumulated before the range +``` + +### Phase 2: Apply Time Offset + +After extracting events, apply offset to all timestamps: + +```python +def _apply_time_offset(plan: RenderPlan, offset: float) -> RenderPlan: + """Shift all timestamps by offset (subtract offset from all times).""" + + # Adjust slide events + for event in plan.slide_events: + event.start_time -= offset + 
event.end_time -= offset + + # Adjust video events + for event in plan.video_events: + event.start_time -= offset + event.end_time -= offset + + # Adjust audio events + for event in plan.audio_events: + event.start_time = max(0, event.start_time - offset) + + # Adjust camera events + for event in plan.camera_events: + event.time -= offset + + # Adjust total duration + plan.total_duration -= offset + plan.time_offset = offset + plan.input_seek_time = offset + + return plan +``` + +### Phase 3: FFmpeg Seeking + +Modify `build_ffmpeg_command()` to add seeking: + +```python +def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: + cmd = ["ffmpeg", "-y"] + + # Add seek for always-visible videos + for video_id, video_source, cutout in plan.narration_videos: + video_path = _resolve_video_path(videos_dir, video_source) + if plan.input_seek_time > 0: + cmd.extend(["-ss", str(plan.input_seek_time)]) # Seek BEFORE -i + cmd.extend(["-i", str(video_path)]) + ... +``` + +### Phase 4: Initial Camera State Handling + +If `initial_camera_state` is not default, inject a "virtual" camera event at t=0: + +```python +def build_camera_transform( + camera_events: list[CameraEvent], + initial_state: CameraState, # NEW PARAMETER + ... +) -> str: + # If initial state differs from default, prepend a virtual event + if not initial_state.is_default(): + initial_event = CameraEvent( + time=0.0, + target_state=initial_state, + duration=0.0, # Instant - no transition + easing="linear", + ) + camera_events = [initial_event] + camera_events + ... +``` + +--- + +## FFmpeg Optimization + +**Only emit filters for events within range.** + +When rendering a partial range, the `RenderPlan` should only contain events within that range. 
This means: +- Fewer inputs added to the FFmpeg command (only slides/videos/audio actually used) +- Fewer overlay filters in filter_complex +- Fewer `between(t, start, end)` enable expressions to evaluate per frame + +Example: Full video has 50 slides, rendering S40:S50 only: +- **Before**: 50 slide inputs, 50 overlay filters +- **After**: 10 slide inputs, 10 overlay filters + +This is achieved naturally by filtering events in `build_render_plan()` before constructing the plan - the renderer already only processes events present in the plan. + +--- + +## Edge Cases (v1 Simplified) + +### 1. Camera state from before range +If rendering S5:S10 but there's a camera event at the S4 marker: +- Camera state from S4 must be captured as `initial_camera_state` +- Rendered output starts with that state already applied at t=0 + +### 2. Events filter by marker position +All events (slides, videos, audio) are filtered by whether their START marker falls within the range. +- Events beginning outside range are excluded +- No "carry over" or boundary-crossing logic needed + +--- + +## Testing Strategy + +### Unit Tests +1. Camera state computation maintains state across full transcript +2. Time offset correctly shifts all event types +3. Initial camera state correctly captured at boundary + +### Integration Tests +1. Render slides 1-5, then 5-10, concatenate, compare to full render +2. Camera state continuity across segment boundaries +3. Audio alignment after seeking + +### Manual Verification +1. Visual inspection of camera state at segment boundaries +2. 
Audio sync verification + +--- + +## Future Enhancements + +### Parallel Rendering Pipeline +```bash +# Render in parallel, then concatenate +gnommo render proj.json seg1.mp4 --slides S1:S10 & +gnommo render proj.json seg2.mp4 --slides S10:S20 & +gnommo render proj.json seg3.mp4 --slides S20: & +wait +ffmpeg -f concat -i segments.txt -c copy final.mp4 +``` + +### Smart Re-rendering +Track which slides changed and only re-render affected segments. + +### Preview Mode +Quick low-quality render of specific section for review. diff --git a/docs/virtual-camera-effects.md b/docs/virtual-camera-effects.md new file mode 100644 index 0000000..10d9980 --- /dev/null +++ b/docs/virtual-camera-effects.md @@ -0,0 +1,265 @@ +# Virtual Camera Effects + +Ideas for "stuff happening" to keep viewers engaged in edutainment videos. +These effects are triggered by markers in the manuscript, just like slides. + +## Zoom Effects + +| Marker | Description | +|--------|-------------| +| `[Zoom1]` | Zoom to 110% - subtle emphasis | +| `[Zoom2]` | Zoom to 125% - moderate emphasis | +| `[Zoom3]` | Zoom to 150% - strong emphasis | +| `[Zoom0]` | Return to 100% (default) | +| `[ZoomPunch]` | Quick zoom in + out (single beat emphasis) | + +**Use case:** Rapid `[Zoom1][Zoom2][Zoom3]` for comedic/dramatic triple emphasis. + +## Tilt/Rotation Effects + +| Marker | Description | +|--------|-------------| +| `[TiltLeft]` | Rotate -15 degrees | +| `[TiltRight]` | Rotate +15 degrees | +| `[NoTilt]` | Return to 0 degrees | +| `[TiltShake]` | Quick left-right shake (confusion/emphasis) | + +**Use case:** Tilt when saying something "off" or wrong, return to flat for correction. 
+ +## Pan/Position Effects + +| Marker | Description | +|--------|-------------| +| `[PanLeft]` | Shift frame left (subject moves right) | +| `[PanRight]` | Shift frame right (subject moves left) | +| `[PanUp]` | Shift frame up | +| `[PanDown]` | Shift frame down | +| `[PanCenter]` | Return to center | + +**Use case:** Pan to make room for a slide appearing on one side. + +## Shake/Movement Effects + +| Marker | Description | +|--------|-------------| +| `[Shake]` | Brief screen shake (impact, surprise) | +| `[ShakeHard]` | Intense shake (explosion, error) | +| `[Wobble]` | Gentle continuous wobble | +| `[NoWobble]` | Stop wobble | + +**Use case:** Shake on "WRONG!" or when something crashes/fails. + +## Speed/Rhythm Effects + +| Marker | Description | +|--------|-------------| +| `[Beat]` | Single visual pulse (scale bump) | +| `[BeatStart]` | Start pulsing to rhythm | +| `[BeatStop]` | Stop pulsing | + +**Use case:** Rhythmic emphasis during lists or key points. + +## Transition Effects + +| Marker | Description | +|--------|-------------| +| `[Flash]` | Quick white flash | +| `[Blackout]` | Brief black frame | +| `[Glitch]` | Digital glitch effect | + +**Use case:** Transition between topics or for "record scratch" moments. + +## Picture-in-Picture Variations + +| Marker | Description | +|--------|-------------| +| `[PipGrow]` | Enlarge talking head cutout | +| `[PipShrink]` | Shrink talking head cutout | +| `[PipHide]` | Temporarily hide talking head | +| `[PipShow]` | Restore talking head | +| `[PipMove:corner]` | Move pip to different corner | + +**Use case:** Shrink self when showing important diagram, grow when making personal point. 
+ +## Combination Presets + +| Marker | Description | +|--------|-------------| +| `[Emphasis]` | Zoom2 + slight tilt (general emphasis) | +| `[Surprise]` | Quick zoom + shake | +| `[Sarcasm]` | Slow zoom + tilt | +| `[Reset]` | Return all effects to default | + +--- + +## Architecture: The Camera Abstraction + +### The Core Insight + +All visual elements (slides, cutouts, talking head, background) exist in a **scene**. +The **camera** views the scene. When the camera zooms, tilts, or pans - everything +moves together, just like a real camera filming a physical set. + +``` +┌─────────────────────────────────────────────────────────┐ +│ SCENE │ +│ ┌─────────────────────────────────────────────────┐ │ +│ │ Background Layer │ │ +│ │ ┌─────────────┐ │ │ +│ │ │ Talking Head│ ┌──────────────────┐ │ │ +│ │ │ (cutout) │ │ Slide │ │ │ +│ │ └─────────────┘ │ (from .png) │ │ │ +│ │ └──────────────────┘ │ │ +│ └─────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ + │ + ▼ + ┌─────────────┐ + │ CAMERA │ + │ zoom: 1.25 │ + │ tilt: -15° │ + │ pan: 0, 0 │ + └─────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Final Output │ + │ (1920x1080) │ + └─────────────────┘ +``` + +### Why This Matters + +**Keynote slides are designed for a specific frame.** If you create a slide with +an arrow pointing at where the talking head cutout will be, that spatial +relationship must be preserved when the camera zooms or tilts. + +If we zoomed only the background and not the slides, the arrow would point to +the wrong place. The camera abstraction ensures everything transforms together. 
+ +### Camera Properties + +```python +@dataclass +class CameraState: + zoom: float = 1.0 # 1.0 = 100%, 1.25 = 125% + rotation: float = 0.0 # degrees, positive = clockwise + pan_x: float = 0.0 # -1.0 to 1.0, percentage of frame + pan_y: float = 0.0 # -1.0 to 1.0, percentage of frame + +@dataclass +class CameraKeyframe: + time: float # timestamp in seconds + state: CameraState + easing: str = "linear" # linear, ease-in, ease-out, ease-in-out +``` + +### Rendering Pipeline (Updated) + +``` +Current Pipeline: + Parse → Validate → Transform → Render + │ + ▼ + build_filter_complex() + │ + [bg] → overlays → [vout] + +New Pipeline: + Parse → Validate → Transform → Render + │ + Extract camera + keyframes from + markers + │ + ▼ + build_filter_complex() + │ + [bg] → overlays → [scene] + │ + apply_camera_transform() + │ + [scene] → zoom/rotate/pan → [vout] +``` + +### FFmpeg Implementation + +The camera transform is a **final filter stage** applied to the composed scene: + +``` +# Compose scene (existing code) +[0:v]scale=1920:1080[bg]; +[bg][slide1]overlay=...[s1]; +[s1][talkinghead]overlay=...[scene]; + +# Camera transform (new) +[scene]scale=iw*{zoom}:ih*{zoom}, + rotate={rotation}*PI/180:fillcolor=black, + crop=1920:1080:(iw-1920)/2:(ih-1080)/2[vout] +``` + +For smooth animated zoom (using expressions): +``` +[scene]zoompan=z='if(between(t,5,8), 1+0.25*(t-5)/3, 1)': + x='iw/2-(iw/zoom/2)': + y='ih/2-(ih/zoom/2)': + d=1:s=1920x1080:fps=30[vout] +``` + +### Camera Events in Timeline + +New model for camera changes: + +```python +@dataclass +class CameraEvent: + time: float + target_state: CameraState + duration: float = 0.0 # 0 = instant snap + easing: str = "ease-out" +``` + +Markers map to camera events: +- `[Zoom2]` → `CameraEvent(time=t, target_state=CameraState(zoom=1.25), duration=0.2)` +- `[TiltLeft]` → `CameraEvent(time=t, target_state=CameraState(rotation=-15), duration=0.3)` +- `[Reset]` → `CameraEvent(time=t, target_state=CameraState(), duration=0.2)` + +### 
Considerations + +1. **Overscan**: When zoomed in, we're cropping. The scene must be rendered + larger than output (e.g., 2x) to have room for zoom without quality loss. + +2. **Rotation center**: Rotate around frame center, not corner. + +3. **State accumulation**: `[Zoom2]` then `[TiltLeft]` means zoom AND tilt + are both active. `[Reset]` clears all. + +4. **Interaction with cutouts**: Cutout positions are in scene-space, so they + transform naturally with the camera. No special handling needed. + +5. **Slides stay synced**: Keynote exports are positioned for the base frame. + Camera zoom/tilt transforms them identically to everything else. + +--- + +## Implementation Plan + +### Phase 1: Camera Data Model ✓ +- [x] Add `CameraState` and `CameraEvent` to models.py +- [x] Add camera effect markers to transformer.py +- [x] Generate camera keyframes from markers + +### Phase 2: Render Pipeline ✓ +- [x] Modify renderer to compose to `[scene]` instead of `[vout]` +- [x] Add camera transform stage after composition +- [ ] Handle overscan (render larger, crop to output) - deferred, upsampling OK for now + +### Phase 3: Smooth Animation (partial) +- [x] Support animated transitions between keyframes (linear interpolation) +- [ ] Implement easing functions as FFmpeg expressions (ease-in, ease-out) +- [ ] Test with rapid zoom sequences + +### Phase 4: Effect Presets ✓ +- [x] Define presets (Zoom0/1/2/3, TiltLeft/Right/NoTilt, Pan*, Reset) +- [x] Presets defined in `CAMERA_PRESETS` dict in models.py +- [ ] Support custom parameterized markers `[Zoom:1.35]` - future enhancement diff --git a/example/citations.json b/example/citations.json new file mode 100644 index 0000000..058dd1b --- /dev/null +++ b/example/citations.json @@ -0,0 +1,10 @@ +[ + { + "reference": "Gnommo Documentation - https://github.com/example/gnommo", + "context": "" + }, + { + "reference": "FFmpeg Documentation - https://ffmpeg.org/documentation.html", + "context": "" + } +] \ No newline at end of file 
diff --git a/example/manuscript.txt b/example/manuscript.txt index 8e52f2d..7844a7f 100644 --- a/example/manuscript.txt +++ b/example/manuscript.txt @@ -1,5 +1,19 @@ -Welcome to GnommoEditor, a code-first video editing system. [S1] +[S1] +This is the first slide. It appears immediately. [cite:Gnommo Documentation - https://github.com/example/gnommo] -In this example, we demonstrate how slides appear at specific timestamps based on markers in the transcript. [S2] +[S2] +However, this is the second slide. It should appear 1 second prior to when I say "however" -And that's the end of our demo. +[S3] +[video:Zoomin_MontageZoom] +This is me talking alongside a video. The video is constrained within the red square. Notice how the video stops immediately when we make the transition to the next slide. [cite:FFmpeg Documentation - https://ffmpeg.org/documentation.html] + +[S4] +I will continue to talk without pause, but in the finished recording - there will be a pause before the narration continues. Now a video will play that pauses the narration + +[S5] +[video:gnommologo] + +Notice how my voice continues after the video finished. 
+ +[S6] diff --git a/example/media/slides/example/slides.json b/example/media/slides/example/slides.json new file mode 100644 index 0000000..1ea7963 --- /dev/null +++ b/example/media/slides/example/slides.json @@ -0,0 +1,26 @@ +{ + "S1": { + "image": "example.001.png", + "type": "fullscreen" + }, + "S2": { + "image": "example.002.png", + "type": "fullscreen" + }, + "S3": { + "image": "example.003.png", + "type": "fullscreen" + }, + "S4": { + "image": "example.004.png", + "type": "fullscreen" + }, + "S5": { + "image": "example.005.png", + "type": "fullscreen" + }, + "S6": { + "image": "example.006.png", + "type": "fullscreen" + } +} \ No newline at end of file diff --git a/example/media/videos/intermediate/segments/concat_list.txt b/example/media/videos/intermediate/segments/concat_list.txt new file mode 100644 index 0000000..c8d583a --- /dev/null +++ b/example/media/videos/intermediate/segments/concat_list.txt @@ -0,0 +1,2 @@ +file '/Users/jenstandstad/Projects/gnommo/example/media/videos/intermediate/talking_head_batch0.mov' +file '/Users/jenstandstad/Projects/gnommo/example/media/videos/intermediate/segments/segment_0002.mov' diff --git a/example/media/videos/talking_head.transcript.json b/example/media/videos/talking_head.transcript.json new file mode 100644 index 0000000..b7a72b3 --- /dev/null +++ b/example/media/videos/talking_head.transcript.json @@ -0,0 +1,497 @@ +[ + { + "word": "This", + "start": 10.72, + "end": 11.4 + }, + { + "word": "is", + "start": 11.4, + "end": 11.6 + }, + { + "word": "the", + "start": 11.6, + "end": 11.78 + }, + { + "word": "first", + "start": 11.78, + "end": 11.98 + }, + { + "word": "slide.", + "start": 11.98, + "end": 12.44 + }, + { + "word": "It", + "start": 13.02, + "end": 13.3 + }, + { + "word": "appears", + "start": 13.3, + "end": 13.66 + }, + { + "word": "immediately.", + "start": 13.66, + "end": 14.3 + }, + { + "word": "However,", + "start": 15.34, + "end": 16.02 + }, + { + "word": "this", + "start": 16.34, + "end": 16.46 + 
}, + { + "word": "is", + "start": 16.46, + "end": 16.58 + }, + { + "word": "the", + "start": 16.58, + "end": 16.76 + }, + { + "word": "second", + "start": 16.76, + "end": 17.04 + }, + { + "word": "slide.", + "start": 17.04, + "end": 17.4 + }, + { + "word": "It", + "start": 17.74, + "end": 17.96 + }, + { + "word": "should", + "start": 17.96, + "end": 18.2 + }, + { + "word": "appear", + "start": 18.2, + "end": 18.54 + }, + { + "word": "one", + "start": 18.54, + "end": 18.98 + }, + { + "word": "second", + "start": 18.98, + "end": 19.46 + }, + { + "word": "prior", + "start": 19.46, + "end": 19.88 + }, + { + "word": "to", + "start": 19.88, + "end": 20.1 + }, + { + "word": "the", + "start": 20.1, + "end": 20.22 + }, + { + "word": "word", + "start": 20.22, + "end": 20.52 + }, + { + "word": "to", + "start": 20.52, + "end": 21.14 + }, + { + "word": "say", + "start": 21.14, + "end": 21.42 + }, + { + "word": "whoever", + "start": 21.42, + "end": 21.8 + }, + { + "word": "the", + "start": 21.8, + "end": 22.16 + }, + { + "word": "first", + "start": 22.16, + "end": 22.4 + }, + { + "word": "time.", + "start": 22.4, + "end": 22.68 + }, + { + "word": "This", + "start": 24.28, + "end": 24.96 + }, + { + "word": "is", + "start": 24.96, + "end": 25.12 + }, + { + "word": "me", + "start": 25.12, + "end": 25.36 + }, + { + "word": "taking,", + "start": 25.36, + "end": 25.74 + }, + { + "word": "talking", + "start": 26.12, + "end": 27.12 + }, + { + "word": "alongside", + "start": 27.12, + "end": 27.64 + }, + { + "word": "a", + "start": 27.64, + "end": 27.88 + }, + { + "word": "video.", + "start": 27.88, + "end": 28.16 + }, + { + "word": "The", + "start": 28.16, + "end": 28.92 + }, + { + "word": "video", + "start": 28.92, + "end": 29.18 + }, + { + "word": "is", + "start": 29.18, + "end": 29.36 + }, + { + "word": "constrained", + "start": 29.36, + "end": 29.76 + }, + { + "word": "within", + "start": 29.76, + "end": 30.14 + }, + { + "word": "the", + "start": 30.14, + "end": 30.32 + }, + { + 
"word": "red", + "start": 30.32, + "end": 30.48 + }, + { + "word": "square.", + "start": 30.48, + "end": 30.9 + }, + { + "word": "Notice", + "start": 31.26, + "end": 31.44 + }, + { + "word": "how", + "start": 31.44, + "end": 31.74 + }, + { + "word": "the", + "start": 31.74, + "end": 31.92 + }, + { + "word": "video", + "start": 31.92, + "end": 32.14 + }, + { + "word": "stops", + "start": 32.14, + "end": 32.44 + }, + { + "word": "immediately", + "start": 32.44, + "end": 32.94 + }, + { + "word": "when", + "start": 32.94, + "end": 33.36 + }, + { + "word": "we", + "start": 33.36, + "end": 33.54 + }, + { + "word": "make", + "start": 33.54, + "end": 33.74 + }, + { + "word": "the", + "start": 33.74, + "end": 33.94 + }, + { + "word": "transition", + "start": 33.94, + "end": 34.38 + }, + { + "word": "to", + "start": 34.38, + "end": 34.68 + }, + { + "word": "the", + "start": 34.68, + "end": 34.8 + }, + { + "word": "next", + "start": 34.8, + "end": 35.02 + }, + { + "word": "slide.", + "start": 35.02, + "end": 35.48 + }, + { + "word": "I", + "start": 37.18, + "end": 37.72 + }, + { + "word": "will", + "start": 37.72, + "end": 37.78 + }, + { + "word": "continue", + "start": 37.78, + "end": 38.08 + }, + { + "word": "to", + "start": 38.08, + "end": 38.32 + }, + { + "word": "talk", + "start": 38.32, + "end": 38.56 + }, + { + "word": "without", + "start": 38.56, + "end": 38.88 + }, + { + "word": "pause,", + "start": 38.88, + "end": 39.24 + }, + { + "word": "but", + "start": 39.46, + "end": 39.56 + }, + { + "word": "in", + "start": 39.56, + "end": 39.68 + }, + { + "word": "the", + "start": 39.68, + "end": 39.74 + }, + { + "word": "finished", + "start": 39.74, + "end": 39.98 + }, + { + "word": "recording", + "start": 39.98, + "end": 40.46 + }, + { + "word": "there", + "start": 40.46, + "end": 41.18 + }, + { + "word": "will", + "start": 41.18, + "end": 41.36 + }, + { + "word": "be", + "start": 41.36, + "end": 41.54 + }, + { + "word": "a", + "start": 41.54, + "end": 41.64 + }, + { + 
"word": "pause", + "start": 41.64, + "end": 41.92 + }, + { + "word": "before", + "start": 41.92, + "end": 42.28 + }, + { + "word": "the", + "start": 42.28, + "end": 42.5 + }, + { + "word": "narration", + "start": 42.5, + "end": 43.0 + }, + { + "word": "continues.", + "start": 43.0, + "end": 43.64 + }, + { + "word": "Now", + "start": 44.38, + "end": 44.52 + }, + { + "word": "a", + "start": 44.52, + "end": 44.68 + }, + { + "word": "video", + "start": 44.68, + "end": 44.9 + }, + { + "word": "will", + "start": 44.9, + "end": 45.08 + }, + { + "word": "play", + "start": 45.08, + "end": 45.36 + }, + { + "word": "that", + "start": 45.36, + "end": 45.76 + }, + { + "word": "pauses", + "start": 45.76, + "end": 46.52 + }, + { + "word": "the", + "start": 46.52, + "end": 46.76 + }, + { + "word": "narration.", + "start": 46.76, + "end": 47.2 + }, + { + "word": "Notice", + "start": 48.64, + "end": 49.18 + }, + { + "word": "how", + "start": 49.18, + "end": 49.42 + }, + { + "word": "my", + "start": 49.42, + "end": 49.58 + }, + { + "word": "voice", + "start": 49.58, + "end": 49.8 + }, + { + "word": "continues", + "start": 49.8, + "end": 50.36 + }, + { + "word": "after", + "start": 50.36, + "end": 50.84 + }, + { + "word": "the", + "start": 50.84, + "end": 51.02 + }, + { + "word": "video", + "start": 51.02, + "end": 51.24 + }, + { + "word": "finished.", + "start": 51.24, + "end": 51.76 + } +] \ No newline at end of file diff --git a/example/media/videos/videos.json b/example/media/videos/videos.json new file mode 100644 index 0000000..b3ff4ca --- /dev/null +++ b/example/media/videos/videos.json @@ -0,0 +1,39 @@ +{ + "talking_head": { + "source_file": "talking_head.mov", + "output_file": "talking_head_processed.mov", + "cutout": "talkinghead", + "always_visible": true, + "filter": [ + { + "type": "chroma_key", + "color": [131, 177, 83], + "similarity": 0.04, + "blend": 0.025, + "spill": 0.05 + }, + { + "type": "mask", + "left": 0.05, + "right": 0.10 + } + ] + }, + "gnommologo": { + 
"source_file": "Logo.mov", + "is_shared": true, + "cutout": "fullscreen", + "pause_narration": 0 , + "take": 10, + "skip": 0 + }, + "Zoomin_MontageZoom": { + "description": "Montage zoom", + "source_file": "MontageZoom.mp4", + "output_file": "MontageZoom.mp4", + "pause_narration":3, + "cutout": "square", + "is_shared": true, + "filter": [] + } +} diff --git a/example/project.json b/example/project.json index 9bd1b68..7c37d64 100644 --- a/example/project.json +++ b/example/project.json @@ -1,11 +1,35 @@ { + "id": "VideoExample", + "name": "Example", + "description": "In this video, I demonstrate the Gnommo video editing pipeline - a code-first approach to creating presenter-mode videos from Keynote presentations.", + "footer": "Subscribe for more tutorials!\nTwitter: @example", "resolution": [1920, 1080], "fps": 30, - "talkinghead": { - "x": 50, - "y": 600, - "targetheight": 400 - }, - "defaultSlideType": "square", - "background_video": "" + "gnommo_scratch": null, + "defaultSlideType": "fullscreen", + "keynote_file": "media/example.key", + "transcript": "media/videos/talking_head.transcript.json", + "background": "shared_assets/solarpunk.png", + "videos": "media/videos/videos.json", + "slides": "media/slides/Example/slides.json", + "audio": "media/audio/audio.json", + "main_video": "talking_head", + "cutouts": { + "talkinghead": { + "x": "-10%", + "y": "40%", + "height": "60%" + }, + "square": { + "x": "45%", + "y": "3%", + "width": "53%", + "height": "94%" + }, + "fullscreen": { + "x": "0%", + "y": "0%", + "height": "100%" + } + } } diff --git a/example/slides.json b/example/slides.json deleted file mode 100644 index f28a190..0000000 --- a/example/slides.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "S1": { - "image": "S1.png", - "type": "square" - }, - "S2": { - "image": "S2.png", - "type": "square" - } -} diff --git a/example/transcript.csv b/example/transcript.csv deleted file mode 100644 index bc1fb7d..0000000 --- a/example/transcript.csv +++ /dev/null @@ -1,8 +0,0 
@@ -t,word -0.00,Hello -0.30,world -0.60,[S1] -1.50,Second -1.80,slide -2.00,[S2] -2.50,End diff --git a/example/videos.json b/example/videos.json deleted file mode 100644 index a3c0606..0000000 --- a/example/videos.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "talking_head": { - "file": "media/talking_head.mp4", - "preprocess": [] - } -} diff --git a/gnommo.sh b/gnommo.sh index 831a8e4..5a35012 100755 --- a/gnommo.sh +++ b/gnommo.sh @@ -1,154 +1,21 @@ #!/bin/bash # # GnommoEditor - Code-first video editing pipeline +# This is a thin wrapper that activates the venv and runs the Python CLI. # -# Usage: -# gnommo.sh -p Render project -# gnommo.sh -p import Generate slides.json from image files -# gnommo.sh -p validate Validate only -# gnommo.sh -p preprocess Apply video preprocessing filters -# gnommo.sh -p transcribe Transcribe video -# gnommo.sh -p align Align markers to transcript -# gnommo.sh -p all Full pipeline: transcribe → align → render +# Usage: gnommo -p [action] [options] +# Run with -h for full help. # -set -e - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" VENV_PYTHON="$SCRIPT_DIR/venv/bin/python" # Check for venv if [[ ! -f "$VENV_PYTHON" ]]; then echo "Error: Virtual environment not found at $SCRIPT_DIR/venv" - echo "Create it with: python -m venv venv && ./venv/bin/pip install openai-whisper" + echo "Create it with: python -m venv venv && ./venv/bin/pip install -e . 
openai-whisper" exit 1 fi -# Parse arguments -PROJECT="" -COMMAND="render" -VERBOSE="" -FORCE="" - -usage() { - echo "Usage: gnommo.sh -p [command] [options]" - echo "" - echo "Commands:" - echo " render Render video (default)" - echo " import Generate slides.json from image files" - echo " validate Validate project only" - echo " preprocess Apply video preprocessing filters (chroma key, etc.)" - echo " transcribe Transcribe video audio" - echo " align Align manuscript to transcript" - echo " all Full pipeline: transcribe → align → render" - echo "" - echo "Options:" - echo " -p Project directory (required)" - echo " -v Verbose output" - echo " -f Force overwrite existing files" - echo " -h Show this help" - echo "" - echo "Examples:" - echo " gnommo.sh -p video1 # Render video1 project" - echo " gnommo.sh -p video1 import # Generate slides.json" - echo " gnommo.sh -p video1 import -f # Force overwrite slides.json" - echo " gnommo.sh -p video1 validate # Validate only" - echo " gnommo.sh -p video1 all # Full pipeline" - exit 0 -} - -while [[ $# -gt 0 ]]; do - case $1 in - -p|--project) - PROJECT="$2" - shift 2 - ;; - -v|--verbose) - VERBOSE="-v" - shift - ;; - -f|--force) - FORCE="-f" - shift - ;; - -h|--help) - usage - ;; - import|validate|render|preprocess|transcribe|align|all) - COMMAND="$1" - shift - ;; - *) - echo "Unknown option: $1" - usage - ;; - esac -done - -# Validate project argument -if [[ -z "$PROJECT" ]]; then - echo "Error: Project directory required (-p )" - echo "" - usage -fi - -if [[ ! -d "$PROJECT" ]]; then - echo "Error: Project directory not found: $PROJECT" - exit 1 -fi - -if [[ ! 
-f "$PROJECT/project.json" ]]; then - echo "Error: project.json not found in $PROJECT" - exit 1 -fi - -# Run commands using new CLI interface -run_gnommo() { - "$VENV_PYTHON" -m gnommo -p "$PROJECT" -a "$1" $VERBOSE -} - -run_gnommo_import() { - "$VENV_PYTHON" -m gnommo -p "$PROJECT" -a validate -i $FORCE $VERBOSE -} - -case $COMMAND in - import) - echo "=== Importing assets for $PROJECT ===" - run_gnommo_import - ;; - - validate) - echo "=== Validating $PROJECT ===" - run_gnommo validate - ;; - - transcribe) - echo "=== Transcribing $PROJECT ===" - run_gnommo transcribe - ;; - - align) - echo "=== Aligning $PROJECT ===" - run_gnommo align - ;; - - render) - echo "=== Rendering $PROJECT ===" - run_gnommo render - ;; - - preprocess) - echo "=== Preprocessing $PROJECT ===" - run_gnommo preprocess - ;; - - all) - echo "=== Full Pipeline: $PROJECT ===" - run_gnommo all - ;; - - *) - echo "Unknown command: $COMMAND" - usage - ;; -esac +# Pass all arguments directly to the Python CLI +exec "$VENV_PYTHON" -m gnommo "$@" diff --git a/gnommo/aligner.py b/gnommo/aligner.py deleted file mode 100644 index abea290..0000000 --- a/gnommo/aligner.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Alignment stage: match manuscript markers to transcript timestamps.""" - -import csv -import re -from dataclasses import dataclass -from pathlib import Path - -from .errors import GnommoError -from .transcriber import TranscribedWord - - -class AlignmentError(GnommoError): - """Error during alignment.""" - pass - - -@dataclass -class MarkerAlignment: - """A marker with its aligned timestamp.""" - marker_id: str - timestamp: float - matched_phrase: str - confidence: float # 0-1, how confident the match is - - -def extract_marker_contexts(manuscript_text: str) -> list[tuple[str, str]]: - """ - Extract markers and the text immediately following them. 
- - Returns: - List of (marker_id, following_text) tuples - """ - # Split by markers, keeping the markers - parts = re.split(r"\[([A-Za-z0-9_]+)\]", manuscript_text) - - # parts will be: [text_before, marker1, text_after1, marker2, text_after2, ...] - contexts = [] - - for i in range(1, len(parts), 2): - marker_id = parts[i] - if i + 1 < len(parts): - following_text = parts[i + 1].strip() - # Get first sentence or first N words - following_text = _get_first_phrase(following_text) - contexts.append((marker_id, following_text)) - - return contexts - - -def _get_first_phrase(text: str, max_words: int = 10) -> str: - """Extract first phrase (up to first sentence end or max_words).""" - # Clean up the text - text = text.replace("\n", " ").strip() - - # Find first sentence boundary - match = re.search(r"[.!?]", text) - if match and match.start() < 200: - text = text[: match.start()] - - # Limit to max_words - words = text.split()[:max_words] - return " ".join(words) - - -def normalize_text(text: str) -> str: - """Normalize text for matching (lowercase, remove punctuation).""" - text = text.lower() - text = re.sub(r"[^\w\s]", "", text) - text = re.sub(r"\s+", " ", text) - return text.strip() - - -def find_phrase_in_transcript( - phrase: str, - transcript: list[TranscribedWord], - start_from: int = 0, -) -> tuple[int, float]: - """ - Find a phrase in the transcript and return the word index and timestamp. - - Uses sliding window matching with normalization. 
- - Returns: - Tuple of (word_index, timestamp) or (-1, 0.0) if not found - """ - phrase_normalized = normalize_text(phrase) - phrase_words = phrase_normalized.split() - - if not phrase_words: - return -1, 0.0 - - # Try to find increasingly shorter prefixes - for length in range(len(phrase_words), 2, -1): - target = " ".join(phrase_words[:length]) - - # Sliding window through transcript - for i in range(start_from, len(transcript) - length + 1): - window_words = [normalize_text(transcript[j].word) for j in range(i, i + length)] - window_text = " ".join(window_words) - - if target in window_text or window_text in target: - return i, transcript[i].start - - # Fallback: try to find just the first few words - if len(phrase_words) >= 2: - target = " ".join(phrase_words[:3]) - for i in range(start_from, len(transcript) - 2): - window_words = [normalize_text(transcript[j].word) for j in range(i, min(i + 5, len(transcript)))] - window_text = " ".join(window_words) - if phrase_words[0] in window_text and phrase_words[1] in window_text: - return i, transcript[i].start - - return -1, 0.0 - - -def align_markers( - manuscript_text: str, - transcript: list[TranscribedWord], - offset_seconds: float = -1.0, -) -> list[MarkerAlignment]: - """ - Align manuscript markers to transcript timestamps. - - Args: - manuscript_text: Full manuscript text with [S1], [S2] etc. 
- transcript: Word-level transcript with timestamps - offset_seconds: Offset to apply to found timestamps (default -1.0) - - Returns: - List of MarkerAlignment with timestamps - """ - contexts = extract_marker_contexts(manuscript_text) - alignments: list[MarkerAlignment] = [] - - last_index = 0 - - for marker_id, following_text in contexts: - idx, timestamp = find_phrase_in_transcript( - following_text, transcript, start_from=last_index - ) - - if idx >= 0: - # Apply offset (e.g., -1 second before the word) - adjusted_time = max(0.0, timestamp + offset_seconds) - alignments.append(MarkerAlignment( - marker_id=marker_id, - timestamp=adjusted_time, - matched_phrase=following_text[:50], - confidence=1.0, - )) - last_index = idx - else: - # Could not find match - report but continue - alignments.append(MarkerAlignment( - marker_id=marker_id, - timestamp=-1.0, # Indicates not found - matched_phrase=following_text[:50], - confidence=0.0, - )) - - return alignments - - -def save_aligned_transcript( - alignments: list[MarkerAlignment], - transcript: list[TranscribedWord], - output_path: Path, -) -> None: - """ - Save aligned transcript as CSV compatible with gnommo's transcript.csv format. - - Format: - t,word - 0.00,Hello - 1.50,[S1] - 1.51,This - ... 
- """ - # Build list of (timestamp, word) including markers - entries: list[tuple[float, str]] = [] - - # Add all words from transcript - for word in transcript: - entries.append((word.start, word.word)) - - # Add markers at their aligned positions - for alignment in alignments: - if alignment.timestamp >= 0: - entries.append((alignment.timestamp, f"[{alignment.marker_id}]")) - - # Sort by timestamp - entries.sort(key=lambda x: x[0]) - - # Write CSV - with open(output_path, "w", encoding="utf-8", newline="") as f: - writer = csv.writer(f) - writer.writerow(["t", "word"]) - for timestamp, word in entries: - writer.writerow([f"{timestamp:.2f}", word]) diff --git a/gnommo/cli.py b/gnommo/cli.py index 0d0a86b..e88ac77 100644 --- a/gnommo/cli.py +++ b/gnommo/cli.py @@ -3,15 +3,19 @@ import argparse import json import re +import shutil +import subprocess import sys +from datetime import datetime from pathlib import Path - from . import __version__ from .errors import GnommoError, ParseError, ValidationError, RenderError +from typing import Optional, Union class NotImplementedException(GnommoError): """Feature not yet implemented.""" + pass @@ -20,6 +24,19 @@ def main() -> int: parser = argparse.ArgumentParser( prog="gnommo", description="GnommoEditor - A code-first video editing pipeline", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + gnommo -p video1 render Render the full project + gnommo -p video1 render --slides S1:S10 Render only slides S1-S10 + gnommo -p video1 render --slides S10: Render from S10 to end + gnommo -p video1 validate Validate only + gnommo -p video1 import Generate slides.json from images + gnommo -p video1 pre Preprocess videos (chroma key, etc.) 
+ gnommo -p video1 all Full pipeline: transcribe → align → render + gnommo -p video1 render --dry-run Show FFmpeg command without running + gnommo -p video1 description Generate YouTube description file +""", ) parser.add_argument( "--version", @@ -29,46 +46,68 @@ def main() -> int: # Required arguments parser.add_argument( - "-p", "--project", + "-p", + "--project", type=str, required=True, - help="Project name (directory in current folder)", + help="Project directory", ) parser.add_argument( - "-a", "--action", + "action", type=str, - choices=["validate", "preprocess", "render", "all", "transcribe", "align"], - required=True, - help="Action to perform", + nargs="?", + default="render", + choices=[ + "validate", + "preprocess", + "pre", + "render", + "all", + "transcribe", + "align", + "import", + "description", + ], + help="Action to perform (default: render)", ) # Optional arguments parser.add_argument( - "-i", "--import", - dest="import_assets", - action="store_true", - help="Import assets and generate metadata JSON files", - ) - parser.add_argument( - "-v", "--verbose", + "-v", + "--verbose", action="store_true", help="Verbose output", ) parser.add_argument( - "-f", "--force", + "-f", + "--force", action="store_true", - help="Force destructive changes (overwrite existing files)", - ) - parser.add_argument( - "--no-cache", - action="store_true", - help="Force cache break (not implemented)", + help="Force overwrite existing files", ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be done without executing", ) + parser.add_argument( + "--slides", + type=str, + help="Render only a range of slides (e.g., S1:S10, S5:, S10:S20)", + ) + parser.add_argument( + "--res", + type=str, + choices=["low", "full"], + default="full", + help="Resolution: 'low' (490x270) for fast preview, 'full' for project resolution", + ) + parser.add_argument( + "-w", + "--workers", + type=int, + default=1, + help="Number of parallel workers for preprocessing 
(default: 1)", + ) args = parser.parse_args() @@ -78,27 +117,34 @@ def main() -> int: project_path = Path.cwd() / project_path try: - # Check for --no-cache - if args.no_cache: - raise NotImplementedException("--no-cache is not yet implemented") - - # Handle import mode - if args.import_assets: - return cmd_import(project_path, args.force, args.verbose) - # Handle actions - if args.action == "validate": + action = args.action + + if action == "import": + return cmd_import(project_path, args.force, args.verbose) + elif action == "validate": return cmd_validate(project_path, args.verbose) - elif args.action == "preprocess": - return cmd_preprocess(project_path, args.verbose, args.dry_run) - elif args.action == "render": - return cmd_render(project_path, args.verbose, args.dry_run) - elif args.action == "transcribe": + elif action in ("preprocess", "pre"): + return cmd_preprocess(project_path, args.verbose, args.dry_run, args.force, args.workers) + elif action == "render": + return cmd_render( + project_path, + args.verbose, + args.dry_run, + args.slides, + args.res, + args.force, + ) + elif action == "transcribe": return cmd_transcribe(project_path, args.verbose) - elif args.action == "align": + elif action == "align": return cmd_align(project_path, args.verbose) - elif args.action == "all": - return cmd_all(project_path, args.verbose, args.dry_run) + elif action == "all": + return cmd_all( + project_path, args.verbose, args.dry_run, args.res, args.force + ) + elif action == "description": + return cmd_description(project_path, args.verbose) except GnommoError as e: print(f"Error: {e}", file=sys.stderr) @@ -114,6 +160,7 @@ def main() -> int: # Import Command # ============================================================================= + def cmd_import(project_path: Path, force: bool, verbose: bool) -> int: """Import assets and generate metadata JSON files.""" print(f"Importing assets for: {project_path.name}") @@ -122,37 +169,139 @@ def cmd_import(project_path: 
Path, force: bool, verbose: bool) -> int: print(f"Error: Project directory not found: {project_path}", file=sys.stderr) return 1 - # Check for existing files that would be overwritten + # Import videos from media/videos directory + videos_dir = project_path / "media" / "videos" + if videos_dir.exists(): + _import_videos(videos_dir, verbose) + + # Import presenter notes from Keynote file (also exports slide PNGs) + keynote_files = list(project_path.glob("*.key")) + if keynote_files: + keynote_file = keynote_files[0] # Use first .key file found + if len(keynote_files) > 1: + print(f" Warning: Multiple .key files found, using {keynote_file.name}") + _import_presenter_notes(project_path, keynote_file, verbose) + + # Generate slides.json for each slide directory (after Keynote export) slides_base = project_path / "media" / "slides" - slides_dirs = [d for d in slides_base.glob("*/") if d.is_dir()] if slides_base.exists() else [] - videos_json = project_path / "videos.json" - - files_to_create = [] - - # Check for slide directories to import + slides_dirs = ( + [d for d in slides_base.glob("*/") if d.is_dir()] + if slides_base.exists() + else [] + ) for slides_dir in slides_dirs: - slides_json = slides_dir / "slides.json" - if slides_json.exists() and not force: - print(f"Warning: {slides_json} already exists. Use -f to overwrite.") - return 1 - files_to_create.append(("slides", slides_dir)) + _generate_slides_json(slides_dir, verbose) + else: + if verbose: + print(" No .key file found, skipping presenter notes import") - if not force and files_to_create: - print("\nThe following files will be created/overwritten:") - for ftype, fpath in files_to_create: - print(f" - {fpath}/slides.json") - print("\nUse -f/--force to proceed.") - return 1 - - # Generate slides.json for each directory - for ftype, slides_dir in files_to_create: - if ftype == "slides": - _generate_slides_json(slides_dir, verbose) + # Import shared assets (pexels, etc.) 
from shared_assets directory + # Look for shared_assets relative to project or in parent directories + shared_assets_dir = _find_shared_assets(project_path) + if shared_assets_dir: + _import_shared_assets(shared_assets_dir, verbose) print("Import complete.") return 0 +def _find_shared_assets(project_path: Path) -> Optional[Path]: + """Find the shared_assets directory. + + Looks in: + 1. project_path/shared_assets + 2. project_path/../shared_assets (sibling to project) + """ + # Check if shared_assets is inside project + if (project_path / "shared_assets").exists(): + return project_path / "shared_assets" + + # Check if shared_assets is sibling to project + if (project_path.parent / "shared_assets").exists(): + return project_path.parent / "shared_assets" + + return None + + +def _import_shared_assets(shared_assets_dir: Path, verbose: bool) -> None: + """Import video files from shared_assets directory into videos.json. + + Scans the root level and all subdirectories for video files and creates + a unified videos.json in shared_assets/. + + Video IDs use the filename for root-level files (e.g., "Logo") or + are prefixed with the subfolder name for subdirectory files (e.g., "pexels/filename"). 
+ """ + video_extensions = {".mov", ".mp4", ".webm", ".avi", ".mkv", ".m4v"} + + # Find all video files in shared_assets (root level and subdirectories) + video_files: list[tuple[Path, Path]] = [] # (relative_path, absolute_path) + + for item in shared_assets_dir.iterdir(): + if item.name.startswith("."): + continue + + if item.is_file(): + # Video file directly in shared_assets root + if ( + item.suffix.lower() in video_extensions + and not item.name.endswith("_processed.mov") + and not item.name.endswith("_processed.webm") + ): + rel_path = item.relative_to(shared_assets_dir) + video_files.append((rel_path, item)) + elif item.is_dir(): + # Scan subdirectories recursively + for video_file in item.rglob("*"): + if ( + video_file.is_file() + and video_file.suffix.lower() in video_extensions + and not video_file.name.endswith("_processed.mov") + and not video_file.name.endswith("_processed.webm") + ): + rel_path = video_file.relative_to(shared_assets_dir) + video_files.append((rel_path, video_file)) + + if not video_files: + if verbose: + print(f" No video files found in {shared_assets_dir}") + return + + # Load existing videos.json if it exists + videos_json_path = shared_assets_dir / "videos.json" + existing_videos: dict = {} + if videos_json_path.exists(): + with open(videos_json_path, "r", encoding="utf-8") as f: + existing_videos = json.load(f) + + # Add new videos (don't overwrite existing) + added_count = 0 + for rel_path, abs_path in sorted(video_files): + # Use path relative to shared_assets without extension as video_id + # e.g., "Logo" for root files, "pexels/6759604-hd" for subdirectory files + video_id = str(rel_path.with_suffix("")) + + if video_id in existing_videos: + if verbose: + print(f" Skipping {video_id} (already exists)") + continue + + existing_videos[video_id] = { + "source_file": str(rel_path), + } + added_count += 1 + if verbose: + print(f" Added: {video_id}") + + if added_count > 0: + # Write updated videos.json + with 
open(videos_json_path, "w", encoding="utf-8") as f: + json.dump(existing_videos, f, indent=2) + print(f" Updated {videos_json_path} (+{added_count} shared assets)") + else: + print(f" No new shared assets to add") + + def _generate_slides_json(directory: Path, verbose: bool) -> None: """Generate slides.json from Keynote export folder.""" extensions = {".png", ".gif", ".pdf", ".jpg", ".jpeg"} @@ -194,10 +343,152 @@ def _generate_slides_json(directory: Path, verbose: bool) -> None: print(f" [{slide_id}]") +def _import_videos(videos_dir: Path, verbose: bool) -> None: + """Import video files into videos.json. + + Scans the videos directory for video files and adds them to videos.json. + Uses the filename (without extension) as the video_id. + Does not overwrite existing entries - only adds new ones. + """ + video_extensions = {".mov", ".mp4", ".webm", ".avi", ".mkv", ".m4v"} + + # Find all video files (exclude processed outputs and files in subdirs) + video_files = [ + f + for f in videos_dir.iterdir() + if f.is_file() + and f.suffix.lower() in video_extensions + and not f.name.endswith("_processed.mov") + and not f.name.endswith("_processed.webm") + ] + + if not video_files: + if verbose: + print(f" No new video files found in {videos_dir}") + return + + # Load existing videos.json if it exists + videos_json_path = videos_dir / "videos.json" + existing_videos: dict = {} + if videos_json_path.exists(): + with open(videos_json_path, "r", encoding="utf-8") as f: + existing_videos = json.load(f) + + # Add new videos (don't overwrite existing) + added_count = 0 + for video_file in sorted(video_files): + # Use filename without extension as video_id + video_id = video_file.stem + + if video_id in existing_videos: + if verbose: + print( + f" Skipping {video_id} (already exists). 
Change manually if needed" + ) + continue + + existing_videos[video_id] = { + "source_file": video_file.name, + "output_file": video_file.name, + "cutout": "square", + "filter": [], + } + added_count += 1 + if verbose: + print(f" Added: {video_id}") + + if added_count > 0: + # Write updated videos.json + with open(videos_json_path, "w", encoding="utf-8") as f: + json.dump(existing_videos, f, indent=2) + print(f" Updated {videos_json_path} (+{added_count} videos)") + else: + print(f" No new videos to add in {videos_dir}") + + +def _import_presenter_notes( + project_path: Path, keynote_file: Path, verbose: bool +) -> None: + """Extract presenter notes from Keynote and write to manuscript.txt. + + Uses the JXA script (extract_keynote_notes.js) to extract notes via osascript. + Also exports slides as PNG images to media/slides/{project_name}/. + Backs up existing manuscript.txt before overwriting. + """ + print(f" Extracting presenter notes from {keynote_file.name}...") + + # Find the JXA script (in the same directory as this module) + script_dir = Path(__file__).parent + jxa_script = script_dir / "extract_keynote_notes.js" + + if not jxa_script.exists(): + print(f" Error: JXA script not found at {jxa_script}", file=sys.stderr) + return + + # Backup existing manuscript.txt if it exists + manuscript_path = project_path / "manuscript.txt" + if manuscript_path.exists(): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + backup_path = project_path / f"manuscript.txt.{timestamp}.bak" + shutil.copy2(manuscript_path, backup_path) + if verbose: + print(f" Backed up manuscript.txt to {backup_path.name}") + + # Slides export directory: {project}/media/slides/{project_name}/ + slides_dir = project_path / "media" / "slides" / project_path.name + print(f" Exporting slides to {slides_dir}...") + + # Run JXA extractor via osascript (also exports slides) + proc = subprocess.run( + [ + "osascript", + "-l", + "JavaScript", + str(jxa_script), + str(keynote_file.resolve()), + 
str(slides_dir.resolve()), + ], + capture_output=True, + text=True, + ) + + if proc.returncode != 0: + print(f" Error extracting presenter notes:", file=sys.stderr) + print(f" {proc.stderr}", file=sys.stderr) + return + + # Parse JSON output from JXA script + try: + notes_data = json.loads(proc.stdout) + except json.JSONDecodeError as e: + print(f" Error parsing notes JSON: {e}", file=sys.stderr) + return + + # Convert to manuscript.txt format + lines = [] + for item in notes_data: + idx = item.get("slide_index") + notes = (item.get("notes") or "").rstrip() + + lines.append(f"[S{idx}]") + if notes: + lines.append(notes) + lines.append("") # blank line between slides + + # Write manuscript.txt + manuscript_path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + print(f" Wrote {manuscript_path} ({len(notes_data)} slides)") + + if verbose: + non_empty = sum(1 for item in notes_data if item.get("notes")) + print(f" {non_empty} slides have presenter notes") + + # ============================================================================= # Validate Command # ============================================================================= + def cmd_validate(project_path: Path, verbose: bool) -> int: """Validate project configuration.""" from .parser import ( @@ -215,10 +506,10 @@ def cmd_validate(project_path: Path, verbose: bool) -> int: return 1 # Parse all files - _, markers, malformed = parse_manuscript(project_path) + _, markers, malformed, _ = parse_manuscript(project_path) config = parse_project_config(project_path) slides = parse_slides(project_path, config) - videos = parse_videos(project_path) + videos, videos_dir = parse_videos(project_path, config) if verbose: print(f" - Markers in manuscript: {len(markers)}") @@ -226,7 +517,9 @@ def cmd_validate(project_path: Path, verbose: bool) -> int: print(f" - Videos defined: {len(videos)}") # Validate - validate_project(project_path, markers, config, slides, videos, malformed) + validate_project( + 
project_path, markers, config, slides, videos, videos_dir, malformed + ) print("Validation passed.") return 0 @@ -236,29 +529,83 @@ def cmd_validate(project_path: Path, verbose: bool) -> int: # Preprocess Command # ============================================================================= -def cmd_preprocess(project_path: Path, verbose: bool, dry_run: bool) -> int: + +def cmd_preprocess( + project_path: Path, verbose: bool, dry_run: bool, force: bool = False, workers: int = 1 +) -> int: """Run preprocessing pipeline on video sources.""" + from concurrent.futures import ThreadPoolExecutor, as_completed from .parser import parse_project_config, parse_videos from .preprocessor import preprocess_video print(f"Preprocessing: {project_path.name}") config = parse_project_config(project_path) - videos = parse_videos(project_path) + videos, videos_dir = parse_videos(project_path, config) + # Resolve intermediate directory + gnommo_scratch = None + if config.gnommo_scratch: + gnommo_scratch = Path(config.gnommo_scratch) + if not gnommo_scratch.is_absolute(): + gnommo_scratch = project_path / gnommo_scratch + print(f" Using intermediate dir: {gnommo_scratch}") + + # Filter videos that need preprocessing + videos_to_process = [] for video_id, video_source in videos.items(): - print(f"\n Processing: {video_id}") - - if not video_source.preprocess: - print(" No preprocessing steps defined, skipping.") + if not video_source.filter: + print(f" {video_id}: No filters defined, skipping.") continue + videos_to_process.append((video_id, video_source)) - if dry_run: - print(f" Would preprocess: {video_source.file}") - for step in video_source.preprocess: + if not videos_to_process: + print("\nNo videos to preprocess.") + return 0 + + if dry_run: + for video_id, video_source in videos_to_process: + print(f"\n Would preprocess: {video_id}") + print(f" Source: {video_source.source_file}") + print(f" Output: {video_source.output_file or 'N/A'}") + for step in video_source.filter: 
print(f" - {step}") - else: - preprocess_video(project_path, video_id, video_source, verbose) + return 0 + + # Process videos + if workers > 1 and len(videos_to_process) > 1: + # Parallel processing + num_workers = min(workers, len(videos_to_process)) + print(f"\n Processing {len(videos_to_process)} videos in parallel ({num_workers} workers)") + + def process_video_task(task): + video_id, video_source = task + preprocess_video( + videos_dir, video_id, video_source, verbose=False, force=force, + custom_gnommo_scratch=gnommo_scratch + ) + return video_id + + completed = 0 + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = {executor.submit(process_video_task, task): task for task in videos_to_process} + for future in as_completed(futures): + video_id = future.result() + completed += 1 + print(f" Completed: {video_id} ({completed}/{len(videos_to_process)})") + else: + # Sequential processing + for video_id, video_source in videos_to_process: + print(f"\n Processing: {video_id}") + if video_source.take: + print(f" Taking only {video_source.take} seconds") + print(f" Source file: {video_source.source_file}") + print(f" Output file: {video_source.output_file or 'N/A'}") + print(f" Filters: {len(video_source.filter)} step(s)") + + preprocess_video( + videos_dir, video_id, video_source, verbose, force, gnommo_scratch + ) print("\nPreprocessing complete.") return 0 @@ -268,51 +615,276 @@ def cmd_preprocess(project_path: Path, verbose: bool, dry_run: bool) -> int: # Render Command # ============================================================================= -def cmd_render(project_path: Path, verbose: bool, dry_run: bool) -> int: + +def _format_time(seconds: float) -> str: + """Format seconds as MM:SS.ms""" + if seconds < 0: + return "??:??.??" 
+ mins = int(seconds // 60) + secs = seconds % 60 + return f"{mins:02d}:{secs:05.2f}" + + +def _print_render_plan_details(plan, marker_timings, slides: dict) -> None: + """ + Print a detailed render plan showing each marker with its aligned time. + + Uses marker_timings from the transformer which contains alignment info. + """ + from .models import CAMERA_PRESETS + + print("\n RENDER PLAN:") + print(" " + "-" * 76) + + # Build lookup for video events by video_id + video_events_by_id = {} + for event in plan.video_events: + video_events_by_id[event.video_id] = event + + audio_events_by_time = {} + for event in plan.audio_events: + t = round(event.start_time, 1) + if t not in audio_events_by_time: + audio_events_by_time[t] = [] + audio_events_by_time[t].append(event) + + camera_events_by_time = {} + for event in plan.camera_events: + t = round(event.time, 1) + if t not in camera_events_by_time: + camera_events_by_time[t] = [] + camera_events_by_time[t].append(event) + + # Print each marker timing + aligned_count = 0 + unaligned_count = 0 + + for timing in marker_timings: + marker_id = timing.marker_id + context = timing.context + if len(context) > 50: + context = context[:47] + "..." + + if timing.timestamp >= 0: + aligned_count += 1 + time_str = _format_time(timing.timestamp) + + # Show confidence if fuzzy match + conf_str = "" + if timing.confidence < 1.0: + conf_str = f" ({timing.confidence:.0%})" + + # Determine marker type for display + if marker_id in slides: + print(f' {marker_id:6} {time_str}{conf_str} "{context}"') + elif marker_id.startswith("video:"): + video_id = marker_id[6:] + # Find corresponding event by video_id + event = video_events_by_id.get(video_id) + if event: + cutout = event.video_source.cutout + duration = event.end_time - event.start_time + else: + cutout = "?" 
+ duration = 0 + print(f" {marker_id:20} {time_str} in '{cutout}' ({duration:.1f}s)") + elif marker_id.startswith("narration:"): + video_id = marker_id[10:] + print(f" {marker_id:20} {time_str} (continuous)") + elif marker_id in CAMERA_PRESETS: + print(f" {time_str} [{marker_id}]") + elif marker_id.startswith("A"): + print(f" {time_str} [audio:{marker_id[1:]}]") + else: + print(f' {marker_id:6} {time_str} "{context}"') + else: + unaligned_count += 1 + print(f' {marker_id:6} ??:??.?? NOT ALIGNED - "{context}"') + + print(" " + "-" * 76) + + # Summary + total_markers = len(marker_timings) + slide_markers = [t for t in marker_timings if t.marker_id in slides] + aligned_slides = len([t for t in slide_markers if t.timestamp >= 0]) + total_slides = len(slide_markers) + + status = "OK" if unaligned_count == 0 else f"{unaligned_count} UNALIGNED" + print(f" Markers: {aligned_count}/{total_markers} aligned ({status})") + print(f" Slides: {aligned_slides}/{total_slides}") + print( + f" Videos: {len(plan.video_events)} triggered, {len(plan.narration_videos)} always-visible" + ) + if plan.outro_events: + print(f" Outro: {len(plan.outro_events)} video(s)") + for event in plan.outro_events: + print(f" - {event.video_id}: {_format_time(event.start_time)} - {_format_time(event.end_time)}") + print(f" Duration: {_format_time(plan.total_duration)}") + + +def _parse_slide_range(slides_arg: str) -> tuple[str, Optional[str]]: + """Parse slide range argument like 'S1:S10' or 'S5:' into a tuple.""" + if ":" not in slides_arg: + raise ValueError( + f"Invalid slide range '{slides_arg}'. Expected format: S1:S10 or S5:" + ) + + parts = slides_arg.split(":", 1) + start_slide = parts[0].strip() + end_slide = parts[1].strip() if parts[1].strip() else None + + if not start_slide: + raise ValueError( + f"Invalid slide range '{slides_arg}'. Start slide is required." 
+ ) + + return start_slide, end_slide + + +def cmd_render( + project_path: Path, + verbose: bool, + dry_run: bool, + slides_arg: str = None, + res: str = "full", + force: bool = False, +) -> int: """Render final video.""" from .parser import ( + parse_audio, parse_manuscript, parse_project_config, parse_slides, - parse_transcript, parse_videos, + save_citations, ) + from .transcriber import load_transcript from .validator import validate_project from .transformer import build_render_plan from .renderer import render, generate_ffmpeg_command_string - print(f"Rendering: {project_path.name}") + # Parse slide range if provided + slide_range = None + if slides_arg: + slide_range = _parse_slide_range(slides_arg) + print(f"Rendering: {project_path.name} (slides {slides_arg})") + else: + print(f"Rendering: {project_path.name}") + + # Show resolution mode + if res == "low": + print(" Resolution: LOW (490x270) - fast preview mode") # Stage 1: Parse print("\n[1/4] Parsing...") - _, markers, malformed = parse_manuscript(project_path) + manuscript_text, markers, malformed, citations = parse_manuscript(project_path) + + # Save citations for later use (e.g., description generation) + if citations: + citations_path = project_path / "citations.json" + save_citations(citations, citations_path) config = parse_project_config(project_path) + + # Override resolution for low-res preview mode + if res == "low": + config.resolution = (490, 270) + slides = parse_slides(project_path, config) - videos = parse_videos(project_path) - transcript = parse_transcript(project_path) + videos, videos_dir = parse_videos(project_path, config) + audio, audio_dir = parse_audio(project_path, config) + + # Load whisper transcription JSON + # Look for .transcript.json next to the narration video + result = _find_narration_video(config, videos) + if result: + _, narration_source = result + video_path = videos_dir / narration_source.source_file + transcript_path = video_path.with_suffix(".transcript.json") + 
else: + transcript_path = project_path / "transcript.json" + + if not transcript_path.exists(): + print(f"Error: Transcription not found: {transcript_path}", file=sys.stderr) + print(f"Run 'gnommo -p {project_path.name} transcribe' first.", file=sys.stderr) + return 1 + + transcription = load_transcript(transcript_path) if verbose: - print(f" - Markers: {len(markers)}") - print(f" - Slides: {len(slides)}") - print(f" - Transcript entries: {len(transcript)}") + print(f" - Markers in manuscript: {len(markers)}") + print(f" - Slides defined: {len(slides)}") + print(f" - Audio clips: {len(audio)}") + print(f" - Transcription words: {len(transcription)}") # Stage 2: Validate print("\n[2/4] Validating...") - validate_project(project_path, markers, config, slides, videos, malformed) + validate_project( + project_path, markers, config, slides, videos, videos_dir, malformed + ) print(" Passed.") - # Stage 3: Transform + # Stage 3: Transform (includes on-the-fly alignment) print("\n[3/4] Building render plan...") - plan = build_render_plan(project_path, config, slides, videos, transcript) - print(f" - Duration: {plan.total_duration:.1f}s") - print(f" - Slide events: {len(plan.slide_events)}") + plan, marker_timings = build_render_plan( + project_path, + config, + slides, + videos, + videos_dir, + manuscript_text, + transcription, + audio, + audio_dir, + slide_range=slide_range, + ) + if plan.time_offset > 0: + print(f" Time offset: {plan.time_offset:.1f}s (partial render)") - if verbose: - for event in plan.slide_events: - print(f" [{event.slide_id}] {event.start_time:.1f}s - {event.end_time:.1f}s") + # Print detailed render plan with alignment info + _print_render_plan_details(plan, marker_timings, slides) + if plan.audio_events: + print(f"\n Audio effects:") + for event in plan.audio_events: + loop_str = " (loop)" if event.audio_def.loop else "" + pause_str = " [ignores pauses]" if event.audio_def.ignore_pauses else "" + print(f" - {event.audio_id}: 
'{event.audio_def.file}' @ {_format_time(event.start_time)}{loop_str}{pause_str}") + # Show always-visible videos + if plan.narration_videos: + print(f"\n Always-visible videos:") + for video_id, video_source, cutout in plan.narration_videos: + skip_str = f" (skip: {video_source.skip:.1f}s)" if video_source.skip > 0 else "" + print(f" - {video_id} in '{video_source.cutout}'{skip_str}") + + # Show narration pauses + if plan.narration_pauses: + print(f"\n Narration pauses:") + for pause in plan.narration_pauses: + print( + f" - {pause.video_id} at {_format_time(pause.output_time)} " + f"for {pause.duration:.1f}s (narration freezes at {_format_time(pause.narration_time)})" + ) + + # Check for unaligned markers + unaligned = [t for t in marker_timings if t.timestamp < 0] + if unaligned: + print(f"\n WARNING: {len(unaligned)} marker(s) could not be aligned!") + for t in unaligned: + print(f' [{t.marker_id}] - "{t.context}"') + if not force: + print(f"\n Run with -f/--force to render anyway.") + return 1 + else: + print(f"\n Continuing anyway due to --force flag...") # Stage 4: Render - output_path = project_path / "out" / "final.mp4" + # Generate output filename based on slide range and resolution + base_name = "preview" if res == "low" else "final" + if slide_range: + start, end = slide_range + range_suffix = f"_{start}-{end}" if end else f"_{start}-end" + output_path = project_path / "out" / f"{base_name}{range_suffix}.mp4" + else: + output_path = project_path / "out" / f"{base_name}.mp4" if dry_run: print("\n[4/4] FFmpeg command (dry run):") @@ -331,22 +903,56 @@ def cmd_render(project_path: Path, verbose: bool, dry_run: bool) -> int: # Transcribe Command # ============================================================================= + +def _find_narration_video(config, videos: dict) -> Optional[tuple[str, "VideoSource"]]: + """ + Find the video to use for transcription/narration. + + Priority: + 1. config.audio_source if set + 2. 
First video with always_visible=True + 3. First video in dict + """ + from .models import VideoSource + + # 1. Check audio_source config + if config.audio_source and config.audio_source in videos: + return config.audio_source, videos[config.audio_source] + + # 2. Find always_visible video (main talking head) + for video_id, video_source in videos.items(): + if video_source.always_visible: + return video_id, video_source + + # 3. Fall back to first video + if videos: + video_id = next(iter(videos.keys())) + return video_id, videos[video_id] + + return None + + def cmd_transcribe(project_path: Path, verbose: bool) -> int: """Transcribe video audio using Whisper.""" from .transcriber import transcribe_video, save_transcript - from .parser import parse_videos + from .parser import parse_project_config, parse_videos print(f"Transcribing: {project_path.name}") - videos = parse_videos(project_path) + config = parse_project_config(project_path) + videos, videos_dir = parse_videos(project_path, config) if not videos: print("Error: No videos defined in videos.json", file=sys.stderr) return 1 - # Use first video - video_id = next(iter(videos.keys())) - video_source = videos[video_id] - video_path = project_path / video_source.file + # Find the narration video + result = _find_narration_video(config, videos) + if not result: + print("Error: No suitable video found for transcription", file=sys.stderr) + return 1 + + video_id, video_source = result + video_path = videos_dir / video_source.source_file if not video_path.exists(): print(f"Error: Video not found: {video_path}", file=sys.stderr) @@ -374,59 +980,87 @@ def cmd_transcribe(project_path: Path, verbose: bool) -> int: # Align Command # ============================================================================= + def cmd_align(project_path: Path, verbose: bool) -> int: - """Align manuscript markers to transcript timestamps.""" + """Preview manuscript marker alignment (no files written).""" from .transcriber import 
load_transcript - from .aligner import align_markers, save_aligned_transcript - from .parser import parse_videos + from .transformer import align_markers_to_transcription + from .parser import parse_project_config, parse_videos, parse_slides, parse_audio, parse_manuscript, save_citations - print(f"Aligning: {project_path.name}") + print(f"Alignment preview: {project_path.name}") + print(" (This is a preview - alignment happens automatically during render)") - # Load manuscript - manuscript_path = project_path / "manuscript.txt" - if not manuscript_path.exists(): - print(f"Error: manuscript.txt not found", file=sys.stderr) + # Load manuscript (cites are stripped at parse time) + manuscript_text, _, _, citations = parse_manuscript(project_path) + + # Save citations for later use (e.g., description generation) + if citations: + citations_path = project_path / "citations.json" + save_citations(citations, citations_path) + + # Load project config and resources + config = parse_project_config(project_path) + slides = parse_slides(project_path, config) + videos, videos_dir = parse_videos(project_path, config) + audio, _ = parse_audio(project_path, config) + + # Find transcription (from narration video) + result = _find_narration_video(config, videos) + if not result: + print("Error: No suitable video found for transcription", file=sys.stderr) return 1 - manuscript_text = manuscript_path.read_text(encoding="utf-8") - - # Find transcript - videos = parse_videos(project_path) - video_id = next(iter(videos.keys())) - video_source = videos[video_id] - video_path = project_path / video_source.file + video_id, video_source = result + video_path = videos_dir / video_source.source_file transcript_path = video_path.with_suffix(".transcript.json") if not transcript_path.exists(): - print(f"Error: Transcript not found: {transcript_path}", file=sys.stderr) - print("Run with -a transcribe first.", file=sys.stderr) + print(f"Error: Transcription not found: {transcript_path}", 
file=sys.stderr) + print(f"Run 'gnommo -p {project_path.name} transcribe' first.", file=sys.stderr) return 1 print(f" Loading: {transcript_path.name}") - transcript = load_transcript(transcript_path) - print(f" - {len(transcript)} words") + transcription = load_transcript(transcript_path) + print(f" - {len(transcription)} words") - # Align - print(" Aligning markers...") - alignments = align_markers(manuscript_text, transcript, offset_seconds=-1.0) + # Align (cite markers already stripped at parse time) + print("\n Aligning markers to transcription...") + timings = align_markers_to_transcription( + manuscript_text, transcription, slides=slides, videos=videos, audio=audio + ) - # Report + # Report alignment results unmatched = 0 - for a in alignments: - if a.timestamp >= 0: - if verbose: - print(f" [{a.marker_id}] @ {a.timestamp:.1f}s") + fuzzy_matched = 0 + exact_matched = 0 + + for t in timings: + if t.timestamp >= 0: + if t.confidence >= 1.0: + exact_matched += 1 + if verbose: + print(f" [{t.marker_id}] @ {_format_time(t.timestamp)}") + else: + fuzzy_matched += 1 + # Always show fuzzy matches so user can verify + print( + f" [{t.marker_id}] @ {_format_time(t.timestamp)} (fuzzy {t.confidence:.0%})" + ) else: - print(f" [{a.marker_id}] NOT FOUND") + print(f' [{t.marker_id}] NOT FOUND - "{t.context}"') unmatched += 1 + # Summary + total = len(timings) + print(f"\n Alignment summary:") + print(f" - Exact matches: {exact_matched}/{total}") + if fuzzy_matched > 0: + print(f" - Fuzzy matches (60%+ words): {fuzzy_matched}/{total}") if unmatched > 0: - print(f"\n Warning: {unmatched} markers not aligned") - - # Save - output_path = project_path / "transcript.csv" - save_aligned_transcript(alignments, transcript, output_path) - print(f"\n Saved: {output_path}") + print(f" - NOT FOUND: {unmatched}/{total}") + print( + f"\n Some markers could not be aligned. Check manuscript.txt matches the spoken audio." 
def cmd_all(
    project_path: Path,
    verbose: bool,
    dry_run: bool,
    res: str = "full",
    force: bool = False,
) -> int:
    """
    Run the full pipeline: transcribe (unless already cached), then render.

    Alignment is not a separate step; it happens inside the render command.

    Args:
        project_path: Project directory.
        verbose: Passed through to the transcribe and render commands.
        dry_run: Passed through to the render command.
        res: Resolution mode passed through to the render command.
        force: Passed through to the render command.

    Returns:
        The exit code of the transcribe step if it fails, otherwise the exit
        code of the render step.
    """
    from .parser import parse_project_config, parse_videos

    print(f"=== Full Pipeline: {project_path.name} ===\n")

    # Locate the narration video; its transcript lives next to it on disk
    config = parse_project_config(project_path)
    videos, videos_dir = parse_videos(project_path, config)
    narration = _find_narration_video(config, videos)

    if narration:
        _, narration_source = narration
        narration_path = videos_dir / narration_source.source_file
        transcript_path = narration_path.with_suffix(".transcript.json")

        if transcript_path.exists():
            print(f">>> Step 1/2: Transcribe (cached: {transcript_path.name})\n")
        else:
            print(">>> Step 1/2: Transcribe\n")
            exit_code = cmd_transcribe(project_path, verbose)
            if exit_code != 0:
                return exit_code

    # Render (alignment happens automatically)
    print("\n>>> Step 2/2: Render\n")
    return cmd_render(project_path, verbose, dry_run, res=res, force=force)
============================================================================= +# Description Command +# ============================================================================= + + +def cmd_description(project_path: Path, verbose: bool) -> int: + """Generate YouTube description file with chapters, citations, and attributions.""" + from .parser import ( + parse_audio, + parse_manuscript, + parse_project_config, + parse_slides, + parse_videos, + load_citations, + ) + from .transcriber import load_transcript + from .transformer import align_markers_to_transcription + from .description import write_description_file + + print(f"Generating description: {project_path.name}") + + # Parse all project files + manuscript_text, markers, _, _ = parse_manuscript(project_path) + + # Load citations from file (saved during parse/render/align stages) + citations_path = project_path / "citations.json" + citations = load_citations(citations_path) + config = parse_project_config(project_path) + slides = parse_slides(project_path, config) + videos, videos_dir = parse_videos(project_path, config) + audio, _ = parse_audio(project_path, config) + + # Load transcription for alignment (optional but recommended) + transcription = None + result = _find_narration_video(config, videos) + if result: + _, narration_source = result + video_path = videos_dir / narration_source.source_file + transcript_path = video_path.with_suffix(".transcript.json") + if transcript_path.exists(): + transcription = load_transcript(transcript_path) + if verbose: + print(f" Loaded transcription: {len(transcription)} words") + else: + print(f" Warning: No transcription found at {transcript_path}") + print(f" Run 'gnommo -p {project_path.name} transcribe' for better timestamps.") + + # Align markers to get timings + print(" Aligning markers...") + marker_timings = align_markers_to_transcription( + manuscript_text, + transcription or [], + slides=slides, + videos=videos, + audio=audio, + ) + + if verbose: + aligned 
= sum(1 for t in marker_timings if t.timestamp >= 0) + print(f" Aligned {aligned}/{len(marker_timings)} markers") + + # Generate description + output_path = project_path / "out" / "description_youtube.txt" + description = write_description_file( + output_path=output_path, + config=config, + manuscript_text=manuscript_text, + slides=slides, + videos=videos, + marker_timings=marker_timings, + transcription=transcription, + citations=citations, + ) + + # Print summary + lines = description.split("\n") + print(f"\n Output: {output_path}") + print(f" Length: {len(description)} characters, {len(lines)} lines") + + # Show sections found + sections = [] + if config.description: + sections.append("description") + if "CHAPTERS" in description: + sections.append("chapters") + if "REFERENCES" in description: + sections.append("references") + if "STOCK FOOTAGE" in description: + sections.append("attributions") + if config.footer: + sections.append("footer") + + print(f" Sections: {', '.join(sections)}") + + if verbose: + print("\n --- Preview ---") + preview_lines = lines[:20] + for line in preview_lines: + print(f" {line}") + if len(lines) > 20: + print(f" ... 
({len(lines) - 20} more lines)") + + print("\nDone.") + return 0 if __name__ == "__main__": diff --git a/gnommo/description.py b/gnommo/description.py new file mode 100644 index 0000000..1c88576 --- /dev/null +++ b/gnommo/description.py @@ -0,0 +1,359 @@ +"""Description generator: Create YouTube description with chapters, citations, and attributions.""" + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from .models import ( + Attribution, + Citation, + ProjectConfig, + SlideDefinition, + VideoSource, +) +from .transcriber import TranscribedWord + + +@dataclass +class ChapterMarker: + """A chapter marker with timestamp and title.""" + + slide_id: str + timestamp: float + title: str + + +def _format_timestamp(seconds: float) -> str: + """Format seconds as M:SS or H:MM:SS for YouTube chapters.""" + if seconds < 0: + return "0:00" + + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + + if hours > 0: + return f"{hours}:{minutes:02d}:{secs:02d}" + else: + return f"{minutes}:{secs:02d}" + + +def _extract_chapter_title( + manuscript_text: str, slide_id: str, slides: dict[str, SlideDefinition] +) -> str: + """ + Extract a chapter title for a slide. + + Tries to find meaningful title from: + 1. First sentence/line after the slide marker + 2. 
Falls back to slide ID if nothing useful found + """ + # Find the marker and text after it + pattern = rf"\[{re.escape(slide_id)}\]\s*(.+?)(?=\[S\d+\]|\[video:|\[narration:|\Z)" + match = re.search(pattern, manuscript_text, re.DOTALL) + + if match: + text = match.group(1).strip() + # Remove any other markers from the text + text = re.sub(r"\[[^\]]+\]", "", text).strip() + + if text: + # Take first line or first sentence + first_line = text.split("\n")[0].strip() + # Truncate if too long + if len(first_line) > 50: + # Try to break at word boundary + truncated = first_line[:47] + last_space = truncated.rfind(" ") + if last_space > 30: + truncated = truncated[:last_space] + first_line = truncated + "..." + + if first_line: + return first_line + + # Fallback to slide number + slide_num = slide_id[1:] if slide_id.startswith("S") else slide_id + return f"Section {slide_num}" + + +def _align_citation_to_transcription( + citation: Citation, + transcription: list[TranscribedWord], + manuscript_text: str, +) -> float: + """ + Align a citation to the transcription to find its timestamp. + + Uses the context text following the citation to find the approximate + position in the audio. + + Returns timestamp in seconds, or -1 if not found. 
+ """ + if not transcription or not citation.context: + return -1.0 + + # Get more context from the manuscript for better matching + # Find the citation in the manuscript and get surrounding text + pattern = rf"\[cite:{re.escape(citation.reference)}\]\s*(.{{0,200}})" + match = re.search(pattern, manuscript_text, re.DOTALL) + + if not match: + return -1.0 + + context_text = match.group(1).strip() + # Clean up: remove markers, normalize whitespace + context_text = re.sub(r"\[[^\]]+\]", "", context_text) + context_text = " ".join(context_text.split()) + + if not context_text: + return -1.0 + + # Normalize for matching + context_words = context_text.lower().split()[:10] # Use up to 10 words + if not context_words: + return -1.0 + + # Build normalized transcription + trans_words = [(w.word.lower(), w.start) for w in transcription] + + # Simple sliding window match + best_match_score = 0 + best_match_time = -1.0 + + for i in range(len(trans_words) - len(context_words) + 1): + matches = 0 + for j, ctx_word in enumerate(context_words): + trans_word = trans_words[i + j][0] + # Allow partial matches for longer words + if ctx_word == trans_word: + matches += 1 + elif len(ctx_word) >= 4 and ( + ctx_word in trans_word or trans_word in ctx_word + ): + matches += 0.5 + + score = matches / len(context_words) + if score > best_match_score and score >= 0.5: + best_match_score = score + best_match_time = trans_words[i][1] + + return best_match_time + + +def generate_chapters( + manuscript_text: str, + slides: dict[str, SlideDefinition], + marker_timings: list, # List of MarkerTiming from transformer + min_chapter_duration: float = 30.0, +) -> list[ChapterMarker]: + """ + Generate chapter markers from slide timings. 
def collect_attributions(
    videos: dict[str, VideoSource],
    video_events: list = None,
) -> list[tuple[str, Attribution]]:
    """
    Gather the attributions that belong in the description.

    A video contributes an entry when it carries an attribution and is either
    referenced by one of ``video_events`` (by ``video_id``) or flagged as a
    shared asset (``is_shared``).

    Args:
        videos: Video definitions keyed by video ID.
        video_events: Optional events whose ``video_id`` marks a video as used.

    Returns:
        ``(video_id, attribution)`` tuples in the iteration order of ``videos``.
    """
    # IDs referenced by at least one event; empty when no events were given
    referenced = {event.video_id for event in video_events} if video_events else set()

    return [
        (video_id, source.attribution)
        for video_id, source in videos.items()
        if source.attribution and (video_id in referenced or source.is_shared)
    ]
def write_description_file(
    output_path: Path,
    config: ProjectConfig,
    manuscript_text: str,
    slides: dict[str, SlideDefinition],
    videos: dict[str, VideoSource],
    marker_timings: list,
    transcription: list[TranscribedWord] = None,
    video_events: list = None,
    citations: list[Citation] = None,
) -> str:
    """
    Build the YouTube description and save it as UTF-8 text.

    The text is generated first; the parent directory of ``output_path`` is
    created only afterwards, so a failure during generation leaves the
    filesystem untouched.

    Args:
        output_path: Destination file (e.g., out/description_youtube.txt)
        config: Project configuration
        manuscript_text: Manuscript content
        slides: Slide definitions
        videos: Video definitions
        marker_timings: Aligned marker timings
        transcription: Word-level transcription (optional)
        video_events: Video events from the render plan (optional)
        citations: Pre-extracted citations (optional)

    Returns:
        The generated description text, exactly as written to disk.
    """
    text = generate_description(
        config=config,
        manuscript_text=manuscript_text,
        slides=slides,
        videos=videos,
        marker_timings=marker_timings,
        transcription=transcription,
        video_events=video_events,
        citations=citations,
    )

    # Make sure the destination folder is there before writing
    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    output_path.write_text(text, encoding="utf-8")

    return text
/**
 * JXA entry point: open a Keynote deck, optionally export its slides as PNG
 * images, and return the presenter notes as a JSON string.
 *
 * argv[0] - path to the .key file (required)
 * argv[1] - directory to export slide images into (optional)
 *
 * Returns a JSON array of {slide_index, notes}; slide_index is 1-based.
 * Throws if no path is given or the deck does not exist.
 */
function run(argv) {
    if (!argv || argv.length < 1) throw new Error("Usage: script.js [slides_output_dir]");
    var abs = toAbsolutePath(argv[0]);
    var slidesDir = argv.length >= 2 ? toAbsolutePath(argv[1]) : null;

    if (!fileExists(abs)) {
        throw new Error("File not found: " + abs);
    }

    // Quote a string as a single /bin/sh word (handles embedded single quotes)
    function shellQuote(s) {
        return "'" + String(s).replace(/'/g, "'\\''") + "'";
    }

    // Escape a string for use inside an AppleScript double-quoted literal
    function appleScriptEscape(s) {
        return String(s).replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    }

    var Keynote = Application('Keynote');
    Keynote.activate();

    // Keynote is happiest when given a Path() made from an absolute POSIX path
    var doc = Keynote.open(Path(abs));

    var result;
    try {
        // Export slides as PNG if output directory is provided
        if (slidesDir) {
            // Create directory if it doesn't exist
            var fm = $.NSFileManager.defaultManager;
            if (!fm.fileExistsAtPath($(slidesDir))) {
                fm.createDirectoryAtPathWithIntermediateDirectoriesAttributesError(
                    $(slidesDir), true, $(), $()
                );
            }

            // Export using AppleScript (more reliable than JXA for Keynote export)
            var app = Application.currentApplication();
            app.includeStandardAdditions = true;

            // Each AppleScript line is passed as its own -e argument. The path is
            // escaped twice: once for the AppleScript string literal, then the
            // whole line is quoted for the shell, so quotes in the path cannot
            // break out of either layer.
            var scriptLines = [
                'tell application "Keynote"',
                'export front document to POSIX file "' + appleScriptEscape(slidesDir) +
                    '" as slide images with properties {image format:PNG}',
                'end tell'
            ];
            var cmd = '/usr/bin/osascript';
            for (var k = 0; k < scriptLines.length; k++) {
                cmd += ' -e ' + shellQuote(scriptLines[k]);
            }

            app.doShellScript(cmd);
        }

        var slides = doc.slides();
        var out = [];
        for (var i = 0; i < slides.length; i++) {
            out.push({
                slide_index: i + 1,
                notes: String(getNotes(slides[i]) || "")
            });
        }
        result = JSON.stringify(out, null, 2);
    } catch (err) {
        // Do not leave the deck open when export or note extraction fails
        try { doc.close({ saving: 'no' }); } catch (ignored) {}
        throw err;
    }

    doc.close({ saving: 'no' });
    return result;
}
def write_manuscript(data: Path, out_path: Path):
    """
    Convert extracted presenter notes (JSON) into a manuscript text file.

    ``data`` is a JSON file holding a list of ``{"slide_index": int,
    "notes": str}`` objects. Each entry becomes a ``[S<slide_index>]`` marker
    line, the notes with trailing whitespace removed, and a blank separator
    line. The output ends with exactly one newline.

    Args:
        data: Path to the JSON notes file.
        out_path: Path of the manuscript file to write.
    """
    entries = json.loads(data.read_text(encoding="utf-8"))

    chunks = []
    for position, entry in enumerate(entries):
        print(f"Writing notes for slide {position} to file")
        marker = f"[S{entry.get('slide_index')}]"
        body = (entry.get("notes") or "").rstrip()
        # marker, notes, then a blank line between slides
        chunks.extend((marker, body, ""))

    out_path.write_text("\n".join(chunks).rstrip() + "\n", encoding="utf-8")
    print(f"Wrote {out_path}")
--- a/gnommo/models.py +++ b/gnommo/models.py @@ -6,31 +6,64 @@ from typing import Optional @dataclass -class TalkingHeadConfig: - """Configuration for talking head video positioning.""" - x: int - y: int - target_height: int # in pixels, or -1 for percentage-based - target_height_percent: float = 0.0 # percentage (0.0-1.0) if target_height is -1 - file: Optional[str] = None # Path to video or metadata JSON file +class CutoutDefinition: + """Definition of a named zone for placing video content. + + All positioning values support both pixels (int) and percentages (str like "50%"). + Percentage values are stored as floats (0.0-1.0) with pixel value set to -1. + + Videos placed in cutouts are cropped to fit the cutout dimensions. + """ + + x: int # in pixels, or -1 for percentage-based + y: int # in pixels, or -1 for percentage-based + height: int # in pixels, or -1 for percentage-based + width: int = ( + -1 + ) # in pixels, or -1 for percentage-based (defaults to height for square) + x_percent: float = 0.0 # percentage (0.0-1.0) if x is -1 + y_percent: float = 0.0 # percentage (0.0-1.0) if y is -1 + height_percent: float = 0.0 # percentage (0.0-1.0) if height is -1 + width_percent: float = 0.0 # percentage (0.0-1.0) if width is -1 + + +# Backwards compatibility alias +TalkingHeadConfig = CutoutDefinition @dataclass class ProjectConfig: """Global project configuration from project.json.""" + resolution: tuple[int, int] fps: int - talking_head: TalkingHeadConfig default_slide_type: str + cutouts: dict[str, CutoutDefinition] = field( + default_factory=dict + ) # Named zones for video placement background: str = "" # Background image or video path (in shared_assets/) background_video: str = "" # Deprecated: use background instead slides_path: str = "slides.json" # path to slides.json relative to project + videos_path: str = "videos.json" # path to videos.json relative to project + audio_path: str = "audio.json" # path to audio.json relative to project audio_source: 
Optional[str] = None # defaults to talking head + main_video: Optional[str] = None # ID of main video (e.g., talking head) + gnommo_scratch: Optional[ + str + ] = None # directory for intermediate files (e.g., external SSD) + # Outro sequence - plays after narration ends (not marker-triggered) + outro: list[str] = field( + default_factory=list + ) # List of video IDs to play in sequence after narration + # YouTube description fields + description: str = "" # Video description text for YouTube + footer: str = "" # Footer text (social links, subscribe CTA, etc.) @dataclass class SlideDefinition: """Definition of a single slide from slides.json.""" + image: str type: str # "fullscreen" | "square" @@ -38,25 +71,170 @@ class SlideDefinition: @dataclass class ChromaKeyConfig: """Configuration for chroma key (green screen) filter.""" + color: tuple[int, int, int] = (0, 255, 0) # RGB color to key out - similarity: float = 0.15 # Color similarity threshold (0.0-1.0) - blend: float = 0.1 # Edge blend/feathering (0.0-1.0) - spill: float = 0.0 # Spill suppression amount (0.0-1.0) + similarity: float = ( + 0.4 # Color similarity threshold (0.0-1.0), higher = more aggressive + ) + blend: float = 0.08 # Edge blend/feathering (0.0-1.0), lower = tighter edges + spill: float = 0.1 # Spill suppression amount (0.0-1.0) + edge_erode: int = 0 # Pixels to erode from alpha edge (0-5), removes green fringe + # Color protection - restore opacity for colors that shouldn't be keyed + protect_color: tuple[int, int, int] = None # RGB color to protect from keying + protect_tolerance: float = ( + 0.15 # How much variation from protect_color to allow (0-1) + ) + + +@dataclass +class GnommoKeyConfig: + """Configuration for gnommokey filter - Keylight-style color-difference keyer. + + Uses YCbCr color-difference keying (like Keylight/Ultimatte) instead of + simple Euclidean distance. This handles lighting variation much better + than basic chromakey. 
+ """ + + # Screen color (the green/blue screen color to key out) + screen_color: tuple[int, int, int] = (0, 177, 64) # RGB of the screen + + # Key extraction strength (default 100, higher = more aggressive) + # Values 80-150 are typical. Maps to Keylight's Screen Gain. + screen_gain: float = 100.0 + + # Balance between chrominance and luminance in key calculation (0-100) + # 0 = pure color-difference, 100 = luminance weighted + # Maps to Keylight's Screen Balance. + screen_balance: float = 50.0 + + # Alpha/matte adjustments + clip_black: float = 0.0 # Crush blacks (0-100). Higher = more transparent areas + clip_white: float = 100.0 # Crush whites (0-100). Lower = more opaque areas + + # Despill: color to shift green spill toward (RGB) + # Typical values: skin tone [217, 200, 180] or neutral [200, 200, 200] + despill_bias: tuple[int, int, int] = None + + # How aggressively to apply despill (0-1) + despill_strength: float = 0.5 + + # Alpha bias: influences edge treatment (RGB) + # Can help with edge color contamination + alpha_bias: tuple[int, int, int] = None + + # Edge refinement + edge_erode: int = 0 # Pixels to erode from alpha edge (0-5) + edge_soften: float = 0.0 # Blur the alpha edge (0-5 pixels) + + +@dataclass +class ColorGradeConfig: + """Configuration for color grading filter. + + Applies color balance, contrast curves, and saturation adjustments + while preserving the alpha channel. 
+ """ + + # Color balance (range: -1.0 to 1.0, 0 = no change) + # Midtones + rm: float = 0.0 # Red midtones adjustment + gm: float = 0.0 # Green midtones adjustment + bm: float = 0.0 # Blue midtones adjustment + # Highlights + rh: float = 0.0 # Red highlights adjustment + gh: float = 0.0 # Green highlights adjustment + bh: float = 0.0 # Blue highlights adjustment + # Shadows + rs: float = 0.0 # Red shadows adjustment + gs: float = 0.0 # Green shadows adjustment + bs: float = 0.0 # Blue shadows adjustment + + # Curves preset (none, lighter, darker, increase_contrast, medium_contrast, etc.) + curves_preset: str = "none" + + # EQ adjustments + contrast: float = 1.0 # Contrast multiplier (0.0-2.0, 1.0 = no change) + brightness: float = 0.0 # Brightness adjustment (-1.0 to 1.0, 0 = no change) + saturation: float = 1.0 # Saturation multiplier (0.0-3.0, 1.0 = no change) + + # Custom curves for lift/gamma/gain control + # Format: "0/0 0.5/0.56 1/1" means (input/output) control points + curves_r: str = "" # Red channel curve + curves_g: str = "" # Green channel curve + curves_b: str = "" # Blue channel curve + curves_master: str = "" # Master (luminance) curve + + +@dataclass +class AudioNormalizeConfig: + """Configuration for audio normalization filter. + + Applies noise reduction, compression, and loudness normalization + to improve audio quality and consistency. 
+ """ + + # Noise reduction (afftdn filter) + denoise: bool = True # Enable noise reduction + noise_floor: float = -25.0 # Noise floor in dB (default -25, lower = more aggressive) + + # Compression (acompressor filter) + compress: bool = True # Enable dynamic range compression + threshold: float = -20.0 # Compression threshold in dB + ratio: float = 4.0 # Compression ratio (4:1 default) + attack: float = 5.0 # Attack time in ms + release: float = 50.0 # Release time in ms + makeup: float = 2.0 # Makeup gain in dB + + # Loudness normalization (loudnorm filter - EBU R128) + normalize: bool = True # Enable loudness normalization + target_lufs: float = -16.0 # Target integrated loudness (YouTube recommends -14 to -16) + target_lra: float = 11.0 # Target loudness range + target_tp: float = -1.5 # Target true peak in dB @dataclass class FilterConfig: """Base configuration for a preprocessing filter.""" + type: str # Type-specific config stored in subclasses or as dict +@dataclass +class Attribution: + """Attribution information for stock footage (e.g., Pexels).""" + + source: str # Source platform (e.g., "pexels", "pixabay", "unsplash") + creator: str # Creator/photographer name + url: Optional[str] = None # URL to the original content + + @dataclass class VideoSource: """Video source definition from videos.json.""" - file: str - preprocess: list[dict] = field(default_factory=list) # List of filter config dicts - output_file: Optional[str] = None # Path to preprocessed output (if any) + + source_file: str # Source video filename (relative to videos.json location or shared_assets/) + filter: list[dict] = field(default_factory=list) # List of filter config dicts + output_file: Optional[ + str + ] = None # Path to preprocessed output (relative to videos.json) + take: Optional[ + float + ] = None # Max duration to play (seconds). 
Default: until next slide or end of clip + skip: float = 0.0 # Skip this many seconds at start of video (seek point) + zoom: float = ( + 1.0 # Scale factor for video (1.0 = fit to cutout height, >1 = enlarge) + ) + cutout: Optional[ + str + ] = None # Name of cutout to place video in (from project.json cutouts) + always_visible: bool = False # If True, video is always shown (like talking head) + is_shared: bool = False # If True, source_file is relative to shared_assets/ + pause_narration: float = ( + 0.0 # Seconds to pause narration during this video (0 = no pause) + ) + attribution: Optional[Attribution] = None # Attribution for stock footage + use_audio_channels: str = "both" # Audio channel selection: "both", "left", or "right" @dataclass @@ -67,50 +245,202 @@ class VideoMetadata: This allows defining preprocessing steps separately from videos.json, enabling per-video preprocessing configuration. """ + source_file: str # Original source video file preprocess: list[dict] = field(default_factory=list) # Preprocessing filters - output: Optional[dict] = None # Output config {"file": "...", "colorspace": "...", "alpha": "..."} - - -@dataclass -class TimedWord: - """A word or marker with its timestamp from transcript.csv.""" - time: float - word: str - - @property - def is_marker(self) -> bool: - """Check if this is a slide marker like [S1].""" - return self.word.startswith("[") and self.word.endswith("]") - - @property - def marker_id(self) -> Optional[str]: - """Extract marker ID (e.g., 'S1' from '[S1]').""" - if self.is_marker: - return self.word[1:-1] - return None + output: Optional[ + dict + ] = None # Output config {"file": "...", "colorspace": "...", "alpha": "..."} @dataclass class SlideEvent: """A resolved slide event with timing information.""" + slide_id: str start_time: float end_time: float slide_def: SlideDefinition +@dataclass +class AudioDefinition: + """Definition of an audio clip from audio.json.""" + + file: str # Audio filename (relative to 
audio.json location) + volume: float = 1.0 # Volume multiplier (0.0-1.0) + loop: bool = False # If True, loop for entire duration from trigger point + ignore_pauses: bool = False # If True, audio continues playing during narration pauses + + +@dataclass +class Citation: + """A citation extracted from manuscript.txt [cite:...] markers.""" + + reference: str # The literal reference text after cite: + marker_id: str # The full marker (e.g., "cite:Smith et al...") + timestamp: float = -1.0 # Aligned timestamp (-1 if not aligned) + context: str = "" # Text following the citation for alignment + + +@dataclass +class AudioEvent: + """A resolved audio event with timing information.""" + + audio_id: str + start_time: float # When to start playing (marker time - offset) + audio_def: AudioDefinition + + +@dataclass +class VideoEvent: + """A resolved video event with timing information.""" + + video_id: str + start_time: float + end_time: float + video_source: "VideoSource" + cutout: "CutoutDefinition" + + +@dataclass +class CameraState: + """State of the virtual camera at a point in time. + + The camera transforms the entire composed scene (background, slides, cutouts). + This ensures all elements stay spatially synchronized when zooming/tilting. + """ + + zoom: float = 1.0 # 1.0 = 100%, 1.25 = 125%, etc. 
+ rotation: float = 0.0 # degrees, positive = clockwise + pan_x: float = 0.0 # -1.0 to 1.0, percentage of frame width + pan_y: float = 0.0 # -1.0 to 1.0, percentage of frame height + focal_x: float = 0.5 # 0.0 to 1.0, zoom focal point X (0.5 = center) + focal_y: float = 0.5 # 0.0 to 1.0, zoom focal point Y (0.5 = center) + + def __post_init__(self): + # Clamp values to reasonable ranges + self.zoom = max(0.5, min(3.0, self.zoom)) + self.rotation = max(-45.0, min(45.0, self.rotation)) + self.pan_x = max(-1.0, min(1.0, self.pan_x)) + self.pan_y = max(-1.0, min(1.0, self.pan_y)) + self.focal_x = max(0.0, min(1.0, self.focal_x)) + self.focal_y = max(0.0, min(1.0, self.focal_y)) + + def is_default(self) -> bool: + """Check if this is the default camera state (no transform).""" + return ( + self.zoom == 1.0 + and self.rotation == 0.0 + and self.pan_x == 0.0 + and self.pan_y == 0.0 + and self.focal_x == 0.5 + and self.focal_y == 0.5 + ) + + +@dataclass +class CameraEvent: + """A camera state change at a specific time. + + Camera events can be instant (duration=0) or animated (duration>0). + When animated, the camera smoothly transitions from its current state + to the target state over the specified duration using the easing function. 
+ """ + + time: float # timestamp in seconds + target_state: CameraState + duration: float = 0.2 # transition duration (0 = instant snap) + easing: str = "ease-out" # linear, ease-in, ease-out, ease-in-out + + +# Camera effect presets - map marker names to camera states +# Effect strengths are intentionally subtle for professional look +CAMERA_PRESETS: dict[str, CameraState] = { + # Zoom levels (halved for subtlety) + "Zoom0": CameraState(zoom=1.0), + "Zoom1": CameraState(zoom=1.05), + "Zoom2": CameraState(zoom=1.125), + "Zoom3": CameraState(zoom=1.25), + # Tilt/rotation (halved) + "TiltLeft": CameraState(rotation=-7.5), + "TiltRight": CameraState(rotation=7.5), + "NoTilt": CameraState(), # Full reset to default state + # Pan (halved) + "PanLeft": CameraState(pan_x=-0.1), + "PanRight": CameraState(pan_x=0.1), + "PanUp": CameraState(pan_y=-0.075), + "PanDown": CameraState(pan_y=0.075), + "PanCenter": CameraState(pan_x=0.0, pan_y=0.0), + # Reset all + "Reset": CameraState(), +} + + +@dataclass +class NarrationPause: + """A pause in the narration timeline for an interstitial video.""" + + output_time: float # When the pause starts in the OUTPUT timeline + narration_time: float # Where we are in the NARRATION source when pause starts + duration: float # How long the pause lasts + video_id: str # The video that plays during the pause + + +@dataclass +class OutroEvent: + """A video that plays as part of the outro sequence (after narration ends).""" + + video_id: str + start_time: float # When this outro video starts (in output timeline) + end_time: float # When this outro video ends + video_source: "VideoSource" + cutout: Optional["CutoutDefinition"] = None # None = fullscreen + + @dataclass class RenderPlan: """Complete plan for rendering the final video.""" + project_path: Path config: ProjectConfig - talking_head: VideoSource slide_events: list[SlideEvent] total_duration: float slides: dict[str, SlideDefinition] + videos: dict[str, VideoSource] = 
field(default_factory=dict) + video_events: list[VideoEvent] = field( + default_factory=list + ) # Triggered video overlays + narration_videos: list[tuple[str, VideoSource, CutoutDefinition]] = field( + default_factory=list + ) # (video_id, source, cutout) slides_dir: Path = None # directory containing slide images - talking_head_path: Path = None # Resolved path to actual video file + videos_dir: Path = None # directory containing videos.json and video files + audio_events: list[AudioEvent] = field(default_factory=list) + audio: dict[str, AudioDefinition] = field(default_factory=dict) + audio_dir: Path = None # directory containing audio.json and audio files + camera_events: list[CameraEvent] = field( + default_factory=list + ) # Virtual camera keyframes + # Partial rendering support + time_offset: float = ( + 0.0 # Offset subtracted from all timestamps (for partial render) + ) + initial_camera_state: "CameraState" = ( + None # Camera state at render start (for partial render) + ) + input_seek_time: float = 0.0 # Seek position for input videos (for partial render) + # Shared assets support + shared_assets_dir: Path = None # Directory containing shared assets (pexels, etc.) 
+ # Narration pause support + narration_pauses: list[NarrationPause] = field( + default_factory=list + ) # Gaps in narration for interstitial videos + # Outro sequence (plays after narration ends) + outro_events: list["OutroEvent"] = field( + default_factory=list + ) # Videos that play after narration ends + narration_end_time: float = 0.0 # When narration ends (before outro starts) # Slide layout configurations (hardcoded for POC) diff --git a/gnommo/parser.py b/gnommo/parser.py index 9367efe..5ef05bb 100644 --- a/gnommo/parser.py +++ b/gnommo/parser.py @@ -1,6 +1,5 @@ """Extract stage: parse all input files.""" -import csv import json import re from pathlib import Path @@ -8,21 +7,28 @@ from typing import Any, Optional from .errors import ParseError from .models import ( + Attribution, + AudioDefinition, + Citation, + CutoutDefinition, ProjectConfig, SlideDefinition, - TalkingHeadConfig, - TimedWord, VideoMetadata, VideoSource, ) -def parse_manuscript(project_path: Path) -> tuple[str, list[str], list[tuple[int, str]]]: +def parse_manuscript( + project_path: Path, +) -> tuple[str, list[str], list[tuple[int, str]], list[Citation]]: """ Parse manuscript.txt and extract text content and slide markers. + Strips [cite:...] markers from the returned text so they never pollute + alignment contexts. Citations are extracted and returned separately. + Returns: - Tuple of (full text, list of marker IDs found, list of malformed markers as (line_num, text)) + Tuple of (full text, list of marker IDs found, list of malformed markers, list of citations) """ manuscript_path = project_path / "manuscript.txt" @@ -31,8 +37,15 @@ def parse_manuscript(project_path: Path) -> tuple[str, list[str], list[tuple[int text = manuscript_path.read_text(encoding="utf-8") - # Extract all valid slide markers like [S1], [S2], etc. - markers = re.findall(r"\[([A-Za-z0-9_]+)\]", text) + # Extract citations before stripping them + citations = parse_citations(text) + + # Strip [cite:...] 
markers from text so they don't pollute alignment + text = re.sub(r"\[cite:[^\]]+\]", "", text) + + # Extract all valid markers like [S1], [video:demo], [Zoom2], etc. + # Include . in pattern to catch markers with file extensions (so validator can warn about them) + markers = re.findall(r"\[([A-Za-z0-9_:.]+)\]", text) # Find malformed markers (missing brackets, extra spaces, etc.) malformed: list[tuple[int, str]] = [] @@ -56,48 +69,75 @@ def parse_manuscript(project_path: Path) -> tuple[str, list[str], list[tuple[int for match in spaced: malformed.append((line_num, match)) - return text, markers, malformed + return text, markers, malformed, citations -def parse_transcript(project_path: Path) -> list[TimedWord]: +def parse_citations(manuscript_text: str) -> list[Citation]: """ - Parse transcript.csv into a list of timed words. + Extract all [cite:...] markers from manuscript text. - Expected format: - t,word - 0.00,This - 0.42,is - ... + The text after 'cite:' is the literal reference that should appear + in the video description. + + Returns: + List of Citation objects with reference text and context for alignment. """ - transcript_path = project_path / "transcript.csv" + citations = [] - if not transcript_path.exists(): - raise ParseError("transcript.csv not found", transcript_path) + # Match [cite:...] 
markers - content can include any characters except ] + # Use a more permissive pattern that handles multi-word citations + pattern = r"\[cite:([^\]]+)\]" - timed_words = [] + for match in re.finditer(pattern, manuscript_text): + reference = match.group(1).strip() + marker_id = f"cite:{reference}" - with open(transcript_path, "r", encoding="utf-8") as f: - reader = csv.DictReader(f) + # Extract context: text following the citation (for alignment) + # Get up to 100 chars after the marker, stopping at next marker or newline + end_pos = match.end() + context_text = manuscript_text[end_pos : end_pos + 150] - if reader.fieldnames is None or "t" not in reader.fieldnames or "word" not in reader.fieldnames: - raise ParseError( - "transcript.csv must have columns: t, word", - transcript_path + # Clean up context: take text until next marker or double newline + context_match = re.match(r"([^\[]*?)(?:\[|\n\n|$)", context_text) + context = context_match.group(1).strip() if context_match else "" + + # Truncate context to ~50 chars for display + if len(context) > 50: + context = context[:47] + "..." 
+ + citations.append( + Citation( + reference=reference, + marker_id=marker_id, + context=context, ) + ) - for line_num, row in enumerate(reader, start=2): # start=2 because line 1 is header - try: - time = float(row["t"]) - word = row["word"].strip() - timed_words.append(TimedWord(time=time, word=word)) - except (ValueError, KeyError) as e: - raise ParseError( - f"Invalid row: {e}", - transcript_path, - line_num - ) + return citations - return timed_words + +def save_citations(citations: list[Citation], path: Path) -> None: + """Save citations to a JSON file.""" + data = [ + {"reference": c.reference, "context": c.context} + for c in citations + ] + path.write_text(json.dumps(data, indent=2), encoding="utf-8") + + +def load_citations(path: Path) -> list[Citation]: + """Load citations from a JSON file.""" + if not path.exists(): + return [] + data = json.loads(path.read_text(encoding="utf-8")) + return [ + Citation( + reference=item["reference"], + marker_id=f"cite:{item['reference']}", + context=item.get("context", ""), + ) + for item in data + ] def parse_project_config(project_path: Path) -> ProjectConfig: @@ -112,16 +152,27 @@ def parse_project_config(project_path: Path) -> ProjectConfig: except json.JSONDecodeError as e: raise ParseError(f"Invalid JSON: {e}", config_path) - # Parse talking head config - th_data = data.get("talkinghead", {}) - th_height, th_height_pct = _parse_dimension(th_data.get("targetheight", 200)) - talking_head = TalkingHeadConfig( - x=th_data.get("x", 100), - y=th_data.get("y", 100), - target_height=th_height, - target_height_percent=th_height_pct, - file=th_data.get("file"), - ) + # Parse cutouts (named zones for video placement) + cutouts: dict[str, CutoutDefinition] = {} + cutouts_data = data.get("cutouts", {}) + for cutout_name, cutout_data in cutouts_data.items(): + x, x_pct = _parse_dimension(cutout_data.get("x", 0)) + y, y_pct = _parse_dimension(cutout_data.get("y", 0)) + height, height_pct = 
_parse_dimension(cutout_data.get("height", 200)) + # Width defaults to same as height (square) if not specified + width, width_pct = _parse_dimension( + cutout_data.get("width", cutout_data.get("height", 200)) + ) + cutouts[cutout_name] = CutoutDefinition( + x=x, + y=y, + height=height, + width=width, + x_percent=x_pct, + y_percent=y_pct, + height_percent=height_pct, + width_percent=width_pct, + ) # Parse resolution resolution = data.get("resolution", [1920, 1080]) @@ -131,12 +182,19 @@ def parse_project_config(project_path: Path) -> ProjectConfig: return ProjectConfig( resolution=tuple(resolution), fps=data.get("fps", 30), - talking_head=talking_head, default_slide_type=data.get("defaultSlideType", "square"), + cutouts=cutouts, background=data.get("background", ""), background_video=data.get("background_video", ""), # Deprecated slides_path=data.get("slides", "slides.json"), + videos_path=data.get("videos", "videos.json"), + audio_path=data.get("audio", "audio.json"), audio_source=data.get("audio_source"), + main_video=data.get("main_video"), + gnommo_scratch=data.get("gnommo_scratch"), + outro=data.get("outro", []), + description=data.get("description", ""), + footer=data.get("footer", ""), ) @@ -157,7 +215,9 @@ def _parse_dimension(value: Any) -> tuple[int, float]: return 200, 0.0 # default -def parse_slides(project_path: Path, config: ProjectConfig = None) -> dict[str, SlideDefinition]: +def parse_slides( + project_path: Path, config: ProjectConfig = None +) -> dict[str, SlideDefinition]: """Parse slides.json into slide definitions.""" if config and config.slides_path: slides_path = project_path / config.slides_path @@ -176,8 +236,7 @@ def parse_slides(project_path: Path, config: ProjectConfig = None) -> dict[str, for slide_id, slide_data in data.items(): if "image" not in slide_data: raise ParseError( - f"Slide '{slide_id}' missing required field 'image'", - slides_path + f"Slide '{slide_id}' missing required field 'image'", slides_path ) slides[slide_id] = 
SlideDefinition( image=slide_data["image"], @@ -187,12 +246,67 @@ def parse_slides(project_path: Path, config: ProjectConfig = None) -> dict[str, return slides -def parse_videos(project_path: Path) -> dict[str, VideoSource]: - """Parse videos.json into video source definitions.""" - videos_path = project_path / "videos.json" +def parse_audio( + project_path: Path, config: Optional[ProjectConfig] = None +) -> tuple[dict[str, AudioDefinition], Path]: + """ + Parse audio.json into audio definitions. + + Returns: + Tuple of (audio dict, audio_dir) where audio_dir is the directory + containing audio.json (for resolving relative file paths). + """ + if config and config.audio_path: + audio_path = project_path / config.audio_path + else: + audio_path = project_path / "audio.json" + + # Audio is optional - return empty dict if not found + if not audio_path.exists(): + return {}, project_path + + audio_dir = audio_path.parent + + try: + data = json.loads(audio_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as e: + raise ParseError(f"Invalid JSON: {e}", audio_path) + + audio = {} + for audio_id, audio_data in data.items(): + if "file" not in audio_data: + raise ParseError( + f"Audio '{audio_id}' missing required field 'file'", audio_path + ) + audio[audio_id] = AudioDefinition( + file=audio_data["file"], + volume=float(audio_data.get("volume", 1.0)), + loop=bool(audio_data.get("loop", False)), + ignore_pauses=bool(audio_data.get("ignore_pauses", False)), + ) + + return audio, audio_dir + + +def parse_videos( + project_path: Path, config: Optional[ProjectConfig] = None +) -> tuple[dict[str, VideoSource], Path]: + """ + Parse videos.json into video source definitions. + + Returns: + Tuple of (videos dict, videos_dir) where videos_dir is the directory + containing videos.json (for resolving relative file paths). 
+ """ + if config and config.videos_path: + videos_path = project_path / config.videos_path + else: + videos_path = project_path / "videos.json" if not videos_path.exists(): - raise ParseError("videos.json not found", videos_path) + raise ParseError(f"videos.json not found: {videos_path}", videos_path) + + videos_dir = videos_path.parent try: data = json.loads(videos_path.read_text(encoding="utf-8")) @@ -201,18 +315,37 @@ def parse_videos(project_path: Path) -> dict[str, VideoSource]: videos = {} for video_id, video_data in data.items(): - if "file" not in video_data: + if "source_file" not in video_data: raise ParseError( - f"Video '{video_id}' missing required field 'file'", - videos_path + f"Video '{video_id}' missing required field 'source_file'", videos_path ) + + # Parse attribution if present + attribution = None + if "attribution" in video_data: + attr_data = video_data["attribution"] + attribution = Attribution( + source=attr_data.get("source", "unknown"), + creator=attr_data.get("creator", "Unknown"), + url=attr_data.get("url"), + ) + videos[video_id] = VideoSource( - file=video_data["file"], - preprocess=video_data.get("preprocess", []), + source_file=video_data["source_file"], + filter=video_data.get("filter", []), output_file=video_data.get("output_file"), + take=video_data.get("take"), + skip=video_data.get("skip", 0.0), + zoom=video_data.get("zoom", 1.0), + cutout=video_data.get("cutout"), + always_visible=video_data.get("always_visible", False), + is_shared=video_data.get("is_shared", False), + pause_narration=float(video_data.get("pause_narration", 0)), + attribution=attribution, + use_audio_channels=video_data.get("use_audio_channels", "both"), ) - return videos + return videos, videos_dir def get_video_duration(video_path: Path) -> float: @@ -221,10 +354,13 @@ def get_video_duration(video_path: Path) -> float: cmd = [ "ffprobe", - "-v", "error", - "-show_entries", "format=duration", - "-of", "default=noprint_wrappers=1:nokey=1", - str(video_path) 
+ "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(video_path), ] result = subprocess.run(cmd, capture_output=True, text=True) @@ -261,7 +397,9 @@ def parse_video_metadata(metadata_path: Path) -> VideoMetadata: raise ParseError(f"Invalid JSON: {e}", metadata_path) if "source_file" not in data: - raise ParseError("Video metadata missing required field 'source_file'", metadata_path) + raise ParseError( + "Video metadata missing required field 'source_file'", metadata_path + ) return VideoMetadata( source_file=data["source_file"], @@ -270,7 +408,9 @@ def parse_video_metadata(metadata_path: Path) -> VideoMetadata: ) -def resolve_video_file(project_path: Path, file_ref: str) -> tuple[Path, Optional[VideoMetadata]]: +def resolve_video_file( + project_path: Path, file_ref: str +) -> tuple[Path, Optional[VideoMetadata]]: """ Resolve a video file reference, which can be either: 1. A direct path to a video file diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py index 7762e97..155e9b8 100644 --- a/gnommo/preprocessor.py +++ b/gnommo/preprocessor.py @@ -1,38 +1,192 @@ """Preprocessing stage: apply filters to source videos.""" +import os import subprocess +import sys +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed from pathlib import Path -from typing import Any - +from typing import Any, Optional +import shutil from .errors import PreprocessError -from .models import VideoSource, ChromaKeyConfig +from .models import VideoSource, ChromaKeyConfig, ColorGradeConfig, GnommoKeyConfig, AudioNormalizeConfig +from typing import Union, Optional + +# Number of parallel workers for segment processing +DEFAULT_SEGMENT_WORKERS = 4 + +# Segment duration in seconds for chunked processing (avoids huge intermediate files) +SEGMENT_DURATION = 60 + + +def get_video_duration(video_path: Path) -> float: + """Get duration of a video file using ffprobe.""" + cmd = [ + "ffprobe", + "-v", + 
import queue
import selectors  # no longer used by the code below; kept in case later code in this module refers to it
import subprocess
import sys
import threading
import time


def format_time(seconds: float) -> str:
    """Format a duration in seconds as a short human-readable string.

    Below one minute the result is ``"<s>s"``, below one hour
    ``"<m>m <s>s"``, otherwise ``"<h>h <m>m"`` (seconds are dropped).
    Fractions of a second are truncated.
    """
    if seconds < 60:
        return f"{int(seconds)}s"

    whole = int(seconds)
    if seconds < 3600:
        mins, secs = divmod(whole, 60)
        return f"{mins}m {secs}s"

    hours, remainder = divmod(whole, 3600)
    return f"{hours}h {remainder // 60}m"


def _progress_percent(line, duration):
    """Return the percentage carried by one ``-progress`` line, or ``None``.

    Only ``out_time_ms=<int>`` lines carry a usable position; any other line
    (including ``out_time_ms=N/A``) yields ``None``.
    """
    key, sep, value = line.partition("=")
    if not sep or key != "out_time_ms":
        return None
    try:
        # The value is divided by 1_000_000 to obtain seconds, i.e. it is
        # treated as microseconds despite the "_ms" suffix.
        elapsed_s = int(value.strip()) / 1_000_000
    except ValueError:
        return None
    if not duration or duration <= 0:
        return 0
    # Capped at 99: 100% is only shown once the process exited successfully.
    return max(0, min(99, int(elapsed_s / duration * 100)))


def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    """Run an FFmpeg command while drawing a single-line progress bar.

    ``-progress pipe:1 -nostats -loglevel warning`` is inserted right after
    ``-y`` (or after the executable when ``-y`` is absent) in a copy of
    ``cmd``; the caller's list is not modified. Standard error is merged into
    standard output and every line is collected.

    Args:
        cmd: Command line as a list of strings.
        duration: Expected output duration in seconds, used to turn the
            reported position into a percentage. ``0`` or ``None`` keeps the
            bar at 0% until the process finishes.
        description: Label printed in front of the bar.

    Returns:
        ``subprocess.CompletedProcess`` holding the modified command, the
        return code, an empty ``stdout`` and all captured output lines joined
        together as ``stderr``.
    """
    cmd = list(cmd)
    insert_pos = cmd.index("-y") + 1 if "-y" in cmd else 1
    cmd[insert_pos:insert_pos] = [
        "-progress",
        "pipe:1",
        "-nostats",
        "-loglevel",
        "warning",
    ]

    bar_width = 30

    def draw(percent, suffix=""):
        filled = int(bar_width * percent / 100)
        bar = "█" * filled + "░" * (bar_width - filled)
        # The suffix is padded so a shorter status overwrites a longer one.
        sys.stdout.write(f"\r  {description}: [{bar}] {percent:3d}% {suffix:<16}")
        sys.stdout.flush()

    pending = queue.Queue()

    def pump(stream):
        # A dedicated reader thread lets the main loop wait with a timeout
        # without combining select() on the raw descriptor with buffered
        # readline() calls, which can leave already-buffered lines unread
        # until more output arrives.
        try:
            for line in stream:
                pending.put(line)
        finally:
            pending.put(None)  # sentinel: always sent, even if reading fails

    logs = []
    last_percent = 0
    last_update = time.monotonic()
    seen_any_progress = False

    # The context manager closes the pipe and reaps the process on every exit
    # path. errors="replace" keeps undecodable output from aborting the run.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
        bufsize=1,
    ) as proc:
        reader = threading.Thread(target=pump, args=(proc.stdout,), daemon=True)
        reader.start()
        draw(0, "Initializing...")

        while True:
            try:
                line = pending.get(timeout=0.2)
            except queue.Empty:
                # Quiet for a while near the end: show that work is ongoing.
                if (
                    seen_any_progress
                    and last_percent >= 99
                    and (time.monotonic() - last_update) > 1.0
                ):
                    draw(last_percent, "Finalizing...")
                continue

            if line is None:
                break
            logs.append(line)

            percent = _progress_percent(line, duration)
            if percent is not None:
                last_percent = max(last_percent, percent)  # never move backwards
                last_update = time.monotonic()
                seen_any_progress = True
                draw(last_percent, "")

        reader.join()

    returncode = proc.returncode
    if returncode == 0:
        draw(100, "Done")
    sys.stdout.write("\n")
    sys.stdout.flush()

    return subprocess.CompletedProcess(
        cmd, returncode, stdout="", stderr="".join(logs)
    )
""" - if not video_source.preprocess: - # No preprocessing needed, return original file - return project_path / video_source.file + if not video_source.filter: + # No filters defined, return original file + return videos_dir / video_source.source_file - # Ensure intermediate directory exists - intermediate_dir = project_path / "intermediate" - intermediate_dir.mkdir(parents=True, exist_ok=True) + # Use custom intermediate dir if provided, otherwise default to videos_dir/intermediate + if custom_gnommo_scratch: + gnommo_scratch = custom_gnommo_scratch / video_id + else: + gnommo_scratch = videos_dir / "intermediate" + gnommo_scratch.mkdir(parents=True, exist_ok=True) - # Start with the source file - current_input = project_path / video_source.file + # Start with the source file (relative to videos_dir) + current_input = videos_dir / video_source.source_file if not current_input.exists(): raise PreprocessError( @@ -40,66 +194,975 @@ def preprocess_video( filter_type=None, ) - # Apply each filter in sequence - for i, filter_config in enumerate(video_source.preprocess): + # Track intermediate files for cleanup + intermediate_files: list[Path] = [] + + # Video filter types that can be combined in a single FFmpeg pass + VIDEO_FILTER_TYPES = {"chroma_key", "mask", "color_grade", "gnommokey"} + + # Group consecutive video filters into batches + filter_batches: list[list[dict]] = [] + current_batch: list[dict] = [] + + for filter_config in video_source.filter: filter_type = filter_config.get("type") + if filter_type in VIDEO_FILTER_TYPES: + current_batch.append(filter_config) + else: + # Non-video filter breaks the batch + if current_batch: + filter_batches.append(current_batch) + current_batch = [] + # Add non-video filter as its own "batch" + filter_batches.append([filter_config]) - if filter_type is None: - raise PreprocessError( - f"Filter {i} missing 'type' field", - filter_type=None, + # Don't forget the last batch + if current_batch: + 
filter_batches.append(current_batch) + + # Process each batch + batch_num = 0 + for batch in filter_batches: + first_filter_type = batch[0].get("type") + + if first_filter_type in VIDEO_FILTER_TYPES: + # Combined video filter batch - use segmented processing for large files + filter_names = "+".join(f.get("type") for f in batch) + print(f" Video filters (combined): {filter_names}") + + # Output to WebM (compressed with alpha) instead of ProRes + step_output = gnommo_scratch / f"{video_id}_batch{batch_num}.mov" + intermediate_files.append(step_output) + + apply_combined_video_filters_segmented( + current_input, + step_output, + batch, + verbose, + take=video_source.take, + scratch_dir=gnommo_scratch / "segments", ) + current_input = step_output + batch_num += 1 - # Determine output path for this filter step - step_output = intermediate_dir / f"{video_id}_step{i}_{filter_type}.mov" + elif first_filter_type == "transcribe": + # Transcribe doesn't transform video + print(" Filter: transcribe") + apply_transcribe(current_input, batch[0], verbose, force) - if verbose: - print(f" Step {i + 1}: {filter_type}") - print(f" Input: {current_input}") - print(f" Output: {step_output}") + elif first_filter_type == "audio_normalize": + # Audio normalization: denoise, compress, and normalize loudness + print(" Filter: audio_normalize") + step_output = gnommo_scratch / f"{video_id}_batch{batch_num}_audio.mov" + intermediate_files.append(step_output) + apply_audio_normalize( + current_input, + step_output, + batch[0], + verbose, + take=video_source.take, + ) + current_input = step_output + batch_num += 1 - # Apply the appropriate filter - if filter_type == "chroma_key": - apply_chroma_key(current_input, step_output, filter_config, verbose) else: raise PreprocessError( - f"Unknown filter type: {filter_type}", - filter_type=filter_type, + f"Unknown filter type: {first_filter_type}", + filter_type=first_filter_type, ) - current_input = step_output - - # If output_file is specified, 
copy/rename to final location + # If output_file is specified, copy/rename to final location and clean up if video_source.output_file: - final_output = project_path / video_source.output_file + import shutil + + final_output = videos_dir / video_source.output_file + final_output.parent.mkdir(parents=True, exist_ok=True) # Copy the final intermediate to the output location - import shutil shutil.copy2(current_input, final_output) if verbose: print(f" Final output: {final_output}") + # Clean up intermediate files + for intermediate_file in intermediate_files: + if intermediate_file.exists(): + intermediate_file.unlink() + if verbose: + print(f" Removed intermediate: {intermediate_file.name}") + + # Remove intermediate directory if empty + try: + gnommo_scratch.rmdir() + except OSError: + pass # Directory not empty (other videos may have intermediates) + return final_output + # No output_file specified, return current processed file return current_input +def apply_combined_video_filters( + input_path: Path, + output_path: Path, + filters: list[dict], + verbose: bool = False, + take: float = None, +) -> None: + """ + Apply multiple video filters in a single FFmpeg pass. + + Combines chroma_key, mask, and other video filters into one filter chain. 
+ """ + filter_parts: list[str] = [] + + for filter_config in filters: + filter_type = filter_config.get("type") + + if filter_type == "chroma_key": + filter_parts.append(build_chroma_key_filter(filter_config)) + elif filter_type == "mask": + filter_parts.append(build_mask_filter(filter_config)) + elif filter_type == "color_grade": + filter_parts.append(build_color_grade_filter(filter_config)) + elif filter_type == "gnommokey": + filter_parts.append(build_gnommokey_filter(filter_config)) + + video_filter = ",".join(filter_parts) + + # Build FFmpeg command + cmd = ["ffmpeg", "-y"] + + if take is not None: + cmd.extend(["-t", str(take)]) + + cmd.extend( + [ + "-i", + str(input_path), + "-vf", + video_filter, + "-c:v", + "prores_ks", + "-profile:v", + "4", # ProRes 4444 + "-pix_fmt", + "yuva444p10le", # 10-bit with alpha + "-c:a", + "pcm_s16le", # Lossless audio + str(output_path), + ] + ) + + if verbose: + print(f" Combined filter: {video_filter}") + print(f" Command: {' '.join(cmd)}") + + # Get duration for progress bar + duration = take if take is not None else get_video_duration(input_path) + + result = run_ffmpeg_with_progress(cmd, duration, "Processing") + + if result.returncode != 0: + raise PreprocessError( + "Combined video filter failed", + filter_type="combined", + command=" ".join(cmd), + stderr=result.stderr, + ) + + +def build_chroma_key_filter(config: dict) -> str: + """Build FFmpeg chromakey filter string from config.""" + chroma_config = parse_chroma_key_config(config) + + r, g, b = chroma_config.color + hex_color = f"0x{r:02x}{g:02x}{b:02x}" + + parts = [ + f"chromakey={hex_color}:{chroma_config.similarity:.3f}:{chroma_config.blend:.3f}" + ] + + if chroma_config.spill > 0: + parts.append(f"despill=type=green:mix={chroma_config.spill:.3f}") + + # Edge erosion: shrink alpha mask to remove green fringe + # Uses erosion filter targeting only alpha channel (plane 3) + # threshold0-2=65535 means Y/U/V unchanged, threshold3=0 erodes alpha + if 
chroma_config.edge_erode > 0: + erode_passes = min(chroma_config.edge_erode, 5) # Cap at 5 passes + parts.append("format=yuva444p") + for _ in range(erode_passes): + parts.append( + "erosion=threshold0=65535:threshold1=65535:threshold2=65535:threshold3=0" + ) + + # Color protection: restore alpha for pixels matching protected color + # This runs AFTER chromakey/despill/erosion to restore any incorrectly keyed pixels + if chroma_config.protect_color: + pr, pg, pb = chroma_config.protect_color + # Convert tolerance from 0-1 range to pixel range (0-255) + tol = int(chroma_config.protect_tolerance * 255) + + # Ensure we're in RGBA for geq to work with r/g/b/alpha functions + parts.append("format=rgba") + + # Build condition: pixel RGB is within tolerance of protected color + # between(value, min, max) returns 1 if min <= value <= max + # Multiply conditions together for AND logic + condition = ( + f"between(r(X,Y),{max(0, pr-tol)},{min(255, pr+tol)})*" + f"between(g(X,Y),{max(0, pg-tol)},{min(255, pg+tol)})*" + f"between(b(X,Y),{max(0, pb-tol)},{min(255, pb+tol)})" + ) + + # geq: if pixel matches protected color, set alpha to 255, else keep current alpha + parts.append( + f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='if({condition},255,alpha(X,Y))'" + ) + + return ",".join(parts) + + +def build_mask_filter(config: dict) -> str: + """Build FFmpeg geq mask filter string from config.""" + left = float(config.get("left", 0)) + right = float(config.get("right", 0)) + top = float(config.get("top", 0)) + bottom = float(config.get("bottom", 0)) + + conditions = [] + if left > 0: + conditions.append(f"lt(X,W*{left})") + if right > 0: + conditions.append(f"gt(X,W*{1-right})") + if top > 0: + conditions.append(f"lt(Y,H*{top})") + if bottom > 0: + conditions.append(f"gt(Y,H*{1-bottom})") + + if not conditions: + return "copy" # No-op filter + + alpha_expr = "+".join(conditions) + alpha_expr = f"if({alpha_expr},0,alpha(X,Y))" + + return 
f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'" + + +def build_color_grade_filter(config: dict) -> str: + """Build FFmpeg color grading filter string from config. + + Applies color balance, curves, and EQ adjustments while preserving alpha. + The filter chain converts to RGBA for color operations, then back to + yuva444p10le to preserve the alpha channel. + """ + grade_config = parse_color_grade_config(config) + parts: list[str] = [] + + # Start with format conversion to RGBA for color operations + parts.append("format=rgba") + + # Color balance (only add if any value is non-zero) + colorbalance_parts = [] + if grade_config.rs != 0: + colorbalance_parts.append(f"rs={grade_config.rs:.3f}") + if grade_config.gs != 0: + colorbalance_parts.append(f"gs={grade_config.gs:.3f}") + if grade_config.bs != 0: + colorbalance_parts.append(f"bs={grade_config.bs:.3f}") + if grade_config.rm != 0: + colorbalance_parts.append(f"rm={grade_config.rm:.3f}") + if grade_config.gm != 0: + colorbalance_parts.append(f"gm={grade_config.gm:.3f}") + if grade_config.bm != 0: + colorbalance_parts.append(f"bm={grade_config.bm:.3f}") + if grade_config.rh != 0: + colorbalance_parts.append(f"rh={grade_config.rh:.3f}") + if grade_config.gh != 0: + colorbalance_parts.append(f"gh={grade_config.gh:.3f}") + if grade_config.bh != 0: + colorbalance_parts.append(f"bh={grade_config.bh:.3f}") + + if colorbalance_parts: + parts.append(f"colorbalance={':'.join(colorbalance_parts)}") + + # Curves preset (if specified) + if grade_config.curves_preset and grade_config.curves_preset != "none": + parts.append(f"curves=preset={grade_config.curves_preset}") + + # EQ adjustments (only add if different from defaults) + eq_parts = [] + if grade_config.contrast != 1.0: + eq_parts.append(f"contrast={grade_config.contrast:.3f}") + if grade_config.brightness != 0.0: + eq_parts.append(f"brightness={grade_config.brightness:.3f}") + if grade_config.saturation != 1.0: + 
def build_gnommokey_filter(config: dict) -> str:
    """Build the FFmpeg filter chain for the ``gnommokey`` colour-difference keyer.

    The key is computed in RGB space (not YCbCr):
      - green screen: key = G - max(R, B); blue screen: key = B - max(R, G)
      - ``screen_gain`` scales the key strength (100 = 1.0)
      - ``screen_balance`` boosts the key for pixels whose luma is close to
        the screen colour's luma
      - ``clip_black`` / ``clip_white`` (0-100) remap the matte range
      - ``despill_bias`` / ``despill_strength`` shift spill toward a bias colour
      - an edge-aware despill always runs on semi-transparent pixels
      - ``edge_erode`` erodes the alpha plane (max 5 passes)
      - ``edge_soften`` blurs the alpha plane only (radius capped at 5)

    Args:
        config: Raw filter dictionary; parsed by ``parse_gnommokey_config``.

    Returns:
        Comma-separated filter chain suitable for ``-vf``; the output pixel
        format is ``yuva444p10le``.
    """
    cfg = parse_gnommokey_config(config)
    parts: list[str] = []

    sr, sg, sb = cfg.screen_color

    # Screen type follows channel dominance of the configured screen colour.
    is_green_screen = sg >= sb

    # geq's r()/g()/b()/alpha() accessors need an RGB(A) format.
    parts.append("format=rgba")

    gain = cfg.screen_gain / 100.0
    balance = cfg.screen_balance / 100.0

    # How far the screen channel exceeds the stronger of the other two.
    # The same expression drives the key, the despill and the edge despill.
    if is_green_screen:
        excess_expr = "max(0,g(X,Y)-max(r(X,Y),b(X,Y)))"
    else:
        excess_expr = "max(0,b(X,Y)-max(r(X,Y),g(X,Y)))"

    # Luma of the screen colour (BT.601 weights), used by screen_balance.
    screen_y = int(0.299 * sr + 0.587 * sg + 0.114 * sb)

    if balance > 0:
        # Boost the key for pixels whose luma is close to the screen's luma.
        luma_expr = "(0.299*r(X,Y)+0.587*g(X,Y)+0.114*b(X,Y))"
        luma_boost = f"(1+{balance:.2f}*(1-abs({luma_expr}-{screen_y})/128))"
        key_expr = f"({excess_expr})*{luma_boost}"
    else:
        key_expr = f"({excess_expr})"

    # The colour difference typically spans ~0-150; 2.5x stretches it toward
    # the 0-255 alpha range before the user gain is applied.
    scale_factor = gain * 2.5
    key_expr = f"({key_expr})*{scale_factor:.3f}"

    # clip_black / clip_white are percentages; convert to 0-255.
    clip_b = cfg.clip_black * 2.55
    clip_w = cfg.clip_white * 2.55

    if clip_w > clip_b:
        # Remap [clip_b, clip_w] onto [0, 255].
        range_scale = 255.0 / (clip_w - clip_b)
        key_expr = f"clip(({key_expr}-{clip_b:.1f})*{range_scale:.3f},0,255)"
    else:
        # Degenerate range: just clamp.
        key_expr = f"clip({key_expr},0,255)"

    # High key value (screen colour) -> low alpha (transparent).
    alpha_expr = f"255-{key_expr}"
    parts.append(f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'")

    # Despill: lerp each channel toward the bias colour by the spill amount.
    if cfg.despill_bias and cfg.despill_strength > 0:
        br, bg, bb = cfg.despill_bias
        strength = cfg.despill_strength

        factor_expr = f"({excess_expr}/255*{strength:.2f})"

        new_r = f"clip(r(X,Y)+({br}-r(X,Y))*{factor_expr},0,255)"
        new_g = f"clip(g(X,Y)+({bg}-g(X,Y))*{factor_expr},0,255)"
        new_b = f"clip(b(X,Y)+({bb}-b(X,Y))*{factor_expr},0,255)"

        parts.append(f"geq=r='{new_r}':g='{new_g}':b='{new_b}':a='alpha(X,Y)'")

    # Edge-aware despill: the factor peaks at alpha=128 and falls to 0 at
    # alpha=0 and alpha=255, so only semi-transparent edge pixels have their
    # screen channel pulled down toward max() of the other two.
    edge_factor = "min(alpha(X,Y),255-alpha(X,Y))/128"
    if is_green_screen:
        new_g = f"clip(g(X,Y)-({excess_expr})*({edge_factor}),0,255)"
        parts.append(f"geq=r='r(X,Y)':g='{new_g}':b='b(X,Y)':a='alpha(X,Y)'")
    else:
        new_b = f"clip(b(X,Y)-({excess_expr})*({edge_factor}),0,255)"
        parts.append(f"geq=r='r(X,Y)':g='g(X,Y)':b='{new_b}':a='alpha(X,Y)'")

    # Edge erosion on the alpha plane only: threshold 0 leaves a plane
    # unchanged, 65535 allows full erosion.
    if cfg.edge_erode > 0:
        erode_passes = min(cfg.edge_erode, 5)
        parts.extend(
            ["erosion=threshold0=0:threshold1=0:threshold2=0:threshold3=65535"]
            * erode_passes
        )

    # Final pixel format: 10-bit 4:4:4 with alpha.
    parts.append("format=yuva444p10le")

    # Edge softening: blur ONLY the alpha plane (planes=8 selects the 4th
    # plane). This runs after the format conversion so the alpha is a real
    # plane. The previous "alphaextract,...[blur]" snippet was invalid inside
    # a simple -vf chain: it replaced the picture with its matte and added an
    # output label that is not allowed there.
    if cfg.edge_soften > 0:
        radius = min(int(cfg.edge_soften), 5)
        if radius > 0:
            parts.append(f"avgblur=sizeX={radius}:sizeY={radius}:planes=8")

    return ",".join(parts)
+ verbose: bool = False, + take: float = None, + scratch_dir: Path = None, +) -> None: + """ + Apply video filters using segment-based processing for large files. + + For videos longer than SEGMENT_DURATION: + 1. Split into segments + 2. Process each segment with filters + 3. Encode to VP9/WebM with alpha (compressed) + 4. Concatenate segments into final output + + VP9/WebM is used instead of ProRes 4444 for much better compression + while maintaining alpha channel support. + """ + duration = take if take is not None else get_video_duration(input_path) + + # Short video: process directly without segmentation + if duration <= SEGMENT_DURATION: + _process_segment_to_prores4444( + input_path, output_path, filters, 0, duration, verbose, take, True + ) + return + + # Long video: process in segments (parallel) + if scratch_dir is None: + scratch_dir = output_path.parent / "segments" + scratch_dir.mkdir(parents=True, exist_ok=True) + + num_segments = int(duration / SEGMENT_DURATION) + 1 + segment_files: list[Path] = [] + segment_tasks: list[tuple] = [] # (index, segment_path, start_time, segment_duration) + + # Build list of segment tasks + for i in range(num_segments): + start_time = i * SEGMENT_DURATION + segment_duration = min(SEGMENT_DURATION, duration - start_time) + + if segment_duration <= 0: + break + + segment_path = scratch_dir / f"segment_{i:04d}.mov" + segment_files.append(segment_path) + segment_tasks.append((i, segment_path, start_time, segment_duration)) + + num_workers = min(DEFAULT_SEGMENT_WORKERS, len(segment_tasks)) + print( + f" Processing {len(segment_tasks)} segments in parallel ({num_workers} workers)" + ) + + # Process segments in parallel + def process_segment_task(task): + i, segment_path, start_time, seg_duration = task + _process_segment_to_prores4444( + input_path, + segment_path, + filters, + start_time, + seg_duration, + verbose=False, # Suppress verbose in parallel mode + take=seg_duration, + ) + return i, segment_path + + completed = 0 + 
def _process_segment_to_prores4444(
    input_path: Path,
    output_path: Path,
    filters: list[dict],
    start_time: float,
    segment_duration: float,
    verbose: bool = False,
    take: Optional[float] = None,
    keep_audio: bool = True,
) -> None:
    """Filter one time slice of a video and encode it as ProRes 4444 with alpha.

    ProRes 4444 (MOV) is used as the intermediate format because it carries a
    true alpha channel and 4:4:4 chroma, which keeps keyed edges clean.

    Args:
        input_path: Source video.
        output_path: Destination ``.mov`` file (overwritten).
        filters: Video filter configs (``chroma_key``, ``mask``,
            ``color_grade``, ``gnommokey``); other types are ignored here.
        start_time: Seek offset in seconds; no seek is added when <= 0.
        segment_duration: Length of the slice in seconds.
        verbose: Print the filter chain and the full command.
        take: Overrides ``segment_duration`` as the encode length when given.
        keep_audio: Encode audio as PCM when True, drop it otherwise.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    # One builder per supported video filter type.
    builders = {
        "chroma_key": build_chroma_key_filter,
        "mask": build_mask_filter,
        "color_grade": build_color_grade_filter,
        "gnommokey": build_gnommokey_filter,
    }
    filter_parts: list[str] = [
        builders[filter_config.get("type")](filter_config)
        for filter_config in filters
        if filter_config.get("type") in builders
    ]

    # Always finish in an alpha-capable 10-bit 4:4:4 format.
    filter_parts.append("format=yuva444p10le")
    video_filter = ",".join(filter_parts)

    cmd: list[str] = ["ffmpeg", "-y"]

    # -ss before -i makes FFmpeg seek on the input (fast seek).
    if start_time > 0:
        cmd.extend(["-ss", str(start_time)])

    cmd.extend(["-i", str(input_path)])

    # Limit the duration only when it is known and positive. A failed duration
    # probe reports 0.0, and "-t 0.0" would make FFmpeg write an empty file
    # instead of processing the whole input.
    actual_take = take if take is not None else segment_duration
    if actual_take is not None and actual_take > 0:
        cmd.extend(["-t", str(actual_take)])

    cmd.extend(
        [
            "-vf",
            video_filter,
            "-c:v",
            "prores_ks",
            "-profile:v",
            "4",  # 4 = ProRes 4444
            "-pix_fmt",
            "yuva444p10le",  # must carry alpha
            "-vendor",
            "apl0",  # optional; helps some NLEs tag as Apple ProRes
            "-movflags",
            "+faststart",  # optional; makes MOV streamable
        ]
    )

    if keep_audio:
        # PCM is the least surprising intermediate audio format.
        cmd.extend(["-c:a", "pcm_s16le"])
    else:
        cmd.append("-an")

    cmd.append(str(output_path))

    if verbose:
        print(f" Filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    result = run_ffmpeg_with_progress(cmd, actual_take or segment_duration, "Encoding")

    if result.returncode != 0:
        raise PreprocessError(
            "Segment processing failed",
            filter_type="segment",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
+ """ + filter_parts: list[str] = [] + + for filter_config in filters: + filter_type = filter_config.get("type") + if filter_type == "chroma_key": + filter_parts.append(build_chroma_key_filter(filter_config)) + elif filter_type == "mask": + filter_parts.append(build_mask_filter(filter_config)) + elif filter_type == "color_grade": + filter_parts.append(build_color_grade_filter(filter_config)) + elif filter_type == "gnommokey": + filter_parts.append(build_gnommokey_filter(filter_config)) + + video_filter = ",".join(filter_parts) + + # Force output to yuva420p to preserve alpha channel through to encoder + video_filter += ",format=yuva420p" + + # Build FFmpeg command for VP9 with alpha + cmd = ["ffmpeg", "-y"] + + # Seek to start time (before input for fast seeking) + if start_time > 0: + cmd.extend(["-ss", str(start_time)]) + + cmd.extend(["-i", str(input_path)]) + + # Limit duration + actual_take = take if take is not None else segment_duration + if actual_take is not None: + cmd.extend(["-t", str(actual_take)]) + + cmd.extend( + [ + "-vf", + video_filter, + "-c:v", + "libvpx-vp9", + "-pix_fmt", + "yuva420p", # VP9 with alpha + "-auto-alt-ref", + "0", # Required for alpha channel in VP9 + "-crf", + "25", # Quality (lower = better, 15-35 typical) + "-b:v", + "0", # Variable bitrate mode + "-deadline", + "good", # Encoding speed (good balance) + "-cpu-used", + "2", # Speed/quality tradeoff (0-5, lower = better) + "-c:a", + "libopus", # Opus audio codec + "-b:a", + "128k", + str(output_path), + ] + ) + + if verbose: + print(f" Filter: {video_filter}") + print(f" Command: {' '.join(cmd)}") + + result = run_ffmpeg_with_progress(cmd, actual_take or segment_duration, "Encoding") + + if result.returncode != 0: + raise PreprocessError( + "Segment processing failed", + filter_type="segment", + command=" ".join(cmd), + stderr=result.stderr, + ) + + +def _concatenate_prores4444_segments( + segment_files: list[Path], + output_path: Path, + verbose: bool = False, + keep_audio: 
def _concatenate_webm_segments(
    segment_files: list[Path],
    output_path: Path,
    verbose: bool = False,
) -> None:
    """
    Concatenate WebM segments into a single output file.

    Writes an FFmpeg concat-demuxer list next to ``output_path`` and runs
    FFmpeg over it. The video is re-encoded with libvpx-vp9 at ``yuva420p``;
    this is NOT a stream copy, so the step is not lossless.

    Args:
        segment_files: Segment paths, in playback order.
        output_path: Destination file (overwritten, ``-y``).
        verbose: Print the concat list path and the full command line.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero return code.
    """
    # Make sure the directory exists before writing the list into it
    # (the ProRes concatenation helper does the same).
    output_path.parent.mkdir(parents=True, exist_ok=True)
    concat_list = output_path.parent / "concat_list.txt"

    with open(concat_list, "w", encoding="utf-8") as f:
        for segment in segment_files:
            # FFmpeg concat format: file 'path'
            # A literal single quote inside the quoted path must be written
            # as '\'' or the list fails to parse.
            escaped = str(segment.resolve()).replace("'", "'\\''")
            f.write(f"file '{escaped}'\n")

    cmd = [
        "ffmpeg",
        "-y",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        str(concat_list),
        "-c:v",
        "libvpx-vp9",  # re-encode once across all segments
        "-pix_fmt",
        "yuva420p",  # request an alpha-capable pixel format
        str(output_path),
    ]

    if verbose:
        print(f" Concat list: {concat_list}")
        print(f" Command: {' '.join(cmd)}")

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    finally:
        # Remove the temporary list even when launching FFmpeg raises
        # (e.g. the executable is missing).
        concat_list.unlink(missing_ok=True)

    if result.returncode != 0:
        raise PreprocessError(
            "Segment concatenation failed",
            filter_type="concat",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def apply_mask(
    input_path: Path,
    output_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    take: float = None,
) -> None:
    """
    Make the edges of a video transparent with FFmpeg's ``geq`` filter.

    Config options (each a fraction of the frame, default 0):
        left:   portion of the left side to make transparent
        right:  portion of the right side to make transparent
        top:    portion of the top to make transparent
        bottom: portion of the bottom to make transparent

    When no side is greater than zero the input file is simply copied to
    ``output_path`` and FFmpeg is not run.

    Args:
        input_path: Source video.
        output_path: Destination file (ProRes 4444, ``yuva444p10le``).
        config: Mask configuration, see above.
        verbose: Print mask values, filter string and command line.
        take: Optional duration limit in seconds (placed before ``-i``).

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero return code.
    """
    left, right, top, bottom = (
        float(config.get(side, 0)) for side in ("left", "right", "top", "bottom")
    )

    # One geq condition per edge; a pixel matching any of them gets alpha 0.
    edge_tests = (
        (left, f"lt(X,W*{left})"),
        (right, f"gt(X,W*{1-right})"),
        (top, f"lt(Y,H*{top})"),
        (bottom, f"gt(Y,H*{1-bottom})"),
    )
    conditions = [test for amount, test in edge_tests if amount > 0]

    if not conditions:
        # Nothing to mask: plain file copy.
        import shutil

        shutil.copy2(input_path, output_path)
        return

    # Summing the 0/1 conditions acts as a logical OR inside if().
    any_edge = "+".join(conditions)
    alpha_expr = f"if({any_edge},0,alpha(X,Y))"

    # Luma and chroma pass through untouched; only the alpha plane changes.
    video_filter = f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'"

    duration_args = ["-t", str(take)] if take is not None else []
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output
        *duration_args,
        "-i",
        str(input_path),
        "-vf",
        video_filter,
        "-c:v",
        "prores_ks",
        "-profile:v",
        "4",  # ProRes 4444
        "-pix_fmt",
        "yuva444p10le",  # 10-bit with alpha
        "-c:a",
        "pcm_s16le",  # Lossless audio
        str(output_path),
    ]

    if verbose:
        print(f" Mask: left={left}, right={right}, top={top}, bottom={bottom}")
        print(f" Filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    # Progress bar length: the take limit if given, else the full input.
    if take is not None:
        duration = take
    else:
        duration = get_video_duration(input_path)

    result = run_ffmpeg_with_progress(cmd, duration, "Mask")

    if result.returncode != 0:
        raise PreprocessError(
            "Mask filter failed",
            filter_type="mask",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def apply_audio_normalize(
    input_path: Path,
    output_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    take: float = None,
) -> None:
    """
    Denoise, compress and loudness-normalize the audio track of a video.

    The video stream is copied unchanged; audio is written as ``pcm_s16le``.
    The config dict is parsed with ``parse_audio_normalize_config``; the
    stages below run in this order, each only when enabled:

        1. ``afftdn``      - noise reduction (``noise_floor``)
        2. ``acompressor`` - compression (``threshold``, ``ratio``,
                             ``attack``, ``release``, ``makeup``)
        3. ``loudnorm``    - loudness normalization (``target_lufs``,
                             ``target_lra``, ``target_tp``)

    If every stage is disabled the input file is copied to ``output_path``
    and FFmpeg is not run.

    Args:
        input_path: Source video.
        output_path: Destination file (overwritten, ``-y``).
        config: Audio normalization settings.
        verbose: Print the audio filter chain and the command line.
        take: Optional duration limit in seconds (placed before ``-i``).

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero return code.
    """
    cfg = parse_audio_normalize_config(config)

    stages: list[str] = []

    if cfg.denoise:
        # nf = noise floor
        stages.append(f"afftdn=nf={cfg.noise_floor:.1f}")

    if cfg.compress:
        compressor_opts = ":".join(
            [
                f"threshold={cfg.threshold:.1f}dB",
                f"ratio={cfg.ratio:.1f}",
                f"attack={cfg.attack:.1f}",
                f"release={cfg.release:.1f}",
                f"makeup={cfg.makeup:.1f}dB",
            ]
        )
        stages.append(f"acompressor={compressor_opts}")

    if cfg.normalize:
        loudnorm_opts = ":".join(
            [
                f"I={cfg.target_lufs:.1f}",
                f"LRA={cfg.target_lra:.1f}",
                f"TP={cfg.target_tp:.1f}",
            ]
        )
        stages.append(f"loudnorm={loudnorm_opts}")

    if not stages:
        # Everything disabled: plain file copy.
        import shutil

        shutil.copy2(input_path, output_path)
        return

    audio_filter = ",".join(stages)

    duration_args = ["-t", str(take)] if take is not None else []
    cmd = [
        "ffmpeg",
        "-y",
        *duration_args,
        "-i",
        str(input_path),
        "-c:v",
        "copy",  # Copy video stream unchanged
        "-af",
        audio_filter,
        "-c:a",
        "pcm_s16le",  # Lossless audio output
        str(output_path),
    ]

    if verbose:
        print(f" Audio filter: {audio_filter}")
        print(f" Command: {' '.join(cmd)}")

    # Progress bar length: the take limit if given, else the full input.
    if take is not None:
        duration = take
    else:
        duration = get_video_duration(input_path)

    result = run_ffmpeg_with_progress(cmd, duration, "Audio normalize")

    if result.returncode != 0:
        raise PreprocessError(
            "Audio normalization failed",
            filter_type="audio_normalize",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def parse_chroma_key_config(config: dict[str, Any]) -> ChromaKeyConfig:
    """Build a ChromaKeyConfig from a chroma key config dictionary.

    Keys and their fallbacks when absent:
        color:             3-item list -> tuple; anything else -> (0, 255, 0)
        similarity:        0.4
        blend:             0.08
        spill:             0.1
        edge_erode:        0
        protect_color:     3-item list -> tuple; any other truthy value
                           -> None; falsy values are passed through as given
        protect_tolerance: 0.15
    """
    raw_color = config.get("color", [0, 255, 0])
    is_rgb_list = isinstance(raw_color, list) and len(raw_color) == 3
    key_color = tuple(raw_color) if is_rgb_list else (0, 255, 0)

    protected = config.get("protect_color")
    if protected:
        # Only a 3-element list is accepted; other truthy shapes are dropped.
        well_formed = isinstance(protected, list) and len(protected) == 3
        protected = tuple(protected) if well_formed else None

    similarity = float(config.get("similarity", 0.4))
    blend = float(config.get("blend", 0.08))
    spill = float(config.get("spill", 0.1))
    edge_erode = int(config.get("edge_erode", 0))
    protect_tolerance = float(config.get("protect_tolerance", 0.15))

    return ChromaKeyConfig(
        color=key_color,
        similarity=similarity,
        blend=blend,
        spill=spill,
        edge_erode=edge_erode,
        protect_color=protected,
        protect_tolerance=protect_tolerance,
    )
def needs_preprocessing(videos_dir: Path, video_source: VideoSource) -> bool:
    """Report whether a video still has to be preprocessed.

    Returns False when the source has no filters, or when its output file
    (or the same path with a ``.mov`` suffix) already exists in
    ``videos_dir``. Returns True otherwise, including when filters are
    configured but no ``output_file`` is set.
    """
    if not video_source.filter:
        return False

    if not video_source.output_file:
        return True

    expected = videos_dir / video_source.output_file
    # The configured name is checked first, then its .mov sibling.
    candidates = (expected, expected.with_suffix(".mov"))
    return not any(candidate.exists() for candidate in candidates)
def _resolve_video_path(
    videos_dir: Path,
    video_source: VideoSource,
    shared_assets_dir: Path = None,
) -> Path:
    """Resolve the video file to feed to FFmpeg for a video source.

    The base directory is ``shared_assets_dir`` when the source is flagged
    ``is_shared`` and a shared directory was supplied, otherwise
    ``videos_dir``.

    When ``output_file`` is set, the first existing path among these is
    returned:
        1. the configured ``output_file`` itself
        2. the same path with a ``.mov`` suffix
        3. the same path with a ``.webm`` suffix

    If none exists, or no ``output_file`` is configured, the
    (possibly non-existent) ``source_file`` path is returned.
    """
    if video_source.is_shared and shared_assets_dir:
        base_dir = shared_assets_dir
    else:
        base_dir = videos_dir

    if video_source.output_file:
        configured = base_dir / video_source.output_file
        # The preprocessing module contains both a ProRes/MOV and a VP9/WebM
        # encode path, so accept either container regardless of the
        # configured name. .mov is probed before .webm to keep the previous
        # lookup order.
        candidates = (
            configured,
            configured.with_suffix(".mov"),
            configured.with_suffix(".webm"),
        )
        for candidate in candidates:
            if candidate.exists():
                return candidate

    return base_dir / video_source.source_file
+ + Args: + use_audio_channels: "both", "left", or "right" + + Returns: + Filter string (e.g., "pan=mono|c0=c1") or empty string for "both" + """ + if use_audio_channels == "left": + return "pan=mono|c0=c0" + elif use_audio_channels == "right": + return "pan=mono|c0=c1" + return "" # "both" - no filter needed + + def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: """Build the complete FFmpeg command as a list of arguments.""" cmd = ["ffmpeg", "-y"] # -y to overwrite output @@ -48,128 +119,530 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: # Resolve paths to absolute project_path = plan.project_path.resolve() output_path = output_path.resolve() + videos_dir = plan.videos_dir.resolve() if plan.videos_dir else project_path + shared_assets_dir = ( + plan.shared_assets_dir.resolve() if plan.shared_assets_dir else None + ) - # Input: talking head video - # Use resolved path if available, otherwise construct from file - talking_head_path = plan.talking_head_path or (project_path / plan.talking_head.file) - cmd.extend(["-i", str(talking_head_path)]) + # Track input indices + input_idx = 0 + + # Input: always_visible videos (like talking head) + # Add -ss seek BEFORE -i for skip parameter and/or partial rendering + always_visible_inputs: list[int] = [] + for video_id, video_source, cutout in plan.narration_videos: + video_path = _resolve_video_path(videos_dir, video_source, shared_assets_dir) + # Combine video skip setting with partial render offset + total_seek = video_source.skip + plan.input_seek_time + if total_seek > 0: + cmd.extend(["-ss", f"{total_seek:.3f}"]) + cmd.extend(["-i", str(video_path)]) + always_visible_inputs.append(input_idx) + input_idx += 1 # Input: background image/video (if specified) bg_file = plan.config.background or plan.config.background_video has_background = bool(bg_file) + bg_idx = None bg_is_image = False if has_background: - # Try project folder first, then parent (for shared_assets) 
bg_path = project_path / bg_file if not bg_path.exists(): bg_path = project_path.parent / bg_file - cmd.extend(["-i", str(bg_path)]) - # Check if background is an image image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"} bg_is_image = bg_path.suffix.lower() in image_extensions + # Loop background videos infinitely + if not bg_is_image: + cmd.extend(["-stream_loop", "-1"]) + cmd.extend(["-i", str(bg_path)]) + bg_idx = input_idx + input_idx += 1 - # Input: slide images (from slides_dir, same directory as slides.json) - slides_dir = plan.slides_dir.resolve() if plan.slides_dir else project_path / "media" / "slides" - slide_inputs: list[str] = [] # Track which slides we've added + # Input: slide images + slides_dir = ( + plan.slides_dir.resolve() + if plan.slides_dir + else project_path / "media" / "slides" + ) + slide_inputs: dict[str, int] = {} # slide_id -> input_idx for event in plan.slide_events: if event.slide_id not in slide_inputs: image_path = slides_dir / event.slide_def.image cmd.extend(["-i", str(image_path)]) - slide_inputs.append(event.slide_id) + slide_inputs[event.slide_id] = input_idx + input_idx += 1 + + # Input: triggered videos + # Each video event needs its own input because they may have different skip times + # video_inputs maps (video_id, event_index) -> input_idx + video_inputs: dict[int, int] = {} # event_index -> input_idx + video_events_with_audio: set[int] = set() # event indices whose files have audio + + for i, event in enumerate(plan.video_events): + video_path = _resolve_video_path( + videos_dir, event.video_source, shared_assets_dir + ) + # Seek to skip point before loading input + skip = event.video_source.skip + if skip > 0: + cmd.extend(["-ss", f"{skip:.3f}"]) + cmd.extend(["-i", str(video_path)]) + video_inputs[i] = input_idx + input_idx += 1 + if _has_audio_stream(video_path): + video_events_with_audio.add(i) + + # Input: outro videos (play after narration ends) + outro_inputs: dict[int, int] = {} # 
event_index -> input_idx + outro_events_with_audio: set[int] = set() + + for i, event in enumerate(plan.outro_events): + video_path = _resolve_video_path( + videos_dir, event.video_source, shared_assets_dir + ) + # Seek to skip point before loading input + skip = event.video_source.skip + if skip > 0: + cmd.extend(["-ss", f"{skip:.3f}"]) + cmd.extend(["-i", str(video_path)]) + outro_inputs[i] = input_idx + input_idx += 1 + if _has_audio_stream(video_path): + outro_events_with_audio.add(i) + + # Track where audio inputs start + num_inputs_before_audio = input_idx + + # Input: audio files + audio_dir = plan.audio_dir.resolve() if plan.audio_dir else project_path + audio_inputs: dict[str, int] = {} # audio_id -> input_idx + + for event in plan.audio_events: + if event.audio_id not in audio_inputs: + audio_path = audio_dir / event.audio_def.file + cmd.extend(["-i", str(audio_path)]) + audio_inputs[event.audio_id] = input_idx + input_idx += 1 # Build filter_complex - filter_complex = build_filter_complex(plan, has_background, slide_inputs, bg_is_image) + filter_complex = build_filter_complex( + plan, + has_background, + bg_idx, + bg_is_image, + always_visible_inputs, + slide_inputs, + video_inputs, + num_inputs_before_audio, + audio_inputs, + video_events_with_audio, + outro_inputs, + outro_events_with_audio, + ) cmd.extend(["-filter_complex", filter_complex]) # Map output video and audio cmd.extend(["-map", "[vout]"]) - cmd.extend(["-map", "0:a"]) # Audio from talking head + + # Determine audio source + # Priority: [aout] from filter > triggered video > no audio + # Note: we always create [aout] when always_visible_inputs exists + if always_visible_inputs: + cmd.extend( + ["-map", "[aout]"] + ) # Audio from filter (may be segmented or simple copy) + elif video_inputs: + # Get first triggered video's input index + first_video_idx = next(iter(video_inputs.values())) + cmd.extend( + ["-map", f"{first_video_idx}:a?"] + ) # Audio from first triggered video (? 
def _calculate_cutout_position(
    cutout: CutoutDefinition, frame_width: int, frame_height: int
) -> tuple[int, int, int, int]:
    """Turn a cutout definition into a pixel rectangle.

    For height, x and y: a non-negative absolute value is used as-is,
    otherwise the matching ``*_percent`` field is multiplied by the frame
    dimension and truncated with ``int()``.

    Width follows the same rule, except that when neither an absolute
    width nor a positive ``width_percent`` is given it falls back to the
    computed height (a square cutout).

    Returns: (x, y, width, height)
    """
    cut_height = (
        cutout.height
        if cutout.height >= 0
        else int(frame_height * cutout.height_percent)
    )

    if cutout.width >= 0:
        cut_width = cutout.width
    elif cutout.width_percent > 0:
        cut_width = int(frame_width * cutout.width_percent)
    else:
        # No width information at all: square by default.
        cut_width = cut_height

    cut_x = cutout.x if cutout.x >= 0 else int(frame_width * cutout.x_percent)
    cut_y = cutout.y if cutout.y >= 0 else int(frame_height * cutout.y_percent)

    return cut_x, cut_y, cut_width, cut_height
+ output_label: Label for the output stream (default: "vout") + """ + # Handle initial state for partial rendering + if initial_state and not initial_state.is_default(): + # Prepend a virtual event at t=0 with the initial state (instant, no transition) + initial_event = CameraEvent( + time=0.0, + target_state=initial_state, + duration=0.0, # Instant + easing="linear", + ) + camera_events = [initial_event] + camera_events + + # Identity transform: if no camera events, pass through. + if not camera_events: + return f"[scene]copy[{output_label}]" + + # Build time-based expressions for each camera property + zoom_expr = _build_animated_expr(camera_events, "zoom", 1.0) + rotation_expr = _build_animated_expr(camera_events, "rotation", 0.0) + pan_x_expr = _build_animated_expr(camera_events, "pan_x", 0.0) + pan_y_expr = _build_animated_expr(camera_events, "pan_y", 0.0) + focal_x_expr = _build_animated_expr(camera_events, "focal_x", 0.5) + focal_y_expr = _build_animated_expr(camera_events, "focal_y", 0.5) + + # Pad big enough to avoid corners during rotation + # Use even dimensions to avoid rounding issues in scale/crop + diagonal = int(math.ceil(math.sqrt(width**2 + height**2))) + pad_w = ((diagonal + 100) // 2) * 2 # Round up to even + pad_h = ((diagonal + 100) // 2) * 2 + + # Calculate integer offsets for centering + pad_x = (pad_w - width) // 2 + pad_y = (pad_h - height) // 2 + + filters: list[str] = [] + + # Pad the scene to allow rotation without clipping + filters.append(f"[scene]pad={pad_w}:{pad_h}:{pad_x}:{pad_y}:color=black@0[padded]") + + # Scale for zoom - use max(1, zoom) to prevent shrinking below pad size + # The ceil/2*2 pattern ensures even output dimensions + filters.append( + f"[padded]scale=eval=frame:" + f"w='trunc(iw*max(1,{zoom_expr})/2+0.5)*2':" + f"h='trunc(ih*max(1,{zoom_expr})/2+0.5)*2'[zoomed]" + ) + + # Rotate (degrees -> radians), keep transparent fill + rotation_rad = f"(-({rotation_expr})*PI/180)" + filters.append( + f"[zoomed]format=rgba," + 
def ff_escape_expr(expr: str) -> str:
    """Backslash-escape ``\\``, ``:`` and ``,`` in an FFmpeg expression.

    These characters act as separators in a filtergraph description, so
    they are prefixed with a backslash before the expression is embedded
    in a filter string.
    """
    # Single pass over the input: each source character is mapped on its
    # own, so backslashes introduced by the mapping are never re-escaped.
    escapes = str.maketrans({"\\": "\\\\", ":": "\\:", ",": "\\,"})
    return expr.translate(escapes)
def _build_narration_segments(
    pauses: list, total_duration: float
) -> list[tuple[float, float, float, float]]:
    """
    Split the narration timeline into segments around the given pauses.

    Each returned tuple is (source_start, source_end, output_start,
    output_end). Every pause object is read for ``narration_time``,
    ``output_time`` and ``duration``; pauses are processed in list order.

    With no pauses a single segment covering 0..total_duration is returned.
    Segments that would have zero or negative source length are omitted.

    Example with one 5 second pause at narration_time=30:
        - source 0-30   -> output 0-30
        - source 30-end -> output 35-end
    """
    if not pauses:
        return [(0.0, total_duration, 0.0, total_duration)]

    result = []
    paused_so_far = 0.0  # total pause time inserted before the cursor
    cursor = 0.0  # narration time already consumed

    for pause in pauses:
        # Narration between the previous pause and this one.
        if pause.narration_time > cursor:
            result.append(
                (
                    cursor,
                    pause.narration_time,
                    cursor + paused_so_far,
                    pause.output_time,
                )
            )
        cursor = pause.narration_time
        paused_so_far += pause.duration

    # Remaining narration after the last pause: the narration itself is the
    # output duration minus every pause.
    narration_end = total_duration - sum(p.duration for p in pauses)
    if narration_end > cursor:
        result.append(
            (cursor, narration_end, cursor + paused_so_far, total_duration)
        )

    return result
Layer structure: - Layer 1: Background (solid color, image, or video) - - Layer 2: Talking head + - Layer 2: Always visible videos (like talking head) in cutouts - Layer 3: Slides (with time-based enable) + - Layer 4: Triggered videos in cutouts (with time-based enable) + - Layer 5: Camera transform + - Layer 6: Outro videos (fullscreen, after narration ends) + - Audio: Main audio mixed with triggered sound effects and outro audio """ + outro_inputs = outro_inputs or {} + outro_events_with_audio = outro_events_with_audio or set() width, height = plan.config.resolution filters: list[str] = [] - # Input indices: - # 0 = talking head - # 1 = background (if present) - # 2+ = slides - talking_head_idx = 0 - bg_idx = 1 if has_background else None - slide_start_idx = 2 if has_background else 1 - # Create base layer (background) if has_background: if bg_is_image: - # For images: loop to create video stream, then scale filters.append( f"[{bg_idx}:v]loop=loop=-1:size=1:start=0," f"scale={width}:{height}:force_original_aspect_ratio=increase," f"crop={width}:{height},fps={plan.config.fps}[bg]" ) else: - # For videos: just scale filters.append( f"[{bg_idx}:v]scale={width}:{height}:force_original_aspect_ratio=increase," f"crop={width}:{height}[bg]" ) - base_label = "bg" else: - # Create solid color background filters.append(f"color=c=black:s={width}x{height}:r={plan.config.fps}[bg]") - base_label = "bg" - # Scale and position talking head - th_config = plan.config.talking_head - if th_config.target_height > 0: - th_height = th_config.target_height - else: - # Percentage-based: calculate from frame height - th_height = int(height * th_config.target_height_percent) + current_label = "bg" - filters.append( - f"[{talking_head_idx}:v]scale=-1:{th_height}[head]" - ) + # Overlay always_visible videos (like talking head) + # If there are narration pauses, we need to segment the video + for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos): + input_idx = 
always_visible_inputs[i] + cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( + cutout, width, height + ) - # Overlay talking head on background - filters.append( - f"[{base_label}][head]overlay=x={th_config.x}:y={th_config.y}[base]" - ) + # Apply zoom factor to cutout dimensions + zoom = video_source.zoom + zoomed_width = int(cut_width * zoom) + zoomed_height = int(cut_height * zoom) - current_label = "base" + if not plan.narration_pauses: + # Simple case: no pauses, continuous overlay + video_label = f"av{i}" + filters.append( + f"[{input_idx}:v]format=yuva444p10le," + f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," + f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," + f"format=rgba[{video_label}]" + ) + + next_label = f"avbase{i}" + filters.append( + f"[{current_label}][{video_label}]overlay=x={cut_x}:y={cut_y}[{next_label}]" + ) + current_label = next_label + else: + # Complex case: narration pauses - segment the video + # Each segment is trimmed from source and positioned in output timeline + segments = _build_narration_segments( + plan.narration_pauses, plan.total_duration + ) + + for seg_idx, (src_start, src_end, out_start, out_end) in enumerate( + segments + ): + seg_label = f"av{i}_seg{seg_idx}" + # Trim to source range, then shift PTS to output position + # setpts=PTS-STARTPTS puts segment at 0, then +offset/TB shifts to output time + pts_offset = out_start + filters.append( + f"[{input_idx}:v]trim={src_start:.3f}:{src_end:.3f}," + f"setpts=PTS-STARTPTS+{pts_offset:.3f}/TB," + f"format=yuva444p10le," + f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," + f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," + f"format=rgba[{seg_label}]" + ) + + # Overlay with enable for this segment's output time range + next_label = f"avbase{i}_seg{seg_idx}" + enable_expr = f"between(t\\,{out_start:.3f}\\,{out_end:.3f})" + filters.append( + 
f"[{current_label}][{seg_label}]overlay=x={cut_x}:y={cut_y}:" + f"enable={enable_expr}[{next_label}]" + ) + current_label = next_label # Add slide overlays with time-based enable - # Slides are scaled to full frame - transparency shows layers below for i, event in enumerate(plan.slide_events): - slide_idx = slide_start_idx + slide_inputs.index(event.slide_id) + slide_idx = slide_inputs[event.slide_id] # Scale slide to full frame size (transparent areas show through) slide_label = f"s{i}" @@ -179,34 +652,330 @@ def build_filter_complex( ) # Overlay at 0,0 (full frame) with time-based enable - next_label = f"v{i}" if i < len(plan.slide_events) - 1 else "vout" - enable_expr = f"between(t,{event.start_time:.3f},{event.end_time:.3f})" - + next_label = f"sbase{i}" + enable_expr = f"between(t\\,{event.start_time:.3f}\\,{event.end_time:.3f})" filters.append( f"[{current_label}][{slide_label}]overlay=" - f"x=0:y=0:" - f"enable='{enable_expr}'[{next_label}]" + f"x=0:y=0:enable={enable_expr}" + f"[{next_label}]" ) current_label = next_label - # If no slides, just rename base to vout - if not plan.slide_events: + # Add triggered video overlays with time-based enable + for i, event in enumerate(plan.video_events): + video_idx = video_inputs[i] + cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( + event.cutout, width, height + ) + + # Calculate effective end time (respecting 'take' parameter) + duration = event.end_time - event.start_time + if event.video_source.take is not None: + duration = min(duration, event.video_source.take) + effective_end = event.start_time + duration + + # Apply zoom factor to cutout dimensions + zoom = event.video_source.zoom + zoomed_width = int(cut_width * zoom) + zoomed_height = int(cut_height * zoom) + + # Scale to cover the zoomed area (like CSS object-fit: cover) + # Then crop to cutout dimensions (centered) + # Use setpts to sync video start with overlay enable time + video_label = f"tv{i}" + start_pts = event.start_time + 
filters.append( + f"[{video_idx}:v]format=yuva444p10le," + f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB," + f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," + f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," + f"format=rgba[{video_label}]" + ) + + # Overlay with time-based enable + next_label = f"tvbase{i}" + enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})" + filters.append( + f"[{current_label}][{video_label}]overlay=" + f"x={cut_x}:y={cut_y}:enable={enable_expr}" + f"[{next_label}]" + ) + + current_label = next_label + + # Scene composition complete - now apply camera transform + # Check if we need camera transform (events exist OR initial state is non-default) + needs_camera_transform = plan.camera_events or ( + plan.initial_camera_state and not plan.initial_camera_state.is_default() + ) + + # Determine output label based on whether we have outro events + has_outro = bool(plan.outro_events and outro_inputs) + cam_output_label = "cam_out" if has_outro else "vout" + + if needs_camera_transform: + # Output to [scene], then camera transform will produce [cam_out] or [vout] + filters.append(f"[{current_label}]copy[scene]") + camera_filter = build_camera_transform( + plan.camera_events, + width, + height, + plan.config.fps, + initial_state=plan.initial_camera_state, + output_label=cam_output_label, + ) + filters.append(camera_filter) + current_label = cam_output_label + else: + # No camera events + if has_outro: + filters.append(f"[{current_label}]copy[cam_out]") + current_label = "cam_out" + else: + filters.append(f"[{current_label}]copy[vout]") + + # Add outro video overlays (fullscreen, after narration ends) + if has_outro: + for i, event in enumerate(plan.outro_events): + video_idx = outro_inputs[i] + + # Calculate effective duration (respecting 'take' parameter) + duration = event.end_time - event.start_time + if event.video_source.take is not None: + duration = min(duration, 
event.video_source.take) + effective_end = event.start_time + duration + + # Determine if fullscreen or in cutout + if event.cutout: + cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( + event.cutout, width, height + ) + else: + # Fullscreen + cut_x, cut_y, cut_width, cut_height = 0, 0, width, height + + # Apply zoom factor + zoom = event.video_source.zoom + zoomed_width = int(cut_width * zoom) + zoomed_height = int(cut_height * zoom) + + # Scale and crop video + video_label = f"outro{i}" + start_pts = event.start_time + filters.append( + f"[{video_idx}:v]format=yuva444p10le," + f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB," + f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," + f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," + f"format=rgba[{video_label}]" + ) + + # Overlay with time-based enable + next_label = f"outrobase{i}" + enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})" + filters.append( + f"[{current_label}][{video_label}]overlay=" + f"x={cut_x}:y={cut_y}:enable={enable_expr}" + f"[{next_label}]" + ) + + current_label = next_label + + # Final output filters.append(f"[{current_label}]copy[vout]") + # Audio mixing: combine main audio with sound effects + if always_visible_inputs: + main_audio_idx = always_visible_inputs[0] + audio_labels_to_mix = [] + + # Get audio channel setting from first narration video + channel_filter = "" + if plan.narration_videos: + _, first_video_source, _ = plan.narration_videos[0] + channel_filter = _build_audio_channel_filter( + first_video_source.use_audio_channels + ) + + # Use narration_end_time to stop audio before outro (if outro exists) + audio_end_time = plan.narration_end_time if plan.outro_events else plan.total_duration + + if not plan.narration_pauses: + # Simple case: trim main audio to end before outro (with optional channel filter) + if plan.outro_events: + # Trim narration audio to stop before outro + if channel_filter: + 
filters.append(f"[{main_audio_idx}:a]{channel_filter}atrim=0:{audio_end_time:.3f},asetpts=PTS-STARTPTS[main_aud]") + else: + filters.append(f"[{main_audio_idx}:a]atrim=0:{audio_end_time:.3f},asetpts=PTS-STARTPTS[main_aud]") + audio_labels_to_mix.append("[main_aud]") + elif channel_filter: + filters.append(f"[{main_audio_idx}:a]{channel_filter}[main_aud]") + audio_labels_to_mix.append("[main_aud]") + else: + audio_labels_to_mix.append(f"[{main_audio_idx}:a]") + else: + # Complex case: segment the narration audio for pauses + segments = _build_narration_segments( + plan.narration_pauses, audio_end_time + ) + for seg_idx, (src_start, src_end, out_start, out_end) in enumerate( + segments + ): + seg_label = f"narr_aud{seg_idx}" + delay_ms = int(out_start * 1000) + # Trim audio to source range, then delay to output position + # Apply channel filter if needed + channel_part = f"{channel_filter}," if channel_filter else "" + filters.append( + f"[{main_audio_idx}:a]{channel_part}atrim={src_start:.3f}:{src_end:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}[{seg_label}]" + ) + audio_labels_to_mix.append(f"[{seg_label}]") + + # Process each audio event with delay and volume + if plan.audio_events and audio_inputs: + for i, event in enumerate(plan.audio_events): + audio_idx = audio_inputs[event.audio_id] + volume = event.audio_def.volume + + if event.audio_def.loop: + # Looping audio: loop source, then trim/segment + # Stop at narration end if there's an outro + loop_end_time = audio_end_time + remaining = loop_end_time - event.start_time + + if plan.narration_pauses and not event.audio_def.ignore_pauses: + # Build segments that skip narration pauses (pauses by default) + relevant_pauses = [ + p for p in plan.narration_pauses + if p.output_time > event.start_time + ] + src_pos = 0.0 + seg_start = event.start_time + seg_count = 0 + + for pause in relevant_pauses: + seg_end = pause.output_time + if seg_end > seg_start: + seg_dur = seg_end - seg_start + 
seg_label = f"aud{i}_seg{seg_count}" + delay_ms = int(seg_start * 1000) + filters.append( + f"[{audio_idx}:a]aloop=loop=-1:size=2e+09," + f"atrim={src_pos:.3f}:{src_pos + seg_dur:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}," + f"volume={volume:.2f}[{seg_label}]" + ) + audio_labels_to_mix.append(f"[{seg_label}]") + src_pos += seg_dur + seg_count += 1 + seg_start = pause.output_time + pause.duration + + # Final segment after last pause (stop at narration end if outro) + if seg_start < loop_end_time: + seg_dur = loop_end_time - seg_start + seg_label = f"aud{i}_seg{seg_count}" + delay_ms = int(seg_start * 1000) + filters.append( + f"[{audio_idx}:a]aloop=loop=-1:size=2e+09," + f"atrim={src_pos:.3f}:{src_pos + seg_dur:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}," + f"volume={volume:.2f}[{seg_label}]" + ) + audio_labels_to_mix.append(f"[{seg_label}]") + else: + # Simple loop: no pauses or ignore_pauses=True + label = f"aud{i}" + delay_ms = int(event.start_time * 1000) + filters.append( + f"[{audio_idx}:a]aloop=loop=-1:size=2e+09," + f"atrim=0:{remaining:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}," + f"volume={volume:.2f}[{label}]" + ) + audio_labels_to_mix.append(f"[{label}]") + else: + # One-shot audio: delay to trigger time + label = f"aud{i}" + delay_ms = int(event.start_time * 1000) + filters.append( + f"[{audio_idx}:a]adelay={delay_ms}|{delay_ms},volume={volume:.2f}[{label}]" + ) + audio_labels_to_mix.append(f"[{label}]") + + # Extract and mix audio from triggered video events + _have_audio = video_events_with_audio or set() + for i, event in enumerate(plan.video_events): + if i not in _have_audio: + continue + video_idx = video_inputs[i] + # Calculate effective duration (same logic as video side) + duration = event.end_time - event.start_time + if event.video_source.take is not None: + duration = min(duration, event.video_source.take) + delay_ms = int(event.start_time * 1000) + label = f"tvaud{i}" 
+ + filters.append( + f"[{video_idx}:a]atrim=0:{duration:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}[{label}]" + ) + audio_labels_to_mix.append(f"[{label}]") + + # Extract and mix audio from outro video events + for i, event in enumerate(plan.outro_events): + if i not in outro_events_with_audio: + continue + video_idx = outro_inputs[i] + # Calculate effective duration (same logic as video side) + duration = event.end_time - event.start_time + if event.video_source.take is not None: + duration = min(duration, event.video_source.take) + delay_ms = int(event.start_time * 1000) + label = f"outroaud{i}" + + filters.append( + f"[{video_idx}:a]atrim=0:{duration:.3f}," + f"asetpts=PTS-STARTPTS," + f"adelay={delay_ms}|{delay_ms}[{label}]" + ) + audio_labels_to_mix.append(f"[{label}]") + + # Mix all audio tracks together + if len(audio_labels_to_mix) > 1: + num_audio_tracks = len(audio_labels_to_mix) + audio_mix_inputs = "".join(audio_labels_to_mix) + filters.append( + f"{audio_mix_inputs}amix=inputs={num_audio_tracks}:duration=longest:dropout_transition=0[aout]" + ) + elif len(audio_labels_to_mix) == 1: + # Single audio track, just copy it + label = audio_labels_to_mix[0].strip("[]") + filters.append(f"[{label}]acopy[aout]") + return ";".join(filters) def generate_ffmpeg_command_string(plan: RenderPlan, output_path: Path) -> str: """Generate a human-readable FFmpeg command string (for debugging).""" cmd = build_ffmpeg_command(plan, output_path) - + fg = cmd[cmd.index("-filter_complex") + 1] + print("FILTER_COMPLEX repr:", repr(fg)) # Format nicely with line breaks result = [] i = 0 while i < len(cmd): if cmd[i] == "-filter_complex": - result.append(f" -filter_complex \"\n {cmd[i+1].replace(';', ';' + chr(10) + ' ')}\n \"") + result.append( + f" -filter_complex \"\n {cmd[i+1].replace(';', ';' + chr(10) + ' ')}\n \"" + ) i += 2 elif cmd[i].startswith("-"): if i + 1 < len(cmd) and not cmd[i + 1].startswith("-"): @@ -219,4 +988,4 @@ def 
generate_ffmpeg_command_string(plan: RenderPlan, output_path: Path) -> str: result.append(f" {cmd[i]}") i += 1 - return "ffmpeg \\\n" + " \\\n".join(result) + return "".join(result) diff --git a/gnommo/transcriber.py b/gnommo/transcriber.py index 466f486..405ae1f 100644 --- a/gnommo/transcriber.py +++ b/gnommo/transcriber.py @@ -11,6 +11,7 @@ from .errors import GnommoError @dataclass class TranscribedWord: """A word with its timestamp from transcription.""" + word: str start: float end: float @@ -18,6 +19,7 @@ class TranscribedWord: class TranscriptionError(GnommoError): """Error during transcription.""" + pass @@ -57,21 +59,20 @@ def transcribe_video(video_path: Path, model: str = "base") -> list[TranscribedW for segment in result.get("segments", []): for word_info in segment.get("words", []): - words.append(TranscribedWord( - word=word_info["word"].strip(), - start=word_info["start"], - end=word_info["end"], - )) + words.append( + TranscribedWord( + word=word_info["word"].strip(), + start=word_info["start"], + end=word_info["end"], + ) + ) return words def save_transcript(words: list[TranscribedWord], output_path: Path) -> None: """Save transcribed words to a JSON file.""" - data = [ - {"word": w.word, "start": w.start, "end": w.end} - for w in words - ] + data = [{"word": w.word, "start": w.start, "end": w.end} for w in words] with open(output_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) @@ -86,6 +87,5 @@ def load_transcript(transcript_path: Path) -> list[TranscribedWord]: data = json.load(f) return [ - TranscribedWord(word=w["word"], start=w["start"], end=w["end"]) - for w in data + TranscribedWord(word=w["word"], start=w["start"], end=w["end"]) for w in data ] diff --git a/gnommo/transformer.py b/gnommo/transformer.py index 560a0ea..187a8f8 100644 --- a/gnommo/transformer.py +++ b/gnommo/transformer.py @@ -1,16 +1,434 @@ """Transform stage: resolve timings and build render plan.""" +import re +from dataclasses import dataclass from pathlib 
# Typographic apostrophes that manuscripts written with "smart quotes" contain.
# They are mapped to the ASCII apostrophe so the contraction table below also
# applies to text such as "I’m" (U+2019), which would otherwise collapse to
# "im" and never match the transcript's "I am".
_APOSTROPHE_TRANSLATION = str.maketrans(
    {"\u2018": "'", "\u2019": "'", "\u02bc": "'", "\uff07": "'"}
)

# Contractions are expanded before punctuation is removed, so that "I'm" in
# the manuscript matches "I am" in the transcript.
_CONTRACTION_EXPANSIONS = {
    "i'm": "i am",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "that's": "that is",
    "what's": "what is",
    "there's": "there is",
    "here's": "here is",
    "who's": "who is",
    "how's": "how is",
    "let's": "let us",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
}

# A single alternation compiled once at import time. Every entry is anchored
# by \b on both sides and no entry is a prefix of another, so one pass yields
# the same result as substituting the entries one after the other, without
# re-running ~50 regexes on every call (this function is called per word
# while matching).
_CONTRACTION_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(c) for c in _CONTRACTION_EXPANSIONS) + r")\b"
)


def _normalize_text(text: str) -> str:
    """
    Normalize text for matching.

    Steps, in order: lowercase, map typographic apostrophes to ASCII, expand
    common English contractions, drop every character that is neither a word
    character nor whitespace, collapse whitespace runs to one space, and trim.

    Args:
        text: Raw manuscript or transcript text (may be empty).

    Returns:
        The normalized text; an empty string if nothing remains.
    """
    text = text.lower().translate(_APOSTROPHE_TRANSLATION)
    # Expand before removing punctuation: the apostrophe is what identifies
    # a contraction.
    text = _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTION_EXPANSIONS[match.group(0)], text
    )
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
def _extract_marker_contexts(
    manuscript_text: str,
    slides: dict = None,
    videos: dict = None,
    audio: dict = None,
) -> list[tuple[str, str, bool]]:
    """
    Extract known markers and the text that follows each one in the manuscript.

    Unknown markers are not pronounced, so they are removed from the manuscript
    before it is split. The words on either side of an unknown marker therefore
    stay together in the context of the preceding known marker. Splitting first
    would drop everything after the unknown marker.
    Note: [cite:...] markers are already stripped at parse time.

    A marker with no text of its own (consecutive markers) borrows the context
    of the next marker that has text.

    Args:
        manuscript_text: Full manuscript text including [marker] tags.
        slides: Slide definitions, used to recognise slide markers.
        videos: Video definitions, passed through to the marker check.
        audio: Audio definitions, used to recognise audio markers.

    Returns:
        One (marker_id, context, is_borrowed) tuple per known marker, in
        manuscript order. ``context`` holds at most the first 10 words after
        the marker. ``is_borrowed`` is True when the context was taken from a
        later marker. A marker with no text after it anywhere gets
        ``("", False)``.
    """
    slides = slides or {}
    videos = videos or {}
    audio = audio or {}

    # Remove unknown markers up front so the text around them is kept as part
    # of the surrounding known marker's context.
    cleaned_text = _strip_unknown_markers(manuscript_text, slides, videos, audio)

    # With one capture group, re.split yields
    # [text_before, marker1, text_after1, marker2, text_after2, ...]
    parts = re.split(r"\[([A-Za-z0-9_:]+)\]", cleaned_text)

    raw_contexts: list[tuple[str, str]] = []
    for marker_id, trailing_text in zip(parts[1::2], parts[2::2]):
        # Defensive: only known markers should remain after stripping.
        if not _is_known_marker(marker_id, slides, videos, audio):
            continue
        # Flatten newlines and collapse whitespace runs.
        raw_contexts.append((marker_id, " ".join(trailing_text.split())))

    # Walk backwards so every text-less marker can pick up the snippet of the
    # nearest later marker that has text, in a single pass.
    contexts: list[tuple[str, str, bool]] = []
    next_snippet = ""
    for marker_id, following_text in reversed(raw_contexts):
        if following_text:
            # Take the first ~10 words for matching.
            next_snippet = " ".join(following_text.split()[:10])
            contexts.append((marker_id, next_snippet, False))
        elif next_snippet:
            contexts.append((marker_id, next_snippet, True))  # Borrowed
        else:
            contexts.append((marker_id, "", False))
    contexts.reverse()

    return contexts
def _find_phrase_timestamp(
    phrase: str,
    transcription: list[TranscribedWord],
    start_from: int = 0,
    fuzzy_threshold: float = 0.5,
) -> tuple[int, float, float, int]:
    """
    Locate a phrase in the transcription by fuzzy matching.

    Every window start from ``start_from`` onwards is scored, and the earliest
    window with the highest score wins. Scanning stops as soon as a window
    scores 0.95 or better.

    Args:
        phrase: Manuscript text to look for.
        transcription: Word-level transcript entries.
        start_from: Index of the first transcript word to consider.
        fuzzy_threshold: Minimum score for a match to be accepted.

    Returns:
        (word_index, timestamp, confidence, match_end_idx), where word_index
        is the first matched word and match_end_idx points past the last
        matched word. Returns (-1, -1.0, 0.0, -1) when nothing qualifies.
    """
    tokens = _normalize_text(phrase).split()
    if not tokens:
        return -1, -1.0, 0.0, -1

    top_score = 0.0
    top_window = None  # (window_start, first_offset, end_offset) of the best hit

    for window_start in range(start_from, len(transcription)):
        score, first_offset, end_offset = _fuzzy_match_ratio(
            tokens, transcription, window_start
        )
        if score > top_score:
            top_score = score
            top_window = (window_start, first_offset, end_offset)
            # A near-perfect window is good enough; stop scanning.
            if score >= 0.95:
                break

    if top_window is not None and top_score >= fuzzy_threshold:
        window_start, first_offset, end_offset = top_window
        # Report the first matched word itself, not the start of the window.
        first_word_idx = window_start + first_offset
        return (
            first_word_idx,
            transcription[first_word_idx].start,
            top_score,
            window_start + end_offset,
        )

    return -1, -1.0, 0.0, -1
+ transcription: Word-level timestamps from whisper + slides: Slide definitions (to identify valid slide markers) + videos: Video definitions (to identify valid video markers) + audio: Audio definitions (to identify valid audio markers) + fuzzy_threshold: Minimum match ratio (default 0.6 = 60% of words) + + Returns: + List of MarkerTiming with timestamps and confidence (known markers only) + """ + contexts = _extract_marker_contexts(manuscript_text, slides, videos, audio) + timings: list[MarkerTiming] = [] + + last_idx = 0 + last_end_time = 0.0 # Track end time of last matched phrase + + for marker_id, following_text, is_borrowed in contexts: + # If no text (empty context), place 1 second after the previous marker/phrase + # This handles markers like [video:xxx] that appear after text + if not following_text.strip(): + # Use 1 second after the previous end time + marker_time = last_end_time + 1.0 + timings.append( + MarkerTiming( + marker_id=marker_id, + timestamp=marker_time, + context="(after previous)", + confidence=1.0, + ) + ) + # Update last_end_time so subsequent markers without text continue to offset + last_end_time = marker_time + continue + + idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp( + following_text, + transcription, + start_from=last_idx, + fuzzy_threshold=fuzzy_threshold, + ) + + if idx >= 0: + # Apply offset: marker should appear slightly before the words + adjusted_time = max(0.0, timestamp - 0.5) + timings.append( + MarkerTiming( + marker_id=marker_id, + timestamp=adjusted_time, + context=following_text[:50], + confidence=confidence, + ) + ) + # Only advance last_idx if this marker owns its text (not borrowed) + # If borrowed, the next marker needs to match the same text + if not is_borrowed: + last_idx = match_end_idx + # Calculate end time of this phrase for markers with no text + if last_idx > 0 and last_idx <= len(transcription): + last_end_time = transcription[last_idx - 1].end + else: + last_end_time = 
transcription[-1].end if transcription else 0.0 + else: + timings.append( + MarkerTiming( + marker_id=marker_id, + timestamp=-1.0, + context=following_text[:50], + confidence=0.0, + ) + ) + + return timings def build_render_plan( @@ -18,92 +436,546 @@ def build_render_plan( config: ProjectConfig, slides: dict[str, SlideDefinition], videos: dict[str, VideoSource], - transcript: list[TimedWord], -) -> RenderPlan: + videos_dir: Path, + manuscript_text: str, + transcription: list[TranscribedWord], + audio: Optional[dict[str, AudioDefinition]] = None, + audio_dir: Optional[Path] = None, + slide_range: Optional[tuple[str, Optional[str]]] = None, +) -> tuple[RenderPlan, list[MarkerTiming]]: """ - Build a complete render plan from parsed and validated data. + Build a complete render plan from manuscript and transcription. - This transforms transcript markers into timed slide events and - assembles all information needed for the render stage. + This performs on-the-fly alignment of manuscript markers to transcription + timestamps, then builds the render plan. + + Args: + manuscript_text: The manuscript.txt content (source of truth for markers) + transcription: Word-level timestamps from whisper transcription + slide_range: Optional tuple of (start_slide, end_slide) for partial rendering. + + Returns: + Tuple of (RenderPlan, list of MarkerTiming for display) """ - # Determine talking head source: - # 1. If config.talking_head.file is set, use that (may be JSON metadata) - # 2. 
Otherwise, use first video from videos.json - if config.talking_head.file: - video_path, metadata = resolve_video_file(project_path, config.talking_head.file) - # Create a VideoSource from the resolved metadata - if metadata: - talking_head = VideoSource( - file=str(video_path.relative_to(project_path)) if video_path.is_relative_to(project_path) else str(video_path), - preprocess=metadata.preprocess, - output_file=metadata.output.get("file") if metadata.output else None, + audio = audio or {} + audio_dir = audio_dir or project_path + + # Find the main narration video first (need skip value for timing adjustment) + narration_video_id = config.main_video + if not (narration_video_id and narration_video_id in videos): + raise ValueError("Main video not specified or not found in videos.") + narration_video = videos[narration_video_id] + + # Align markers to transcription timestamps + marker_timings = align_markers_to_transcription( + manuscript_text, transcription, slides=slides, videos=videos, audio=audio + ) + + # Apply skip offset: if narration video has skip, subtract it from all timestamps + # This accounts for the fact that the video will start at skip seconds, not 0 + narration_skip = narration_video.skip + if narration_skip > 0: + for timing in marker_timings: + if timing.timestamp >= 0: + timing.timestamp = max(0.0, timing.timestamp - narration_skip) + + # Build marker -> timestamp lookup + marker_times: dict[str, float] = {} + for timing in marker_timings: + if timing.timestamp >= 0: + marker_times[timing.marker_id] = timing.timestamp + + # Find shared_assets directory + shared_assets_dir = None + if (project_path / "shared_assets").exists(): + shared_assets_dir = project_path / "shared_assets" + elif (project_path.parent / "shared_assets").exists(): + shared_assets_dir = project_path.parent / "shared_assets" + + narration_video = videos[narration_video_id] + cutout = config.cutouts[narration_video.cutout] + + narration_videos: list[tuple[str, VideoSource, 
CutoutDefinition]] = [] + video_path = _resolve_video_path(videos_dir, narration_video, shared_assets_dir) + full_duration = get_video_duration(video_path) + # Adjust duration for skip (content starts at skip, so effective duration is less) + effective_duration = full_duration - narration_skip + # Get total duration from first always_visible video + narration_videos.append((narration_video_id, narration_video, cutout)) + # Resolve slide range to time range + time_offset = 0.0 + render_end_time = effective_duration + if slide_range: + start_slide, end_slide = slide_range + if start_slide not in marker_times: + raise ValueError( + f"Start slide '{start_slide}' not found in aligned markers" ) - else: - talking_head = VideoSource(file=config.talking_head.file) - else: - # Fall back to first video in videos.json - talking_head_id = next(iter(videos.keys())) - talking_head = videos[talking_head_id] - video_path = project_path / talking_head.file + time_offset = marker_times[start_slide] + if end_slide: + if end_slide not in marker_times: + raise ValueError( + f"End slide '{end_slide}' not found in aligned markers" + ) + render_end_time = marker_times[end_slide] - # Get video duration for end time calculations - total_duration = get_video_duration(video_path) + # Build events from aligned markers + slide_events = _extract_slide_events( + marker_timings, + slides, + effective_duration, + time_range=(time_offset, render_end_time) if slide_range else None, + ) - # Build slide events from transcript markers - slide_events = _extract_slide_events(transcript, slides, total_duration) + video_events = _extract_video_events( + marker_timings, + videos, + config.cutouts, + slides, + effective_duration, + time_range=(time_offset, render_end_time) if slide_range else None, + ) - # Derive slides directory from slides_path + audio_events = _extract_audio_events( + marker_timings, + audio, + time_range=(time_offset, render_end_time) if slide_range else None, + ) + + camera_events, 
initial_camera_state = _extract_camera_events( + marker_timings, + time_range=(time_offset, render_end_time) if slide_range else None, + ) + + # Apply time offset to all events (for partial rendering) + if time_offset > 0: + for event in slide_events: + event.start_time -= time_offset + event.end_time -= time_offset + for event in video_events: + event.start_time -= time_offset + event.end_time -= time_offset + for event in audio_events: + event.start_time = max(0, event.start_time - time_offset) + for event in camera_events: + event.time -= time_offset + + total_duration = render_end_time - time_offset + + # Handle narration pauses (videos that pause the narration track) + narration_pauses: list[NarrationPause] = [] + pause_video_events = [e for e in video_events if e.video_source.pause_narration] + + if pause_video_events: + # Sort pause events by their narration time + pause_video_events.sort(key=lambda e: e.start_time) + + cumulative_offset = 0.0 + for event in pause_video_events: + pause_duration = event.video_source.pause_narration + narration_time = event.start_time # Time in narration source + + # Create pause record (before applying offset to this event) + narration_pauses.append( + NarrationPause( + output_time=narration_time + cumulative_offset, + narration_time=narration_time, + duration=pause_duration, + video_id=event.video_id, + ) + ) + + # Offset all events that come AFTER this pause + for slide_event in slide_events: + if slide_event.start_time > narration_time: + slide_event.start_time += pause_duration + if slide_event.end_time > narration_time: + slide_event.end_time += pause_duration + + for vid_event in video_events: + if vid_event.start_time > narration_time: + vid_event.start_time += pause_duration + if vid_event.end_time > narration_time: + vid_event.end_time += pause_duration + + for aud_event in audio_events: + if aud_event.start_time > narration_time: + aud_event.start_time += pause_duration + + for cam_event in camera_events: + if 
cam_event.time > narration_time: + cam_event.time += pause_duration + + cumulative_offset += pause_duration + + # Update total duration + total_duration += cumulative_offset + + # Save narration end time (before outro) + narration_end_time = total_duration + + # Build outro events (plays after narration ends) + outro_events = _extract_outro_events( + config.outro, + videos, + config.cutouts, + total_duration, + videos_dir, + shared_assets_dir, + ) + + # Update total duration to include outro + if outro_events: + total_duration = outro_events[-1].end_time + + # Derive slides directory slides_json_path = project_path / config.slides_path slides_dir = slides_json_path.parent - return RenderPlan( + plan = RenderPlan( project_path=project_path, config=config, - talking_head=talking_head, slide_events=slide_events, total_duration=total_duration, slides=slides, + videos=videos, + video_events=video_events, + narration_videos=narration_videos, slides_dir=slides_dir, - talking_head_path=video_path, + videos_dir=videos_dir, + audio_events=audio_events, + audio=audio, + audio_dir=audio_dir, + camera_events=camera_events, + time_offset=time_offset, + initial_camera_state=initial_camera_state, + input_seek_time=time_offset, + shared_assets_dir=shared_assets_dir, + narration_pauses=narration_pauses, + outro_events=outro_events, + narration_end_time=narration_end_time, ) + return plan, marker_timings + + +def _resolve_video_path( + videos_dir: Path, + video_source: VideoSource, + shared_assets_dir: Path = None, +) -> Path: + """Resolve the actual video file path.""" + if video_source.is_shared and shared_assets_dir: + base_dir = shared_assets_dir + else: + base_dir = videos_dir + + if video_source.output_file: + video_path = base_dir / video_source.output_file + if video_path.exists(): + return video_path + webm_path = video_path.with_suffix(".mov") + if webm_path.exists(): + return webm_path + return base_dir / video_source.source_file + def _extract_slide_events( - transcript: 
list[TimedWord], + marker_timings: list[MarkerTiming], slides: dict[str, SlideDefinition], total_duration: float, + time_range: Optional[tuple[float, float]] = None, ) -> list[SlideEvent]: + """Extract slide events from aligned marker timings. + + Each slide starts at its own marker timestamp and ends when the next + slide's marker appears. Before the first slide, no slide is shown. """ - Extract slide events from transcript markers. + range_start, range_end = time_range if time_range else (0.0, float("inf")) - Each marker like [S1] in the transcript becomes a SlideEvent with: - - start_time: timestamp of the marker - - end_time: timestamp of next marker, or end of video - """ - # Find all markers in transcript - marker_times: list[tuple[float, str]] = [] + # Get slide markers in manuscript order (not sorted by timestamp!) + # The order in marker_timings reflects manuscript order + slide_markers: list[tuple[float, str]] = [] + for timing in marker_timings: + if timing.marker_id in slides and timing.timestamp >= 0: + slide_markers.append((timing.timestamp, timing.marker_id)) - for timed_word in transcript: - if timed_word.is_marker: - marker_id = timed_word.marker_id - if marker_id and marker_id in slides: - marker_times.append((timed_word.time, marker_id)) + if not slide_markers: + return [] - # Convert markers to slide events events: list[SlideEvent] = [] + for i, (marker_time, marker_id) in enumerate(slide_markers): + # Each slide starts at its own marker time + start_time = marker_time - for i, (start_time, marker_id) in enumerate(marker_times): - # End time is start of next marker, or end of video - if i + 1 < len(marker_times): - end_time = marker_times[i + 1][0] + # End time is when the NEXT slide's marker appears, or end of video + if i + 1 < len(slide_markers): + end_time = slide_markers[i + 1][0] else: end_time = total_duration - events.append(SlideEvent( - slide_id=marker_id, - start_time=start_time, - end_time=end_time, - slide_def=slides[marker_id], - 
)) + # Filter by time range + if end_time <= range_start or start_time >= range_end: + continue + start_time = max(start_time, range_start) + end_time = min(end_time, range_end) + + events.append( + SlideEvent( + slide_id=marker_id, + start_time=start_time, + end_time=end_time, + slide_def=slides[marker_id], + ) + ) + + return events + + +def _extract_video_events( + marker_timings: list[MarkerTiming], + videos: dict[str, VideoSource], + cutouts: dict[str, CutoutDefinition], + slides: dict[str, SlideDefinition], + total_duration: float, + time_range: Optional[tuple[float, float]] = None, +) -> list[VideoEvent]: + """ + Extract video events from aligned marker timings. + + - [video:xxx] events end at the next SLIDE marker + - [narration:xxx] events run until end + """ + range_start, range_end = time_range if time_range else (0.0, float("inf")) + + # Collect slide times for video: end time calculation + slide_times: list[float] = sorted( + [ + t.timestamp + for t in marker_timings + if t.marker_id in slides and t.timestamp >= 0 + ] + ) + + # Collect video markers + video_markers: list[tuple[float, str, str]] = [] # (time, video_id, type) + for timing in marker_timings: + if timing.timestamp < 0: + continue + + if timing.marker_id.startswith("video:"): + video_id = timing.marker_id[6:] + if video_id in videos: + video_source = videos[video_id] + if video_source.cutout and video_source.cutout in cutouts: + video_markers.append((timing.timestamp, video_id, "video")) + + elif timing.marker_id.startswith("narration:"): + video_id = timing.marker_id[10:] + if video_id in videos: + video_source = videos[video_id] + if video_source.cutout and video_source.cutout in cutouts: + video_markers.append((timing.timestamp, video_id, "narration")) + + events: list[VideoEvent] = [] + for start_time, video_id, marker_type in video_markers: + video_source = videos[video_id] + cutout = cutouts[video_source.cutout] + + if marker_type == "video": + # End at next slide + end_time = 
total_duration + for slide_time in slide_times: + if slide_time > start_time: + end_time = slide_time + break + else: + # narration: runs to end + end_time = total_duration + + # Filter by time range + if start_time < range_start or start_time >= range_end: + continue + end_time = min(end_time, range_end) + + events.append( + VideoEvent( + video_id=video_id, + start_time=start_time, + end_time=end_time, + video_source=video_source, + cutout=cutout, + ) + ) + + return events + + +def _extract_audio_events( + marker_timings: list[MarkerTiming], + audio: dict[str, AudioDefinition], + time_range: Optional[tuple[float, float]] = None, +) -> list[AudioEvent]: + """Extract audio events from aligned marker timings.""" + range_start, range_end = time_range if time_range else (0.0, float("inf")) + events: list[AudioEvent] = [] + + for timing in marker_timings: + if timing.timestamp < 0: + continue + + marker_id = timing.marker_id + if marker_id.startswith("A") and len(marker_id) > 1: + audio_id = marker_id[1:] + if audio_id in audio: + if timing.timestamp < range_start or timing.timestamp >= range_end: + continue + start_time = max(0, timing.timestamp - AUDIO_OFFSET_SECONDS) + events.append( + AudioEvent( + audio_id=audio_id, + start_time=start_time, + audio_def=audio[audio_id], + ) + ) + + return events + + +def _extract_camera_events( + marker_timings: list[MarkerTiming], + time_range: Optional[tuple[float, float]] = None, +) -> tuple[list[CameraEvent], CameraState]: + """ + Extract camera events from aligned marker timings. + + Camera state is cumulative. Returns (events, initial_state). 
+ """ + range_start, range_end = time_range if time_range else (0.0, float("inf")) + + events: list[CameraEvent] = [] + current_state = CameraState() + initial_state = CameraState() + found_range_start = False + + for timing in marker_timings: + if timing.timestamp < 0: + continue + + marker_id = timing.marker_id + if marker_id not in CAMERA_PRESETS: + continue + + preset = CAMERA_PRESETS[marker_id] + + # Determine new state based on marker type + if marker_id in ("Reset", "NoTilt"): + new_state = CameraState() + elif marker_id.startswith("Zoom"): + new_state = CameraState( + zoom=preset.zoom, + rotation=current_state.rotation, + pan_x=current_state.pan_x, + pan_y=current_state.pan_y, + focal_x=current_state.focal_x, + focal_y=current_state.focal_y, + ) + elif marker_id.startswith("Tilt"): + new_state = CameraState( + zoom=current_state.zoom, + rotation=preset.rotation, + pan_x=current_state.pan_x, + pan_y=current_state.pan_y, + focal_x=current_state.focal_x, + focal_y=current_state.focal_y, + ) + elif marker_id.startswith("Pan"): + new_state = CameraState( + zoom=current_state.zoom, + rotation=current_state.rotation, + pan_x=preset.pan_x, + pan_y=preset.pan_y, + focal_x=current_state.focal_x, + focal_y=current_state.focal_y, + ) + else: + new_state = preset + + # Capture state at range start + if not found_range_start and timing.timestamp >= range_start: + initial_state = current_state + found_range_start = True + + # Only emit events within range + if range_start <= timing.timestamp < range_end: + events.append( + CameraEvent( + time=timing.timestamp, + target_state=new_state, + duration=0.2, + easing="ease-out", + ) + ) + + current_state = new_state + + if not found_range_start: + initial_state = CameraState() + + return events, initial_state + + +def _extract_outro_events( + outro_video_ids: list[str], + videos: dict[str, VideoSource], + cutouts: dict[str, CutoutDefinition], + narration_end_time: float, + videos_dir: Path, + shared_assets_dir: Path = None, +) 
-> list[OutroEvent]: + """ + Extract outro events that play after the narration ends. + + Outro videos play in sequence, starting from narration_end_time. + Each video plays for its `take` duration (or full source duration if no take). + """ + events: list[OutroEvent] = [] + current_time = narration_end_time + + for video_id in outro_video_ids: + if video_id not in videos: + continue + + video_source = videos[video_id] + + # Get the video duration + video_path = _resolve_video_path(videos_dir, video_source, shared_assets_dir) + if video_path.exists(): + full_duration = get_video_duration(video_path) + else: + full_duration = 10.0 # Fallback + + # Use take if specified, otherwise use full duration + duration = video_source.take if video_source.take is not None else full_duration + + # Account for skip + duration = max(0, duration) + + # Resolve cutout (None = fullscreen) + cutout = None + if video_source.cutout and video_source.cutout in cutouts: + cutout = cutouts[video_source.cutout] + + events.append( + OutroEvent( + video_id=video_id, + start_time=current_time, + end_time=current_time + duration, + video_source=video_source, + cutout=cutout, + ) + ) + + current_time += duration return events diff --git a/gnommo/validator.py b/gnommo/validator.py index 34997a4..106f62d 100644 --- a/gnommo/validator.py +++ b/gnommo/validator.py @@ -3,7 +3,13 @@ from pathlib import Path from .errors import ValidationError, ValidationIssue -from .models import ProjectConfig, SlideDefinition, VideoSource, SLIDE_LAYOUTS +from .models import ( + ProjectConfig, + SlideDefinition, + VideoSource, + SLIDE_LAYOUTS, + CAMERA_PRESETS, +) def validate_project( @@ -12,6 +18,7 @@ def validate_project( config: ProjectConfig, slides: dict[str, SlideDefinition], videos: dict[str, VideoSource], + videos_dir: Path, malformed_markers: list[tuple[int, str]] = None, ) -> None: """ @@ -30,19 +37,59 @@ def validate_project( # Check for malformed markers first (these are likely typos) if malformed_markers: 
for line_num, marker_text in malformed_markers: - issues.append(ValidationIssue( - f"Malformed marker: {marker_text}", - project_path / "manuscript.txt", - line_num - )) + issues.append( + ValidationIssue( + f"Malformed marker: {marker_text}", + project_path / "manuscript.txt", + line_num, + ) + ) - # Check all manuscript markers have corresponding slides + # Check all manuscript markers have corresponding slides or videos for marker in manuscript_markers: + # Skip camera effect markers (Zoom0, TiltLeft, Reset, etc.) + if marker in CAMERA_PRESETS: + continue + # Skip audio markers (start with 'A' followed by audio id, e.g., Awoosh) + if marker.startswith("A") and len(marker) > 1 and marker[1:].isalnum(): + continue + # Validate video trigger markers (video:xxx) - slide-like videos + if marker.startswith("video:"): + video_id = marker[6:] # Remove 'video:' prefix + if video_id not in videos: + # Check if it's a file extension mismatch + hint = "" + if "." in video_id: + base_name = video_id.rsplit(".", 1)[0] + if base_name in videos: + hint = f" (Did you mean [video:{base_name}]? 
Don't include file extensions in markers)" + issues.append( + ValidationIssue( + f"Video marker [{marker}] referenced in manuscript but '{video_id}' not defined in videos.json{hint}", + project_path / "manuscript.txt", + ) + ) + continue + + # Validate narration trigger markers (narration:xxx) - continuous videos + if marker.startswith("narration:"): + video_id = marker[10:] # Remove 'narration:' prefix + if video_id not in videos: + issues.append( + ValidationIssue( + f"Narration marker [{marker}] referenced in manuscript but '{video_id}' not defined in videos.json", + project_path / "manuscript.txt", + ) + ) + continue + if marker not in slides: - issues.append(ValidationIssue( - f"Slide marker [{marker}] referenced in manuscript but not defined in slides.json", - project_path / "manuscript.txt" - )) + issues.append( + ValidationIssue( + f"Slide marker [{marker}] referenced in manuscript but not defined in slides.json", + project_path / "manuscript.txt", + ) + ) # Check all slide images exist # Slides are in the same directory as the slides.json file @@ -52,37 +99,68 @@ def validate_project( for slide_id, slide_def in slides.items(): image_path = slides_dir / slide_def.image if not image_path.exists(): - issues.append(ValidationIssue( - f"Slide image not found: {slide_def.image}", - slides_json_path - )) + issues.append( + ValidationIssue( + f"Slide image not found: {slide_def.image}", slides_json_path + ) + ) # Check slide type is valid if slide_def.type not in SLIDE_LAYOUTS: - issues.append(ValidationIssue( - f"Unknown slide type '{slide_def.type}' for slide {slide_id}. " - f"Valid types: {list(SLIDE_LAYOUTS.keys())}", - project_path / "slides.json" - )) + issues.append( + ValidationIssue( + f"Unknown slide type '{slide_def.type}' for slide {slide_id}. 
" + f"Valid types: {list(SLIDE_LAYOUTS.keys())}", + project_path / "slides.json", + ) + ) + + # Check all video files exist (paths relative to videos_dir or shared_assets) + videos_json_path = project_path / config.videos_path + + # Find shared_assets directory + shared_assets_dir = None + if (project_path / "shared_assets").exists(): + shared_assets_dir = project_path / "shared_assets" + elif (project_path.parent / "shared_assets").exists(): + shared_assets_dir = project_path.parent / "shared_assets" - # Check all video files exist for video_id, video_source in videos.items(): - video_path = project_path / video_source.file - if not video_path.exists(): - issues.append(ValidationIssue( - f"Video file not found: {video_source.file}", - project_path / "videos.json" - )) + # Determine base directory based on is_shared flag + if video_source.is_shared: + if shared_assets_dir: + base_dir = shared_assets_dir + else: + issues.append( + ValidationIssue( + f"Video '{video_id}' has is_shared=true but shared_assets directory not found", + videos_json_path, + ) + ) + continue + else: + base_dir = videos_dir - # Check preprocessed output exists if preprocessing is defined - if video_source.preprocess and video_source.output_file: - output_path = project_path / video_source.output_file + video_path = base_dir / video_source.source_file + if not video_path.exists(): + issues.append( + ValidationIssue( + f"Video file not found: {video_source.source_file}", + videos_json_path, + ) + ) + + # Check preprocessed output exists if filters are defined + if video_source.filter and video_source.output_file: + output_path = base_dir / video_source.output_file if not output_path.exists(): - issues.append(ValidationIssue( - f"Preprocessed output not found: {video_source.output_file}. " - f"Run with -a preprocess first.", - project_path / "videos.json" - )) + issues.append( + ValidationIssue( + f"Preprocessed output not found: {video_source.output_file}. 
" + f"Run with -a preprocess first.", + videos_json_path, + ) + ) # Check background exists (image or video) # Try 'background' first, fall back to deprecated 'background_video' @@ -94,38 +172,45 @@ def validate_project( # Try parent directory (shared_assets at repo root) bg_path = project_path.parent / bg_file if not bg_path.exists(): - issues.append(ValidationIssue( - f"Background not found: {bg_file}", - project_path / "project.json" - )) + issues.append( + ValidationIssue( + f"Background not found: {bg_file}", project_path / "project.json" + ) + ) # Check we have at least one video source if not videos: - issues.append(ValidationIssue( - "No video sources defined in videos.json", - project_path / "videos.json" - )) + issues.append( + ValidationIssue( + "No video sources defined in videos.json", project_path / "videos.json" + ) + ) # Check resolution is reasonable width, height = config.resolution if width < 100 or height < 100: - issues.append(ValidationIssue( - f"Resolution too small: {width}x{height}", - project_path / "project.json" - )) + issues.append( + ValidationIssue( + f"Resolution too small: {width}x{height}", project_path / "project.json" + ) + ) if width > 7680 or height > 4320: - issues.append(ValidationIssue( - f"Resolution too large: {width}x{height} (max 8K)", - project_path / "project.json" - )) + issues.append( + ValidationIssue( + f"Resolution too large: {width}x{height} (max 8K)", + project_path / "project.json", + ) + ) # Check FPS is reasonable if config.fps < 1 or config.fps > 120: - issues.append(ValidationIssue( - f"Invalid FPS: {config.fps} (must be 1-120)", - project_path / "project.json" - )) + issues.append( + ValidationIssue( + f"Invalid FPS: {config.fps} (must be 1-120)", + project_path / "project.json", + ) + ) # If any issues, raise ValidationError if issues: diff --git a/main.py b/main.py new file mode 100644 index 0000000..e11e752 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +import gnommo + +if __name__ == "__main__": + 
print("This is the main module.") + + gnommo.main() diff --git a/notes.json b/notes.json new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4076923 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +openai-whisper + diff --git a/tasks.md b/tasks.md new file mode 100644 index 0000000..c79d59d --- /dev/null +++ b/tasks.md @@ -0,0 +1,476 @@ +# Gnommo Feature Development Roadmap + +## Overview +Features to standardize the Keynote-to-YouTube workflow, so that once the presentation is complete, only a standardized recording session stands between you and a finished video. + +--- + +## 1. Video Description Generator + +**Command:** `gnommo -p description` + +Generate a complete YouTube description with citations, attributions, and chapters. + +--- + +### 1.1 Manuscript Citations (`[cite:...]`) + +Citations embedded in the manuscript represent sources, references, or links mentioned during narration. The text after `cite:` is the **literal reference** that should appear in the description. + +**Format in manuscript.txt:** +``` +[cite:Reference text exactly as it should appear] +``` + +**Examples:** +``` +[S3] +According to this study [cite:Smith et al. (2024) "Effects of AI on Productivity" - https://example.com/paper], +the effect is significant. + +[S7] +I'm using [cite:Keynote by Apple - https://apple.com/keynote] for all my presentations. + +[S12] +This technique was pioneered by [cite:Dr. Jane Doe, MIT Media Lab]. +``` + +**Output in description:** +``` +SOURCES & REFERENCES +━━━━━━━━━━━━━━━━━━━━ +1:23 - Smith et al. (2024) "Effects of AI on Productivity" - https://example.com/paper +4:56 - Keynote by Apple - https://apple.com/keynote +8:30 - Dr. 
Jane Doe, MIT Media Lab +``` + +**Requirements:** +- Parse `[cite:...]` markers from manuscript.txt +- Extract the literal text after `cite:` as the reference +- Align citations to timestamps (same fuzzy matching as other markers) +- Group citations in order of appearance +- Citations are NOT aligned for rendering (ignored by renderer) but ARE timestamped for description + +**Note:** `[cite:...]` markers should not affect video rendering or narration alignment - they are metadata-only markers for description generation. + +--- + +### 1.2 Pexels/Stock Footage Attribution + +Attribution for Pexels content is **not legally required** but is appreciated and professional. + +**Official Pexels attribution format:** +``` +by [Contributor Name] via Pexels +``` + +**Implementation:** +- Extend `videos.json` to include attribution metadata: + ```json + { + "beach_waves": { + "source_file": "pexels/beach.mp4", + "is_shared": true, + "attribution": { + "source": "pexels", + "creator": "John Doe", + "url": "https://pexels.com/video/12345" + } + } + } + ``` +- Auto-detect Pexels videos from `shared_assets/pexels/` folder +- Support Pexels metadata JSON files (if downloaded with video) +- Generate attribution section for video description: + ``` + STOCK FOOTAGE + ━━━━━━━━━━━━━ + Beach waves by John Doe via Pexels: https://pexels.com/video/12345 + City timelapse by Jane Smith via Pexels: https://pexels.com/video/67890 + ``` + +**Pexels License Notes** (from pexels.com/license): +- Free for personal and commercial use +- Attribution not required but appreciated +- Cannot sell unaltered copies +- Cannot redistribute on other stock platforms + +### 1.3 Complete Description Output + +**Output file:** `out/description_youtube.txt` + +Combine all elements into a ready-to-paste YouTube description. + +**Structure:** +``` +[Video description from project.json "description" field] + +CHAPTERS +━━━━━━━━ +0:00 Introduction +1:23 Topic One +3:45 Topic Two +... 
+ +REFERENCES +━━━━━━━━━━ +1:23 - Smith et al. (2024) "AI Study" - https://example.com +4:56 - Keynote by Apple - https://apple.com/keynote +... + +STOCK FOOTAGE +━━━━━━━━━━━━━ +Beach waves by John Doe via Pexels: https://pexels.com/video/12345 +... + +[Optional footer from project.json "footer" field - social links, subscribe CTA, etc.] +``` + +**project.json additions:** +```json +{ + "description": "In this video, I walk through the complete Gnommo workflow for creating YouTube videos from Keynote presentations.", + "footer": "Subscribe for more tutorials: https://youtube.com/@channel\nTwitter: https://twitter.com/handle" +} +``` + +**Requirements:** +- Pull video description from `project.json` "description" field +- Generate chapters from slide markers (see Section 2) +- Collect all `[cite:...]` references with timestamps +- Collect all Pexels/stock attributions from `videos.json` +- Append optional footer from `project.json` "footer" field +- Output to `out/description_youtube.txt` +- Sections with no content are omitted (e.g., no STOCK FOOTAGE section if none used) + +--- + +## 2. YouTube Chapter Markers + +**Command:** `gnommo -p chapters` + +Auto-generate chapter timestamps from slide markers. + +**Requirements:** +- Extract chapter titles from: + - Keynote slide titles (via presenter notes import) + - First sentence after each `[SN]` marker + - Optional `[chapter:Title]` markers for explicit chapter names +- Calculate timestamps from aligned marker timings +- Output copy-paste ready format: + ``` + CHAPTERS + ━━━━━━━━ + 0:00 Introduction + 1:23 What is Gnommo? + 3:45 Setting Up Your Project + 7:12 Recording Tips + 10:30 Rendering Your Video + 12:45 Outro + ``` +- Option to merge small chapters (minimum duration threshold) +- Support for nested chapters (main topics + subtopics) + +--- + +## 3. Subtitle/Caption Export + +**Command:** `gnommo -p subtitles` + +Generate subtitle files from Whisper transcription. 
+ +**Requirements:** +- Export formats: SRT, VTT, TXT +- Use existing word-level timestamps from transcription +- Smart line breaking (max characters per line, break at punctuation) +- Speaker diarization support (future: multiple speakers) +- Options: + - `--format srt|vtt|txt` + - `--max-chars 42` (characters per line) + - `--max-duration 5` (seconds per subtitle block) + +**Example output (SRT):** +``` +1 +00:00:01,500 --> 00:00:04,200 +Hello and welcome to this tutorial +on video editing with Gnommo. + +2 +00:00:04,500 --> 00:00:07,800 +Today we're going to cover +the complete workflow. +``` + +--- + +## 4. Thumbnail Generation + +**Command:** `gnommo -p thumbnail` + +Auto-generate thumbnail candidates from slides. + +**Requirements:** +- Designate thumbnail slides with `[thumbnail]` marker +- If no marker, use slide 1 or title slide +- Apply text overlays from config: + ```json + { + "thumbnail": { + "title_text": "Episode ${episode_number}", + "subtitle_text": "${title}", + "font": "Impact", + "text_color": "#FFFFFF", + "outline_color": "#000000", + "position": "bottom-left" + } + } + ``` +- Generate multiple variants: + - With/without text overlay + - Different zoom levels + - Different color treatments (saturated, high contrast) +- Output to `out/thumbnails/` folder +- Resolution: 1280x720 (YouTube standard) + +--- + +## 5. 
Intro/Outro Templates + +**Configuration in project.json:** +```json +{ + "intro": { + "template": "templates/intro_v2.mp4", + "duration": 3.5, + "transition": "fade", + "variables": { + "episode_number": "12", + "title": "Getting Started with Gnommo" + } + }, + "outro": { + "template": "templates/outro_subscribe.mp4", + "duration": 8.0, + "transition": "fade" + } +} +``` + +**Requirements:** +- Define intro/outro templates in `shared_assets/templates/` +- Auto-prepend intro before first slide +- Auto-append outro after last slide +- Support variable substitution in templates (episode number, title) +- Configurable transition types (fade, cut, wipe) +- End screen safe zone support (last 20 seconds) + +--- + +## 6. Multi-Platform Format Presets + +**Command:** `gnommo -p render --format <preset>` + +**Presets:** +| Preset | Aspect | Resolution | Notes | +|--------|--------|------------|-------| +| `youtube` | 16:9 | 1920x1080 | Default, standard horizontal | +| `youtube-4k` | 16:9 | 3840x2160 | 4K export | +| `shorts` | 9:16 | 1080x1920 | Vertical, auto-reframe slides | +| `podcast` | - | Audio only | MP3/M4A export for podcast feeds | +| `square` | 1:1 | 1080x1080 | Instagram/LinkedIn | + +**Requirements:** +- Auto-adjust cutout positions per format +- Smart slide reframing for vertical (zoom to content area) +- Separate output folders per format +- Batch export to multiple formats: `--format youtube,shorts,podcast` + +--- + +## 7. Teleprompter Script Generation + +**Command:** `gnommo -p teleprompter` + +Extract clean narration text for teleprompter display. + +**Requirements:** +- Strip all markers from manuscript +- Keep only spoken text +- Output formats: + - `--format txt` - Plain text + - `--format html` - Scrollable HTML page with large font + - `--format json` - For teleprompter apps +- Optional: Include slide thumbnails as visual cues +- Configurable font size and scroll speed hints + +**Example HTML output:** +```html +
+<h2>[SLIDE: Introduction]</h2>
+<p>Hello and welcome to this tutorial on video editing with Gnommo.</p>
+<h2>[SLIDE: What is Gnommo?]</h2>
+<p>Gnommo is a code-first video editing pipeline...</p>
+
+``` + +--- + +## 8. Recording Checklist Generator + +**Command:** `gnommo -p checklist` + +Generate a pre-recording checklist based on project configuration. + +**Output includes:** +- [ ] Camera settings (resolution, fps from project.json) +- [ ] Lighting setup (if green screen detected in videos.json) +- [ ] Audio check (microphone levels) +- [ ] Props/demos needed (parsed from `[video:...]` markers) +- [ ] Slide count and estimated duration +- [ ] Teleprompter ready +- [ ] Recording space clear + +**Customizable via `checklist_template.md` in project folder.** + +--- + +## 9. Audio Normalization + +**Automatic during render or standalone command:** +`gnommo -p normalize` + +**Requirements:** +- Target: -14 LUFS (YouTube standard) +- Apply loudness normalization to narration track +- Preserve dynamic range (avoid over-compression) +- Normalize intro/outro audio to match +- Option: `--target-lufs -14` + +**Implementation:** +- Use FFmpeg `loudnorm` filter +- Two-pass normalization for accurate results +- Report before/after levels + +--- + +## 10. Project Templates + +**Command:** `gnommo init --template