Files
gnommo/gnommo/preprocessor.py
T
2026-05-12 00:52:14 +02:00

2440 lines
80 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Preprocessing stage: apply filters to source videos."""
import os
import subprocess
import sys
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Optional
import shutil
from .errors import PreprocessError
from .models import (
VideoSource,
ChromaKeyConfig,
ColorGradeConfig,
GnommoKeyConfig,
AudioNormalizeConfig,
EQBand,
)
from typing import Union, Optional
# Number of parallel workers for chunk processing.
DEFAULT_CHUNK_WORKERS = 4
# Chunk duration in seconds for parallel filter processing
# (avoids huge intermediate files).
CHUNK_DURATION = 60
# Resolution presets for the preview/proxy workflow.
# Each entry is either None (keep the source resolution, no subdirectory) or a
# (width, height, subdir_name) tuple, where subdir_name is the directory under
# the videos directory that receives the downscaled copies.
# NOTE(review): "low" is 490x270, which is not exactly 16:9 (480x270 would be)
# — confirm the width is intentional.
RES_CONFIGS: dict[str, Optional[tuple]] = {
    "full": None, # no downscale, no subdir
    "low": (490, 270, "low"),
    "tiny": (320, 180, "proxy"), # "proxy" subdir kept for backward compat
}
# Legacy constants: mirror the width/height of the "tiny" preset.
PROXY_WIDTH, PROXY_HEIGHT = RES_CONFIGS["tiny"][:2] # type: ignore[index]
def get_video_duration(video_path: Path) -> float:
    """Return the duration of ``video_path`` in seconds, as reported by ffprobe.

    Best-effort: yields 0.0 when ffprobe exits with a non-zero status or when
    its output cannot be parsed as a float.
    """
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            str(video_path),
        ],
        capture_output=True,
        text=True,
    )
    if probe.returncode == 0:
        try:
            return float(probe.stdout.strip())
        except ValueError:
            # ffprobe succeeded but printed something non-numeric (e.g. "N/A").
            pass
    return 0.0
def _video_has_alpha(video_path: Path) -> bool:
"""Check if a video file has an alpha channel."""
cmd = [
"ffprobe",
"-v",
"error",
"-select_streams",
"v:0",
"-show_entries",
"stream=pix_fmt",
"-of",
"default=noprint_wrappers=1:nokey=1",
str(video_path),
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
return False
pix_fmt = result.stdout.strip()
# Pixel formats with alpha contain 'a' (yuva, rgba, bgra, etc.)
return "yuva" in pix_fmt or "rgba" in pix_fmt or "bgra" in pix_fmt
def format_time(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Uses ``"<h>h <m>m"`` from one hour up, ``"<m>m <s>s"`` from one minute up,
    and ``"<s>s"`` otherwise. All components are truncated to integers.
    """
    if seconds >= 3600:
        whole_hours, remainder = divmod(seconds, 3600)
        return f"{int(whole_hours)}h {int(remainder // 60)}m"
    if seconds >= 60:
        whole_minutes, leftover = divmod(seconds, 60)
        return f"{int(whole_minutes)}m {int(leftover)}s"
    return f"{int(seconds)}s"
def create_downscaled_video(
    source_path: Path,
    out_dir: Path,
    width: int,
    height: int,
    force: bool = False,
) -> Path:
    """Write a ``width`` x ``height`` copy of ``source_path`` into ``out_dir``.

    The copy keeps the source file name. An existing copy is reused unless
    ``force`` is set. ``out_dir`` is created when missing.

    Returns:
        Path of the downscaled file.

    Raises:
        PreprocessError: if FFmpeg exits with a non-zero status.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / source_path.name
    if not force and out_path.exists():
        return out_path

    input_args = ["ffmpeg", "-y", "-i", str(source_path)]
    video_args = [
        "-vf",
        f"scale={width}:{height}",
        "-c:v",
        "libx264",
        "-preset",
        "ultrafast",
        "-crf",
        "28",
        "-vsync",
        "cfr",
    ]
    # Audio is re-encoded (not copied) so both streams share the same PTS
    # origin; copying it with its original timestamps causes lip-sync drift
    # because of the libx264 encoder delay.
    audio_args = ["-c:a", "aac", "-ar", "48000"]
    cmd = input_args + video_args + audio_args + [str(out_path)]

    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        raise PreprocessError(
            f"Failed to downscale {source_path.name} to {width}x{height}",
            filter_type="downscale",
            command=" ".join(cmd),
            stderr=completed.stderr,
        )
    return out_path
# Legacy entry point kept for older callers.
def create_proxy_video(source_path: Path, proxy_dir: Path, force: bool = False) -> Path:
    """Downscale ``source_path`` into ``proxy_dir`` using the "tiny" preset size."""
    tiny_width, tiny_height, _subdir = RES_CONFIGS["tiny"]  # type: ignore[misc]
    return create_downscaled_video(
        source_path, proxy_dir, tiny_width, tiny_height, force
    )
def create_downscaled_videos(
    videos_dir: Path,
    videos: dict[str, VideoSource],
    res: str,
    force: bool = False,
    verbose: bool = False,
) -> Path:
    """
    Downscale every distinct source file referenced by ``videos`` for preset ``res``.

    Sources missing from ``videos_dir`` are skipped, and existing copies are
    kept unless ``force`` is set. For a preset without a size (``None`` in
    RES_CONFIGS) nothing is done and ``videos_dir`` itself is returned;
    otherwise the preset's output subdirectory is returned.
    """
    preset = RES_CONFIGS[res]
    if preset is None:
        return videos_dir  # full res — no subdir

    width, height, subdir = preset
    out_dir = videos_dir / subdir
    out_dir.mkdir(parents=True, exist_ok=True)

    # Several video entries may point at the same source file.
    unique_sources = {entry.source_file for entry in videos.values()}
    print(f" Creating {res} copies ({width}x{height})...")
    for name in sorted(unique_sources):
        original = videos_dir / name
        if not original.exists():
            if verbose:
                print(f" Skipping {name} (not found)")
            continue
        if not force and (out_dir / name).exists():
            if verbose:
                print(f" {name}: exists, skipping")
            continue
        print(f" {name}...", end=" ", flush=True)
        create_downscaled_video(original, out_dir, width, height, force)
        print("done")
    return out_dir
# Legacy entry point kept for older callers.
def create_proxies_for_videos(
    videos_dir: Path,
    videos: dict[str, VideoSource],
    force: bool = False,
    verbose: bool = False,
) -> Path:
    """Create "tiny"-preset copies of all sources; see create_downscaled_videos."""
    return create_downscaled_videos(
        videos_dir,
        videos,
        res="tiny",
        force=force,
        verbose=verbose,
    )
def ensure_downscaled_files_exist(
    source_dir: Path,
    res: str,
    force: bool = False,
    verbose: bool = False,
    skip_sources: set = None,
) -> Path:
    """
    Make sure every video directly inside ``source_dir`` has a ``res`` copy.

    Candidate files are regular files with a known video extension, excluding
    hidden files, files whose stem contains "_processed", and any name listed
    in ``skip_sources`` (e.g. sources whose full-res processed output will be
    used instead). Missing copies are created; with ``force`` all copies are
    rebuilt.

    Returns the preset's output subdirectory, or ``source_dir`` itself for a
    preset without a size.
    """
    preset = RES_CONFIGS[res]
    if preset is None:
        return source_dir
    width, height, subdir = preset

    video_extensions = {".mov", ".mp4", ".webm", ".avi", ".mkv", ".m4v"}
    out_dir = source_dir / subdir
    out_dir.mkdir(parents=True, exist_ok=True)

    def _is_candidate(entry: Path) -> bool:
        if not entry.is_file():
            return False
        if entry.suffix.lower() not in video_extensions:
            return False
        if "_processed" in entry.stem or entry.name.startswith("."):
            return False
        return skip_sources is None or entry.name not in skip_sources

    video_files = [entry for entry in source_dir.iterdir() if _is_candidate(entry)]
    if not video_files:
        if verbose:
            print(f" No video files found in {source_dir}")
        return out_dir

    missing = [
        entry for entry in video_files if force or not (out_dir / entry.name).exists()
    ]
    if not missing:
        if verbose:
            print(f" All {res} copies exist in {out_dir}")
        return out_dir

    print(f" Creating {len(missing)} {res} file(s) ({width}x{height})...")
    for video_file in missing:
        print(f" {video_file.name}...", end=" ", flush=True)
        # Everything in `missing` must be (re)built, hence force=True here.
        create_downscaled_video(video_file, out_dir, width, height, force=True)
        print("done")
    return out_dir
# Legacy entry point kept for older callers.
def ensure_proxy_files_exist(
    source_dir: Path,
    force: bool = False,
    verbose: bool = False,
) -> Path:
    """Ensure "tiny"-preset copies exist; see ensure_downscaled_files_exist."""
    return ensure_downscaled_files_exist(
        source_dir,
        res="tiny",
        force=force,
        verbose=verbose,
    )
import selectors, time, sys, subprocess
def run_ffmpeg_with_progress(cmd, duration, description="Processing"):
    """Run an FFmpeg command while drawing a live progress bar on stdout.

    ``-progress pipe:1 -nostats -loglevel warning`` is spliced into a copy of
    ``cmd`` (directly after ``-y`` when present, otherwise directly after the
    executable). stderr is merged into the same pipe as the progress output.

    Args:
        cmd: FFmpeg argument list; the caller's list is not modified.
        duration: Expected duration in seconds, used to convert the reported
            output time into a percentage. With ``duration <= 0`` the bar
            stays at 0% until the process exits.
        description: Label printed in front of the bar.

    Returns:
        subprocess.CompletedProcess with the process return code. ``stderr``
        holds every line read from the process (progress keys and log
        lines); ``stdout`` is always the empty string.
    """
    cmd = cmd.copy()
    insert_pos = cmd.index("-y") + 1 if "-y" in cmd else 1
    cmd[insert_pos:insert_pos] = [
        "-progress",
        "pipe:1",
        "-nostats",
        "-loglevel",
        "warning",
    ]
    p = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )
    sel = selectors.DefaultSelector()
    sel.register(p.stdout, selectors.EVENT_READ)

    bar_width = 30
    start_time = time.time()
    last_update = time.time()
    last_percent = 0
    seen_any_progress = False
    last_log_line = ""
    logs = []

    def draw(percent, suffix=""):
        filled = int(bar_width * percent / 100)
        # Filled / empty glyphs; both were empty strings before, which made
        # the bar invisible.
        bar = "█" * filled + "░" * (bar_width - filled)
        sys.stdout.write(
            f"\r {description}: [{bar}] {percent:3d}% {suffix} "
        )
        sys.stdout.flush()

    try:
        draw(0, "Initializing...")
        while True:
            # Once the process has exited, drain whatever is left and stop.
            if p.poll() is not None:
                while True:
                    line = p.stdout.readline()
                    if not line:
                        break
                    logs.append(line)
                break

            events = sel.select(timeout=0.2)
            if not events:
                if not seen_any_progress:
                    # Show elapsed time and the last FFmpeg log line during init.
                    elapsed = time.time() - start_time
                    hint = f" | {last_log_line[:50]}" if last_log_line else ""
                    draw(0, f"Initializing... ({elapsed:.0f}s){hint}")
                elif last_percent >= 99 and (time.time() - last_update) > 1.0:
                    draw(last_percent, "Finalizing...")
                continue

            for key, _ in events:
                line = key.fileobj.readline()
                if not line:
                    continue
                logs.append(line)
                # Remember the last non-empty line that is not a key=value
                # progress entry, for the init diagnostics above.
                stripped = line.strip()
                if stripped and "=" not in stripped:
                    last_log_line = stripped
                if not line.startswith("out_time_ms="):
                    continue
                val = line.split("=", 1)[1].strip()
                if val == "N/A":
                    continue
                try:
                    # Despite the key name, the value is treated as
                    # microseconds (divided by 1_000_000 to get seconds).
                    t_s = int(val) / 1_000_000
                except ValueError:
                    continue
                # Capped at 99: 100% is only shown after a successful exit.
                percent = min(99, int((t_s / duration) * 100)) if duration > 0 else 0
                last_percent = max(last_percent, percent)
                last_update = time.time()
                seen_any_progress = True
                draw(last_percent, "")
    finally:
        # Release the selector and the pipe even if the loop is interrupted.
        sel.close()
        p.stdout.close()

    # Completion
    if p.returncode == 0:
        draw(100, "Done\n")
    else:
        code = p.returncode
        # On macOS/Linux, -9 means SIGKILL (OOM kill by OS), -6 = SIGABRT
        signal_hint = (
            " (OOM kill)" if code == -9 else (" (abort)" if code == -6 else "")
        )
        sys.stdout.write(f"\n FFmpeg exited with code {code}{signal_hint}\n")
        sys.stdout.flush()
    return subprocess.CompletedProcess(
        cmd, p.returncode, stdout="", stderr="".join(logs)
    )
def _has_audio_stream(video_path: Path) -> bool:
"""Return True if the file has a real (non-ghost) audio stream."""
result = subprocess.run(
[
"ffprobe",
"-v",
"error",
"-analyzeduration",
"0",
"-probesize",
"1000000",
"-select_streams",
"a:0",
"-show_entries",
"stream=index,nb_frames",
"-of",
"csv=p=0",
str(video_path),
],
capture_output=True,
text=True,
)
output = result.stdout.strip()
if not output:
return False
parts = output.split(",")
if len(parts) >= 2 and parts[1].strip() == "0":
return False # Ghost audio track — header present but no sample data
return True
def check_audio_channel_silent(
    input_path: Path, channel: str, threshold_db: float = -60.0
) -> tuple[bool, float]:
    """
    Quick check whether the specified audio channel is silent.

    Runs an audio-only FFmpeg pass with the volumedetect filter on a mono
    down-mix of the chosen channel and compares the reported ``max_volume``
    with ``threshold_db``.

    Args:
        input_path: Video or audio file to analyse.
        channel: "left" selects input channel c0; any other value selects c1.
        threshold_db: Peak level (dB) below which the channel counts as silent.

    Returns:
        (is_silent, max_volume_db). When no ``max_volume`` line can be parsed
        from FFmpeg's output, (False, 0.0) is returned.
    """
    pan = "pan=mono|c0=c0" if channel == "left" else "pan=mono|c0=c1"
    cmd = [
        "ffmpeg",
        # Do not let FFmpeg read interactive commands from the inherited stdin.
        "-nostdin",
        "-i",
        str(input_path),
        # Skip the video stream: without this FFmpeg decodes every video
        # frame into the null sink, which defeats the "quick check".
        "-vn",
        "-af",
        f"{pan},volumedetect",
        "-f",
        "null",
        # The null muxer ignores the output name; "-" is portable, unlike
        # "/dev/null".
        "-",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    for line in result.stderr.splitlines():
        if "max_volume:" not in line:
            continue
        try:
            max_vol = float(line.split("max_volume:")[1].strip().replace(" dB", ""))
        except ValueError:
            continue
        return max_vol < threshold_db, max_vol
    return False, 0.0
def _resolve_auto_channel(input_path: Path, threshold_db: float = -60.0) -> str:
    """
    Pick the audio channel setting based on which channels carry signal.

    Returns "left" or "right" when exactly one channel is silent (naming the
    active one), and "both" in every other case — including when both
    channels are silent.
    """
    silent = {
        side: check_audio_channel_silent(input_path, side, threshold_db)[0]
        for side in ("left", "right")
    }
    if silent["left"] != silent["right"]:
        # Exactly one side is silent: use the other one.
        return "right" if silent["left"] else "left"
    return "both"
def detect_silence_bounds(
    input_path: Path,
    noise_threshold_db: float = -40.0,
    min_silence_duration: float = 0.3,
    verbose: bool = False,
) -> tuple[float, float]:
    """
    Detect when audio content starts and ends in a file.

    Uses FFmpeg's silencedetect filter (audio-only pass) to find the first
    and last non-silent moments. Useful for automatically computing
    skip/take values.

    Two common preamble shapes are handled:
    - File starts with silence → first_sound = end of that silence.
    - File starts with noise (e.g. clothing rustle) followed by a brief
      quiet gap before speech → first_sound = end of that first gap.

    Args:
        input_path: Video or audio file to analyse.
        noise_threshold_db: dB level below which audio is considered silent.
            Raise (e.g. -25) to treat low-level noise like clothing rustle
            as silence.
        min_silence_duration: Minimum gap length (seconds) that counts as
            silence. Shorter gaps are ignored.
        verbose: Print detected silence periods for debugging.

    Returns:
        (first_sound_time, last_sound_time) in seconds.
        first_sound_time — end of the first silence period that starts within
        the first 60 s, or 0.0 if there is none.
        last_sound_time — start of the trailing silence if the file ends in
        silence, otherwise the total duration.
    """
    total_duration = get_video_duration(input_path)
    cmd = [
        "ffmpeg",
        # Do not let FFmpeg read interactive commands from the inherited stdin.
        "-nostdin",
        "-i",
        str(input_path),
        # Audio analysis only: skip decoding the video stream.
        "-vn",
        "-af",
        f"silencedetect=noise={noise_threshold_db}dB:duration={min_silence_duration}",
        "-f",
        "null",
        # The null muxer ignores the output name; "-" is portable, unlike
        # "/dev/null".
        "-",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Parse silence_start / silence_end lines from stderr
    silence_periods: list[tuple[float, float]] = []
    pending_start: float | None = None
    for line in result.stderr.splitlines():
        if "silence_start:" in line:
            try:
                pending_start = float(line.split("silence_start:")[1].strip())
            except ValueError:
                pass
        elif "silence_end:" in line and pending_start is not None:
            try:
                # The end line carries extra fields after a "|" separator.
                end_t = float(line.split("silence_end:")[1].split("|")[0].strip())
                silence_periods.append((pending_start, end_t))
                pending_start = None
            except ValueError:
                pass
    # File ended while still in silence — close the period at total_duration
    if pending_start is not None:
        silence_periods.append((pending_start, total_duration))

    if verbose:
        print(f"\n silence periods ({len(silence_periods)}):")
        for s, e in silence_periods:
            print(f" {s:.3f}s {e:.3f}s")

    # --- First sound ---
    # Take the end of the FIRST silence period found in the preamble window
    # (first 60 s). This handles both:
    #   • file starts with silence → silence[0].start ≈ 0
    #   • file starts with noise (crumpling etc.) then has a brief quiet gap
    #     before speech → silence[0].start > 0
    # If no silence is found at all the whole file is assumed to be content.
    PREAMBLE_LIMIT = 60.0
    first_sound = 0.0
    for s_start, s_end in silence_periods:
        if s_start < PREAMBLE_LIMIT:
            first_sound = s_end
            break

    # --- Last sound ---
    # Where the trailing silence begins (if the file ends with silence).
    last_sound = total_duration
    if silence_periods and silence_periods[-1][1] >= total_duration - 0.05:
        last_sound = silence_periods[-1][0]
    return first_sound, last_sound
def preprocess_video(
    videos_dir: Path,
    video_id: str,
    video_source: VideoSource,
    verbose: bool = False,
    force: bool = False,
    custom_gnommo_scratch: Optional[Path] = None,
    res: str = "full",
) -> Path:
    """
    Run the configured preprocessing filters for one video source.

    Consecutive video filters (chroma_key, mask, color_grade, gnommokey) are
    grouped and applied together; transcribe and audio_normalize each run as
    a step of their own. skip/take are not applied here.

    Args:
        videos_dir: Directory containing the video files.
        video_id: ID of the video being processed (used in scratch file names).
        video_source: VideoSource with source_file, filter, and output_file.
        verbose: Print extra progress information.
        force: Passed on to the downscale and transcribe steps.
        custom_gnommo_scratch: Optional external directory for intermediate
            files (e.g., SSD); a per-video subdirectory is used inside it.
        res: Resolution preset — when not "full", the source is downscaled
            before filtering.

    Returns:
        The source path when no filters are defined; otherwise the final
        output_file path if one is configured, else the last intermediate.

    Raises:
        PreprocessError: source missing, selected audio channel silent,
            audio_normalize without an audio stream, or unknown filter type.
    """
    source_path = videos_dir / video_source.source_file
    if not video_source.filter:
        # Nothing to do: hand back the untouched source.
        return source_path

    if custom_gnommo_scratch:
        gnommo_scratch = custom_gnommo_scratch / video_id
    else:
        gnommo_scratch = videos_dir / "intermediate"
    gnommo_scratch.mkdir(parents=True, exist_ok=True)

    current_input = source_path
    if not current_input.exists():
        raise PreprocessError(
            f"Source video not found: {current_input}",
            filter_type=None,
        )

    # For reduced presets, shrink the raw source first so every later filter
    # works on the small file.
    preset = RES_CONFIGS.get(res) if res != "full" else None
    if preset:
        width, height, _ = preset
        print(f" Downscaling source to {width}x{height} ({res})...")
        current_input = create_downscaled_video(
            current_input, gnommo_scratch / f"raw_{res}", width, height, force
        )

    # Resolve the audio channel setting, failing early on a silent channel.
    channel = video_source.use_audio_channels
    if channel == "auto":
        channel = _resolve_auto_channel(current_input)
        print(f" Auto channel detection: using '{channel}'")
    elif channel in ("left", "right"):
        is_silent, max_vol = check_audio_channel_silent(current_input, channel)
        if is_silent:
            raise PreprocessError(
                f"Audio channel '{channel}' is silent (max_volume={max_vol:.1f} dB). "
                f"Wrong microphone channel selected?",
                filter_type="audio_check",
            )

    # Filter types that can share a single pass.
    VIDEO_FILTER_TYPES = {"chroma_key", "mask", "color_grade", "gnommokey"}

    # A video filter joins the previous batch when that batch is also made of
    # video filters; everything else starts a batch of its own.
    filter_batches: list[list[dict]] = []
    for filter_config in video_source.filter:
        continues_video_run = (
            filter_config.get("type") in VIDEO_FILTER_TYPES
            and bool(filter_batches)
            and filter_batches[-1][0].get("type") in VIDEO_FILTER_TYPES
        )
        if continues_video_run:
            filter_batches[-1].append(filter_config)
        else:
            filter_batches.append([filter_config])

    # Intermediates produced along the way, removed once the final copy exists.
    intermediate_files: list[Path] = []
    batch_num = 0
    for batch in filter_batches:
        first_filter_type = batch[0].get("type")

        if first_filter_type in VIDEO_FILTER_TYPES:
            filter_names = "+".join(f.get("type") for f in batch)
            print(f" Video filters (combined): {filter_names}")
            step_output = gnommo_scratch / f"{video_id}_batch{batch_num}.mov"
            intermediate_files.append(step_output)
            apply_combined_video_filters_chunked(
                current_input,
                step_output,
                batch,
                verbose,
                take=None,
                scratch_dir=gnommo_scratch / "chunks",
            )
            current_input = step_output
            batch_num += 1
            continue

        if first_filter_type == "transcribe":
            # Does not produce a new video; the input is carried forward.
            print(" Filter: transcribe")
            apply_transcribe(current_input, batch[0], verbose, force)
            continue

        if first_filter_type == "audio_normalize":
            print(" Filter: audio_normalize")
            if not _has_audio_stream(current_input):
                raise PreprocessError(
                    f"audio_normalize requires an audio stream, but '{current_input.name}' has none.\n"
                    f" Check that the source file has audio, or remove audio_normalize from the filter list.",
                    filter_type="audio_normalize",
                    command="",
                    stderr="",
                )
            step_output = gnommo_scratch / f"{video_id}_batch{batch_num}_audio.mov"
            intermediate_files.append(step_output)
            apply_audio_normalize(
                current_input,
                step_output,
                batch[0],
                verbose,
                take=None,
                use_audio_channels=channel,
                skip_loudnorm=video_source.defer_loudnorm,
            )
            current_input = step_output
            batch_num += 1
            continue

        raise PreprocessError(
            f"Unknown filter type: {first_filter_type}",
            filter_type=first_filter_type,
        )

    if not video_source.output_file:
        # No final destination configured: the last processed file is the result.
        return current_input

    final_output = videos_dir / video_source.output_file
    final_output.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(current_input, final_output)
    if verbose:
        print(f" Final output: {final_output}")

    for intermediate_file in intermediate_files:
        if intermediate_file.exists():
            intermediate_file.unlink()
            if verbose:
                print(f" Removed intermediate: {intermediate_file.name}")

    # Drop the scratch directory only if nothing else lives in it.
    try:
        gnommo_scratch.rmdir()
    except OSError:
        pass  # Directory not empty (other videos may have intermediates)
    return final_output
def apply_combined_video_filters(
    input_path: Path,
    output_path: Path,
    filters: list[dict],
    verbose: bool = False,
    take: float = None,
) -> None:
    """
    Apply several video filters to ``input_path`` in one FFmpeg invocation.

    Each config of type chroma_key, mask, color_grade or gnommokey is turned
    into a filter string; the strings are joined with commas into a single
    ``-vf`` chain. Configs of any other type are ignored. The output is
    encoded with prores_ks (profile 4, yuva444p10le) and pcm_s16le audio.

    Args:
        input_path: File to read.
        output_path: File to write (overwritten).
        filters: Filter config dicts, applied in order.
        verbose: Print the filter chain and full command.
        take: Optional duration limit in seconds, passed as ``-t`` before ``-i``.

    Raises:
        PreprocessError: if FFmpeg exits with a non-zero status.
    """
    builders = {
        "chroma_key": build_chroma_key_filter,
        "mask": build_mask_filter,
        "color_grade": build_color_grade_filter,
        "gnommokey": build_gnommokey_filter,
    }
    filter_parts: list[str] = []
    for filter_config in filters:
        builder = builders.get(filter_config.get("type"))
        if builder is not None:
            filter_parts.append(builder(filter_config))
    video_filter = ",".join(filter_parts)

    cmd = ["ffmpeg", "-y"]
    if take is not None:
        cmd += ["-t", str(take)]
    cmd += [
        "-i",
        str(input_path),
        "-vf",
        video_filter,
        "-c:v",
        "prores_ks",
        "-profile:v",
        "4",  # ProRes 4444
        "-pix_fmt",
        "yuva444p10le",  # 10-bit with alpha
        "-c:a",
        "pcm_s16le",  # Lossless audio
        str(output_path),
    ]
    if verbose:
        print(f" Combined filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    # The progress bar needs a total; probe the file unless `take` gives it.
    duration = get_video_duration(input_path) if take is None else take
    result = run_ffmpeg_with_progress(cmd, duration, "Processing")
    if result.returncode != 0:
        raise PreprocessError(
            "Combined video filter failed",
            filter_type="combined",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def build_chroma_key_filter(config: dict) -> str:
    """Build the FFmpeg filter chain string for a chroma_key config.

    The chain is, in order:
    1. ``chromakey`` with the configured colour, similarity and blend;
    2. ``despill`` (green) when ``spill`` > 0;
    3. up to 5 ``erosion`` passes on the alpha plane when ``edge_erode`` > 0;
    4. an RGBA ``geq`` pass that forces alpha to 255 for pixels within
       ``protect_tolerance`` of ``protect_color``, when one is set.

    Returns:
        Comma-joined filter string.
    """
    chroma_config = parse_chroma_key_config(config)
    r, g, b = chroma_config.color
    hex_color = f"0x{r:02x}{g:02x}{b:02x}"
    parts = [
        f"chromakey={hex_color}:{chroma_config.similarity:.3f}:{chroma_config.blend:.3f}"
    ]
    if chroma_config.spill > 0:
        parts.append(f"despill=type=green:mix={chroma_config.spill:.3f}")

    # Edge erosion: shrink the alpha mask to remove green fringe.
    # erosion's thresholdN limits the maximum change for plane N, and 0 leaves
    # the plane unchanged. So the colour planes (0-2) get 0 and alpha (plane 3)
    # gets 65535. The previous values were inverted: they eroded the colour
    # planes and left alpha untouched.
    if chroma_config.edge_erode > 0:
        erode_passes = min(chroma_config.edge_erode, 5)  # Cap at 5 passes
        parts.append("format=yuva444p")
        for _ in range(erode_passes):
            parts.append(
                "erosion=threshold0=0:threshold1=0:threshold2=0:threshold3=65535"
            )

    # Color protection: restore alpha for pixels matching the protected color.
    # Runs AFTER chromakey/despill/erosion so wrongly keyed pixels come back.
    if chroma_config.protect_color:
        pr, pg, pb = chroma_config.protect_color
        # Convert tolerance from the 0-1 range to the pixel range (0-255)
        tol = int(chroma_config.protect_tolerance * 255)
        # geq's r/g/b/alpha functions need RGBA input
        parts.append("format=rgba")
        # between(value, min, max) is 1 when min <= value <= max; multiplying
        # the three tests gives a logical AND.
        condition = (
            f"between(r(X,Y),{max(0, pr-tol)},{min(255, pr+tol)})*"
            f"between(g(X,Y),{max(0, pg-tol)},{min(255, pg+tol)})*"
            f"between(b(X,Y),{max(0, pb-tol)},{min(255, pb+tol)})"
        )
        # Matching pixels become fully opaque, all others keep their alpha.
        parts.append(
            f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='if({condition},255,alpha(X,Y))'"
        )
    return ",".join(parts)
def build_mask_filter(config: dict) -> str:
    """Build an FFmpeg ``geq`` filter that makes the frame borders transparent.

    ``left``, ``right``, ``top`` and ``bottom`` are read from ``config`` as
    floats (default 0) and used as fractions of the frame width/height.
    Pixels inside any requested border get alpha 0; the rest keep their alpha.
    Returns the no-op filter ``"copy"`` when no border is > 0.
    """
    left, right, top, bottom = (
        float(config.get(side, 0)) for side in ("left", "right", "top", "bottom")
    )
    border_tests = (
        (left, f"lt(X,W*{left})"),
        (right, f"gt(X,W*{1-right})"),
        (top, f"lt(Y,H*{top})"),
        (bottom, f"gt(Y,H*{1-bottom})"),
    )
    conditions = [test for amount, test in border_tests if amount > 0]
    if not conditions:
        return "copy"  # No-op filter
    # Summing the 0/1 tests acts as a logical OR inside if().
    in_border = "+".join(conditions)
    return (
        "geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':"
        f"a='if({in_border},0,alpha(X,Y))'"
    )
def build_color_grade_filter(config: dict) -> str:
    """Build the FFmpeg filter chain string for a color_grade config.

    The chain starts with ``format=rgba`` and ends with ``format=yuva444p10le``
    so the alpha channel is carried through. In between, in this order and
    only when they differ from their neutral values: ``colorbalance``, the
    ``curves`` preset, ``eq`` (contrast/brightness/saturation) and custom
    ``curves``.
    """
    grade = parse_color_grade_config(config)
    chain: list[str] = ["format=rgba"]

    # Colour balance: shadows (s), midtones (m), highlights (h) per channel.
    balance_names = ("rs", "gs", "bs", "rm", "gm", "bm", "rh", "gh", "bh")
    balance_terms = [
        f"{name}={getattr(grade, name):.3f}"
        for name in balance_names
        if getattr(grade, name) != 0
    ]
    if balance_terms:
        chain.append(f"colorbalance={':'.join(balance_terms)}")

    # Named curves preset.
    preset = grade.curves_preset
    if preset and preset != "none":
        chain.append(f"curves=preset={preset}")

    # EQ terms, each paired with the neutral value that means "no change".
    eq_neutral = (("contrast", 1.0), ("brightness", 0.0), ("saturation", 1.0))
    eq_terms = [
        f"{name}={getattr(grade, name):.3f}"
        for name, neutral in eq_neutral
        if getattr(grade, name) != neutral
    ]
    if eq_terms:
        chain.append(f"eq={':'.join(eq_terms)}")

    # Custom per-channel curves.
    curve_specs = (
        ("r", grade.curves_r),
        ("g", grade.curves_g),
        ("b", grade.curves_b),
        ("master", grade.curves_master),
    )
    curve_terms = [f"{target}='{points}'" for target, points in curve_specs if points]
    if curve_terms:
        chain.append(f"curves={':'.join(curve_terms)}")

    # Back to a YUV format with alpha for downstream filters.
    chain.append("format=yuva444p10le")
    return ",".join(chain)
def parse_color_grade_config(config: dict) -> ColorGradeConfig:
    """Convert a color_grade config dict into a ColorGradeConfig.

    Missing keys fall back to neutral values: 0.0 for the nine colour-balance
    terms and brightness, 1.0 for contrast and saturation, "none" for the
    curves preset and "" for the custom curves.
    """
    # Shadows (s), midtones (m) and highlights (h) for each of r/g/b.
    balance_names = ("rs", "gs", "bs", "rm", "gm", "bm", "rh", "gh", "bh")
    balance = {name: float(config.get(name, 0.0)) for name in balance_names}
    custom_curves = {
        name: config.get(name, "")
        for name in ("curves_r", "curves_g", "curves_b", "curves_master")
    }
    return ColorGradeConfig(
        **balance,
        curves_preset=config.get("curves_preset", "none"),
        contrast=float(config.get("contrast", 1.0)),
        brightness=float(config.get("brightness", 0.0)),
        saturation=float(config.get("saturation", 1.0)),
        **custom_curves,
    )
def build_gnommokey_filter(config: dict) -> str:
    """Build the FFmpeg filter chain string for a gnommokey config.

    This is an RGB colour-difference keyer. The chain is, in order:
    1. ``format=rgba``;
    2. a ``geq`` pass that derives alpha from how far the screen channel
       (G for green screens, B for blue) exceeds the stronger of the other two,
       - scaled by ``screen_gain`` (100 = 1.0),
       - boosted for pixels whose luma is close to the screen colour's luma
         when ``screen_balance`` > 0,
       - remapped through ``clip_black`` / ``clip_white`` (0-100);
    3. a despill ``geq`` pass shifting spill toward ``despill_bias`` (only
       when a bias is set and ``despill_strength`` > 0);
    4. an edge despill ``geq`` pass suppressing screen-colour excess on
       semi-transparent pixels;
    5. up to 5 ``erosion`` passes on the alpha plane (``edge_erode``);
    6. an ``avgblur`` on the alpha plane only (``edge_soften``, radius <= 5);
    7. ``format=yuva444p10le``.

    Returns:
        Comma-joined filter string.
    """
    cfg = parse_gnommokey_config(config)
    parts: list[str] = []

    sr, sg, sb = cfg.screen_color
    # Green screen if G is at least as strong as B, otherwise blue screen.
    is_green_screen = sg >= sb
    key_ch, other_ch = ("g", "b") if is_green_screen else ("b", "g")
    # How much the screen channel exceeds the stronger of the remaining two.
    excess_expr = f"max(0,{key_ch}(X,Y)-max(r(X,Y),{other_ch}(X,Y)))"

    # geq's r/g/b/alpha functions operate on RGBA input.
    parts.append("format=rgba")

    gain = cfg.screen_gain / 100.0
    balance = cfg.screen_balance / 100.0

    # screen_balance: at 0 the key is the pure colour difference; above 0,
    # pixels whose luma is close to the screen's luma are keyed more strongly.
    screen_y = int(0.299 * sr + 0.587 * sg + 0.114 * sb)
    if balance > 0:
        luma_expr = "(0.299*r(X,Y)+0.587*g(X,Y)+0.114*b(X,Y))"
        luma_boost = f"(1+{balance:.2f}*(1-abs({luma_expr}-{screen_y})/128))"
        key_expr = f"({excess_expr})*{luma_boost}"
    else:
        key_expr = f"({excess_expr})"

    # screen_gain of 100 = 1.0; the extra 2.5 stretches the colour difference
    # toward the 0-255 range.
    scale_factor = gain * 2.5
    key_expr = f"({key_expr})*{scale_factor:.3f}"

    # clip_black / clip_white (0-100) remap [clip_b, clip_w] onto [0, 255].
    clip_b = cfg.clip_black * 2.55
    clip_w = cfg.clip_white * 2.55
    if clip_w > clip_b:
        range_scale = 255.0 / (clip_w - clip_b)
        key_expr = f"clip(({key_expr}-{clip_b:.1f})*{range_scale:.3f},0,255)"
    else:
        key_expr = f"clip({key_expr},0,255)"

    # Invert: high key value (screen colour) = low alpha (transparent).
    alpha_expr = f"255-{key_expr}"
    parts.append(f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'")

    # Despill: lerp each channel toward the bias colour in proportion to the
    # amount of spill and the configured strength.
    if cfg.despill_bias and cfg.despill_strength > 0:
        br, bg, bb = cfg.despill_bias
        strength = cfg.despill_strength
        factor_expr = f"({excess_expr}/255*{strength:.2f})"
        new_r = f"clip(r(X,Y)+({br}-r(X,Y))*{factor_expr},0,255)"
        new_g = f"clip(g(X,Y)+({bg}-g(X,Y))*{factor_expr},0,255)"
        new_b = f"clip(b(X,Y)+({bb}-b(X,Y))*{factor_expr},0,255)"
        parts.append(f"geq=r='{new_r}':g='{new_g}':b='{new_b}':a='alpha(X,Y)'")

    # Edge-aware despill: the factor peaks at alpha=128 and falls to 0 at
    # alpha 0 and 255, so only semi-transparent edge pixels are affected.
    # There the screen channel is pulled down toward max of the other two.
    # NOTE(review): applied regardless of the despill_bias / despill_strength
    # settings — confirm this pass is meant to be unconditional.
    edge_factor = "min(alpha(X,Y),255-alpha(X,Y))/128"
    suppressed = f"clip({key_ch}(X,Y)-({excess_expr})*({edge_factor}),0,255)"
    channel_exprs = {"r": "r(X,Y)", "g": "g(X,Y)", "b": "b(X,Y)"}
    channel_exprs[key_ch] = suppressed
    parts.append(
        f"geq=r='{channel_exprs['r']}':g='{channel_exprs['g']}'"
        f":b='{channel_exprs['b']}':a='alpha(X,Y)'"
    )

    # Edge erosion: shrink the alpha channel to remove fringe.
    # threshold=0 leaves a plane unchanged, 65535 allows full erosion, so only
    # plane 3 (alpha) is eroded.
    if cfg.edge_erode > 0:
        erode_passes = min(cfg.edge_erode, 5)
        for _ in range(erode_passes):
            parts.append(
                "erosion=threshold0=0:threshold1=0:threshold2=0:threshold3=65535"
            )

    # Edge softening: blur the alpha plane only. planes=8 is the bitmask for
    # plane 3. The previous "alphaextract,avgblur=...[blur]" fragment replaced
    # the picture with its alpha and left a dangling pad label inside a
    # linear chain.
    if cfg.edge_soften > 0:
        radius = min(int(cfg.edge_soften), 5)
        if radius > 0:
            parts.append(f"avgblur=sizeX={radius}:sizeY={radius}:planes=8")

    # Hand a YUV format with alpha to whatever follows.
    parts.append("format=yuva444p10le")
    return ",".join(parts)
def parse_gnommokey_config(config: dict) -> GnommoKeyConfig:
    """Build a GnommoKeyConfig from a raw gnommokey filter dictionary.

    Missing scalar keys fall back to the defaults used below. Colour-like
    entries are only accepted as 3-element lists (converted to tuples):

    - ``screen_color``: anything else is replaced by (0, 177, 64).
    - ``despill_bias`` / ``alpha_bias``: falsy values are passed through
      untouched; any other value that is not a 3-element list becomes None.
    """

    def is_triplet(value) -> bool:
        """Return True for a list holding exactly three items."""
        return isinstance(value, list) and len(value) == 3

    def optional_triplet(key: str):
        """Read an optional bias entry, applying the rules described above."""
        raw = config.get(key)
        if not raw:
            return raw
        return tuple(raw) if is_triplet(raw) else None

    raw_screen = config.get("screen_color", [0, 177, 64])
    screen_color = tuple(raw_screen) if is_triplet(raw_screen) else (0, 177, 64)

    despill_bias = optional_triplet("despill_bias")
    alpha_bias = optional_triplet("alpha_bias")

    return GnommoKeyConfig(
        screen_color=screen_color,
        screen_gain=float(config.get("screen_gain", 100.0)),
        screen_balance=float(config.get("screen_balance", 50.0)),
        clip_black=float(config.get("clip_black", 0.0)),
        clip_white=float(config.get("clip_white", 100.0)),
        despill_bias=despill_bias,
        despill_strength=float(config.get("despill_strength", 0.5)),
        alpha_bias=alpha_bias,
        edge_erode=int(config.get("edge_erode", 0)),
        edge_soften=float(config.get("edge_soften", 0.0)),
    )
def apply_combined_video_filters_chunked(
    input_path: Path,
    output_path: Path,
    filters: list[dict],
    verbose: bool = False,
    take: Optional[float] = None,
    scratch_dir: Optional[Path] = None,
) -> None:
    """
    Apply video filters using chunk-based processing for large files.

    Videos no longer than CHUNK_DURATION are encoded straight to
    ``output_path``. Longer videos are:
      1. Split into CHUNK_DURATION-second chunks
      2. Filtered and encoded chunk by chunk (ProRes 4444 with alpha) on a
         thread pool of at most DEFAULT_CHUNK_WORKERS workers
      3. Joined into the final output with the FFmpeg concat demuxer
         (stream copy)

    Chunking allows parallel processing and avoids huge intermediate files.
    Intermediate chunk files and the concat list are removed whether or not
    processing succeeds.

    Args:
        input_path: Source video.
        output_path: Destination file.
        filters: Filter config dictionaries, handed unchanged to
            _process_chunk_to_prores4444.
        verbose: Print extra diagnostics (suppressed for the parallel
            chunk encodes).
        take: Optional duration limit in seconds; when None the duration is
            probed from the input.
        scratch_dir: Directory for intermediate chunks; defaults to a
            "chunks" directory next to the output.

    Raises:
        PreprocessError: If concatenation fails. Errors raised while
            processing a chunk propagate unchanged.
    """
    duration = take if take is not None else get_video_duration(input_path)

    # Short video: process directly without chunking
    if duration <= CHUNK_DURATION:
        _process_chunk_to_prores4444(
            input_path, output_path, filters, 0, duration, verbose, take, True
        )
        return

    # Long video: process in chunks (parallel)
    if scratch_dir is None:
        scratch_dir = output_path.parent / "chunks"
    scratch_dir.mkdir(parents=True, exist_ok=True)

    # Each task is (index, chunk_path, start_time, chunk_duration)
    chunk_tasks: list[tuple] = []
    for i in range(int(duration / CHUNK_DURATION) + 1):
        start_time = i * CHUNK_DURATION
        chunk_duration = min(CHUNK_DURATION, duration - start_time)
        if chunk_duration <= 0:
            break
        chunk_tasks.append(
            (i, scratch_dir / f"chunk_{i:04d}.mov", start_time, chunk_duration)
        )
    chunk_files: list[Path] = [task[1] for task in chunk_tasks]
    concat_list = scratch_dir / "concat.txt"

    num_workers = min(DEFAULT_CHUNK_WORKERS, len(chunk_tasks))
    print(
        f" Processing {len(chunk_tasks)} chunks in parallel ({num_workers} workers)"
    )

    def process_chunk_task(task):
        i, chunk_path, start_time, chunk_dur = task
        _process_chunk_to_prores4444(
            input_path,
            chunk_path,
            filters,
            start_time,
            chunk_dur,
            verbose=False,  # Suppress verbose in parallel mode
            take=chunk_dur,
        )
        return i, chunk_path

    try:
        completed = 0
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [
                executor.submit(process_chunk_task, task) for task in chunk_tasks
            ]
            try:
                for future in as_completed(futures):
                    i, _ = future.result()
                    completed += 1
                    print(
                        f" Completed chunk {i+1}/{len(chunk_tasks)} ({completed}/{len(chunk_tasks)} done)"
                    )
            finally:
                # No-op for futures that already ran; after a failure this
                # stops chunks that are still queued from being encoded.
                for future in futures:
                    future.cancel()

        # Concatenate chunks into final output. Inside a single-quoted
        # concat entry a literal quote must be written as '\''.
        with open(concat_list, "w", encoding="utf-8") as cf:
            for chunk_path in chunk_files:
                escaped = str(chunk_path.resolve()).replace("'", "'\\''")
                cf.write(f"file '{escaped}'\n")

        if verbose:
            print(f" Concatenating {len(chunk_files)} chunks → {output_path.name}")

        concat_cmd = [
            "ffmpeg",
            "-y",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            str(concat_list),
            "-c",
            "copy",
            str(output_path),
        ]
        concat_result = run_ffmpeg_with_progress(concat_cmd, duration, "Concatenating")
        if concat_result.returncode != 0:
            raise PreprocessError(
                "Chunk concatenation failed",
                filter_type="concat",
                command=" ".join(concat_cmd),
                stderr=concat_result.stderr,
            )
    finally:
        # Clean up chunk files and concat list, also on failure, so aborted
        # runs do not leave large intermediates behind.
        for chunk_path in chunk_files:
            chunk_path.unlink(missing_ok=True)
        concat_list.unlink(missing_ok=True)
        # Remove chunks directory if empty
        try:
            scratch_dir.rmdir()
        except OSError:
            pass
def _process_chunk_to_prores4444(
    input_path: Path,
    output_path: Path,
    filters: list[dict],
    start_time: float,
    chunk_duration: float,
    verbose: bool = False,
    take: Optional[float] = None,
    keep_audio: bool = True,
) -> None:
    """
    Process a video chunk with filters and encode to ProRes 4444 (MOV) with alpha.

    This is intended as an intermediate format for compositing:
    - true alpha channel (non-binary edges)
    - 4:4:4 chroma (better key edges than 4:2:0)
    - robust for concatenation and further filtering

    Args:
        input_path: Source video.
        output_path: Destination MOV file.
        filters: Filter config dictionaries. Entries whose "type" is
            "chroma_key", "mask", "color_grade" or "gnommokey" are turned
            into FFmpeg filters; other types are ignored here.
        start_time: Seek offset in seconds (applied only when > 0).
        chunk_duration: Chunk length in seconds; used when ``take`` is None.
        verbose: Print the filter chain and the FFmpeg command.
        take: Optional duration limit in seconds; overrides ``chunk_duration``.
        keep_audio: Encode audio as PCM when True, drop audio when False.

    Raises:
        PreprocessError: If FFmpeg fails, or if the written file cannot be
            read back by ffprobe.
    """
    builders = {
        "chroma_key": build_chroma_key_filter,
        "mask": build_mask_filter,
        "color_grade": build_color_grade_filter,
        "gnommokey": build_gnommokey_filter,
    }
    filter_parts: list[str] = []
    for filter_config in filters:
        builder = builders.get(filter_config.get("type"))
        if builder is not None:
            filter_parts.append(builder(filter_config))

    # Ensure we end in an alpha-capable pixel format.
    # 10-bit 4:4:4 + alpha is ideal for keyed edges.
    filter_parts.append("format=yuva444p10le")
    video_filter = ",".join(filter_parts)

    # Build FFmpeg command
    cmd: list[str] = ["ffmpeg", "-y"]

    # Seek to start time (before input for fast seeking)
    if start_time > 0:
        cmd.extend(["-ss", str(start_time)])
    cmd.extend(["-i", str(input_path)])

    # Limit duration. A non-positive value (e.g. the 0.0 that
    # get_video_duration returns when probing fails) must not be passed:
    # "-t 0" makes FFmpeg stop before writing any frame.
    actual_take = take if take is not None else chunk_duration
    if actual_take is not None and actual_take > 0:
        cmd.extend(["-t", str(actual_take)])

    # Video encode: ProRes 4444 with alpha
    cmd.extend(
        [
            "-vf",
            video_filter,
            "-c:v",
            "prores_ks",
            "-profile:v",
            "4",  # 4 = ProRes 4444
            "-pix_fmt",
            "yuva444p10le",  # must carry alpha
            "-vendor",
            "apl0",  # optional; helps some NLEs tag as Apple ProRes
        ]
    )

    # Audio handling (optional)
    if keep_audio:
        # PCM is the least surprising intermediate audio.
        cmd.extend(["-c:a", "pcm_s16le"])
    else:
        cmd.append("-an")
    cmd.append(str(output_path))

    if verbose:
        print(f" Filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    result = run_ffmpeg_with_progress(cmd, actual_take or chunk_duration, "Encoding")
    if result.returncode != 0:
        raise PreprocessError(
            "Chunk processing failed",
            filter_type="chunk",
            command=" ".join(cmd),
            stderr=result.stderr,
        )

    # Validate the output file is a readable MOV (moov atom present).
    # FFmpeg can return 0 but write a corrupt/incomplete file (e.g. moov atom
    # missing) when faststart rewrite fails or disk is under pressure.
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "csv=p=0",
            str(output_path),
        ],
        capture_output=True,
        text=True,
    )
    if probe.returncode != 0 or not probe.stdout.strip():
        raise PreprocessError(
            f"Chunk output file is unreadable or missing moov atom: {output_path.name}",
            filter_type="chunk",
            command=" ".join(cmd),
            stderr=probe.stderr,
        )
def _process_chunk_to_webm(
    input_path: Path,
    output_path: Path,
    filters: list[dict],
    start_time: float,
    chunk_duration: float,
    verbose: bool = False,
    take: Optional[float] = None,
) -> None:
    """
    Process a video chunk with filters and encode to VP9/WebM with alpha.

    VP9 with alpha uses ~10-20% of ProRes 4444 file size while maintaining
    good quality for compositing.

    Args:
        input_path: Source video.
        output_path: Destination WebM file.
        filters: Filter config dictionaries. Entries whose "type" is
            "chroma_key", "mask", "color_grade" or "gnommokey" are turned
            into FFmpeg filters; other types are ignored here.
        start_time: Seek offset in seconds (applied only when > 0).
        chunk_duration: Chunk length in seconds; used when ``take`` is None.
        verbose: Print the filter chain and the FFmpeg command.
        take: Optional duration limit in seconds; overrides ``chunk_duration``.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    builders = {
        "chroma_key": build_chroma_key_filter,
        "mask": build_mask_filter,
        "color_grade": build_color_grade_filter,
        "gnommokey": build_gnommokey_filter,
    }
    filter_parts: list[str] = []
    for filter_config in filters:
        builder = builders.get(filter_config.get("type"))
        if builder is not None:
            filter_parts.append(builder(filter_config))

    # Force output to yuva420p to preserve alpha channel through to encoder.
    # Appending before the join keeps the chain valid when no filter was
    # recognised (a leading comma would be an empty filter name).
    filter_parts.append("format=yuva420p")
    video_filter = ",".join(filter_parts)

    # Build FFmpeg command for VP9 with alpha
    cmd = ["ffmpeg", "-y"]

    # Seek to start time (before input for fast seeking)
    if start_time > 0:
        cmd.extend(["-ss", str(start_time)])
    cmd.extend(["-i", str(input_path)])

    # Limit duration. A non-positive value (e.g. the 0.0 that
    # get_video_duration returns when probing fails) must not be passed:
    # "-t 0" makes FFmpeg stop before writing any frame.
    actual_take = take if take is not None else chunk_duration
    if actual_take is not None and actual_take > 0:
        cmd.extend(["-t", str(actual_take)])

    cmd.extend(
        [
            "-vf",
            video_filter,
            "-c:v",
            "libvpx-vp9",
            "-pix_fmt",
            "yuva420p",  # VP9 with alpha
            "-auto-alt-ref",
            "0",  # Required for alpha channel in VP9
            "-crf",
            "25",  # Quality (lower = better, 15-35 typical)
            "-b:v",
            "0",  # Variable bitrate mode
            "-deadline",
            "good",  # Encoding speed (good balance)
            "-cpu-used",
            "2",  # Speed/quality tradeoff (0-5, lower = better)
            "-c:a",
            "libopus",  # Opus audio codec
            "-b:a",
            "128k",
            str(output_path),
        ]
    )

    if verbose:
        print(f" Filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    result = run_ffmpeg_with_progress(cmd, actual_take or chunk_duration, "Encoding")
    if result.returncode != 0:
        raise PreprocessError(
            "Chunk processing failed",
            filter_type="chunk",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def _concatenate_prores4444_chunks(
    chunk_files: list[Path],
    output_path: Path,
    verbose: bool = False,
    keep_audio: bool = False,
) -> None:
    """
    Concatenate ProRes 4444 (MOV) chunks into a single ProRes 4444 output.

    Uses FFmpeg concat demuxer, then re-encodes once to ensure alpha and
    stream consistency across chunks. The temporary concat list written
    next to the output is removed afterwards, whether or not FFmpeg
    succeeded.

    Args:
        chunk_files: Chunk paths in playback order.
        output_path: Destination MOV file; its parent directory is created
            if missing.
        verbose: Print the concat list path and the FFmpeg command.
        keep_audio: Encode audio as PCM when True, drop audio when False.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    concat_list = output_path.parent / "concat_list.txt"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    cmd: list[str] = [
        "ffmpeg",
        "-y",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        str(concat_list),
        # Encode to ProRes 4444 with alpha
        "-c:v",
        "prores_ks",
        "-profile:v",
        "4",  # ProRes 4444
        "-pix_fmt",
        "yuva444p10le",  # preserve alpha + best key edges
        "-vendor",
        "apl0",
        "-movflags",
        "+faststart",
    ]
    if keep_audio:
        # safest for intermediates; alternatively "-c:a copy" if identical across chunks
        cmd += ["-c:a", "pcm_s16le"]
    else:
        cmd += ["-an"]
    cmd.append(str(output_path))

    try:
        with open(concat_list, "w", encoding="utf-8") as f:
            for chunk in chunk_files:
                # Inside a single-quoted concat entry a literal quote must
                # be written as '\''.
                escaped = str(chunk.resolve()).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")

        if verbose:
            print(f" Concat list: {concat_list}")
            print(f" Command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True)
    finally:
        # Do not leave the temporary list beside the output (matches the
        # WebM concatenation helper, which also removes it).
        concat_list.unlink(missing_ok=True)

    if result.returncode != 0:
        raise PreprocessError(
            "Chunk concatenation failed",
            filter_type="concat",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def _concatenate_webm_chunks(
    chunk_files: list[Path],
    output_path: Path,
    verbose: bool = False,
) -> None:
    """
    Concatenate WebM chunks into a single output file.

    Uses FFmpeg's concat demuxer and re-encodes the video with libvpx-vp9
    in yuva420p. The temporary concat list written next to the output is
    always removed.

    NOTE(review): earlier comments described this as a lossless stream
    copy, but the command passes "-c:v libvpx-vp9", i.e. it re-encodes.
    Confirm whether "-c copy" was intended before relying on either.

    Args:
        chunk_files: Chunk paths in playback order.
        output_path: Destination WebM file; its parent directory is created
            if missing.
        verbose: Print the concat list path and the FFmpeg command.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    # Create concat file list
    concat_list = output_path.parent / "concat_list.txt"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg",
        "-y",
        "-f",
        "concat",
        "-safe",
        "0",
        "-i",
        str(concat_list),
        "-c:v",
        "libvpx-vp9",
        "-pix_fmt",
        "yuva420p",  # alpha-capable pixel format for the VP9 encode
        str(output_path),
    ]

    try:
        with open(concat_list, "w", encoding="utf-8") as f:
            for chunk in chunk_files:
                # FFmpeg concat format: file 'path'. Inside the quotes a
                # literal quote must be written as '\''.
                escaped = str(chunk.resolve()).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")

        if verbose:
            print(f" Concat list: {concat_list}")
            print(f" Command: {' '.join(cmd)}")

        result = subprocess.run(cmd, capture_output=True, text=True)
    finally:
        # Clean up concat list even when writing it or launching FFmpeg fails
        concat_list.unlink(missing_ok=True)

    if result.returncode != 0:
        raise PreprocessError(
            "Chunk concatenation failed",
            filter_type="concat",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def apply_chroma_key(
    input_path: Path,
    output_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    take: Optional[float] = None,
) -> None:
    """
    Apply chroma key (green/blue screen) filter using FFmpeg.

    Config options:
        color: [R, G, B] - Color to key out (default: [0, 255, 0] green)
        similarity: float - Color similarity threshold 0.0-1.0 (default: 0.4)
        blend: float - Edge blend/feathering 0.0-1.0 (default: 0.08)
        spill: float - Spill suppression 0.0-1.0 (default: 0.1)

    Args:
        input_path: Source video.
        output_path: Destination file.
        config: Raw chroma key config, parsed by parse_chroma_key_config.
        verbose: Print the filter chain and the FFmpeg command.
        take: Optional duration in seconds to limit processing (for quick iteration)

    Output is ProRes 4444 with alpha channel for lossless quality.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    # Parse config with defaults
    chroma_config = parse_chroma_key_config(config)

    # Convert RGB to hex format for FFmpeg. Components are rounded and
    # clamped to 0..255 first: the "x" format code rejects floats, and
    # larger values would produce more than two hex digits per channel.
    r, g, b = (max(0, min(255, int(round(c)))) for c in chroma_config.color)
    hex_color = f"0x{r:02x}{g:02x}{b:02x}"

    # Build FFmpeg chromakey filter
    # chromakey=color:similarity:blend
    filter_parts = [
        f"chromakey={hex_color}:{chroma_config.similarity:.3f}:{chroma_config.blend:.3f}"
    ]

    # Add despill to remove screen-colored spill on edges. Use the blue
    # variant only when blue is the dominant channel of the key color;
    # every other key color keeps the green despill.
    if chroma_config.spill > 0:
        despill_type = "blue" if b > max(r, g) else "green"
        filter_parts.append(
            f"despill=type={despill_type}:mix={chroma_config.spill:.3f}"
        )
    video_filter = ",".join(filter_parts)

    # Build FFmpeg command
    # ProRes 4444 profile for alpha channel support
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output
    ]

    # Add duration limit if specified (before input for efficiency)
    if take is not None:
        cmd.extend(["-t", str(take)])
    cmd.extend(
        [
            "-i",
            str(input_path),
            "-vf",
            video_filter,
            "-c:v",
            "prores_ks",
            "-profile:v",
            "4",  # ProRes 4444
            "-pix_fmt",
            "yuva444p10le",  # 10-bit with alpha
            "-c:a",
            "pcm_s16le",  # Lossless audio
            str(output_path),
        ]
    )

    if verbose:
        print(f" Filter: {video_filter}")
        if take is not None:
            print(f" Duration limit: {take}s")
        print(f" Command: {' '.join(cmd)}")

    # Get duration for progress bar
    duration = take if take is not None else get_video_duration(input_path)
    result = run_ffmpeg_with_progress(cmd, duration, "Chroma key")
    if result.returncode != 0:
        raise PreprocessError(
            "Chroma key filter failed",
            filter_type="chroma_key",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def apply_mask(
    input_path: Path,
    output_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    take: Optional[float] = None,
) -> None:
    """
    Apply a mask to make edges transparent using FFmpeg.

    Config options:
        left: float - Fraction of left side to make transparent (0.0-1.0)
        right: float - Fraction of right side to make transparent (0.0-1.0)
        top: float - Fraction of top to make transparent (0.0-1.0)
        bottom: float - Fraction of bottom to make transparent (0.0-1.0)

    Args:
        input_path: Source video.
        output_path: Destination file (ProRes 4444 with alpha, or a plain
            copy of the input when no side is masked).
        config: Raw mask config as described above.
        verbose: Print the mask values, filter and FFmpeg command.
        take: Optional duration in seconds to limit processing

    Uses geq filter to set alpha channel to 0 for masked regions.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    left = float(config.get("left", 0))
    right = float(config.get("right", 0))
    top = float(config.get("top", 0))
    bottom = float(config.get("bottom", 0))

    # Build alpha expression for geq filter
    # Alpha is kept in the center, 0 (transparent) at edges
    #   X < W*left       -> transparent
    #   X > W*(1-right)  -> transparent
    #   Y < H*top        -> transparent
    #   Y > H*(1-bottom) -> transparent
    conditions = []
    if left > 0:
        conditions.append(f"lt(X,W*{left})")
    if right > 0:
        conditions.append(f"gt(X,W*{1-right})")
    if top > 0:
        conditions.append(f"lt(Y,H*{top})")
    if bottom > 0:
        conditions.append(f"gt(Y,H*{1-bottom})")

    if not conditions:
        # No masking needed, just copy
        shutil.copy2(input_path, output_path)
        return

    # Combine conditions with OR: each term is 0 or 1, so the sum is
    # non-zero as soon as any condition is true.
    any_edge = "+".join(conditions)
    # geq alpha: if any condition matches, return 0, else return alpha(X,Y)
    alpha_expr = f"if({any_edge},0,alpha(X,Y))"

    # geq only evaluates the alpha expression for planes the stream actually
    # has, so force an alpha-capable format first; otherwise the mask would
    # silently do nothing on sources without an alpha channel. The 10-bit
    # format is preferred, with 8-bit as a fallback.
    video_filter = (
        "format=yuva444p10le|yuva444p,"
        f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'"
    )

    # Build FFmpeg command
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite output
    ]
    if take is not None:
        cmd.extend(["-t", str(take)])
    cmd.extend(
        [
            "-i",
            str(input_path),
            "-vf",
            video_filter,
            "-c:v",
            "prores_ks",
            "-profile:v",
            "4",  # ProRes 4444
            "-pix_fmt",
            "yuva444p10le",  # 10-bit with alpha
            "-c:a",
            "pcm_s16le",  # Lossless audio
            str(output_path),
        ]
    )

    if verbose:
        print(f" Mask: left={left}, right={right}, top={top}, bottom={bottom}")
        print(f" Filter: {video_filter}")
        print(f" Command: {' '.join(cmd)}")

    # Get duration for progress bar
    duration = take if take is not None else get_video_duration(input_path)
    result = run_ffmpeg_with_progress(cmd, duration, "Mask")
    if result.returncode != 0:
        raise PreprocessError(
            "Mask filter failed",
            filter_type="mask",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def apply_transcribe(
    input_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    force: bool = False,
) -> Path:
    """
    Transcribe a video's audio and write the result to a JSON sidecar file.

    The video itself is not modified. An existing transcript is reused
    unless ``force`` is True.

    Config options:
        model: str - Model name handed to the transcriber. Default: "base"
        output: str - Output filename, placed next to the input. Default:
            the input path with its suffix replaced by ".transcript.json"

    Returns:
        Path to the transcript JSON file.
    """
    from .transcriber import transcribe_video, save_transcript

    model = config.get("model", "base")
    custom_name = config.get("output")
    transcript_path = (
        input_path.parent / custom_name
        if custom_name
        else input_path.with_suffix(".transcript.json")
    )

    # Reuse a previous transcript unless regeneration was requested
    if not force and transcript_path.exists():
        print(f" Transcript exists, skipping: {transcript_path.name}")
        print(" (use --force to regenerate)")
        return transcript_path

    if verbose:
        print(f" Model: {model}")
        print(f" Output: {transcript_path}")

    # Transcribe, persist, report
    words = transcribe_video(input_path, model=model)
    save_transcript(words, transcript_path)
    print(f" Transcribed {len(words)} words -> {transcript_path.name}")
    return transcript_path
def apply_audio_normalize(
    input_path: Path,
    output_path: Path,
    config: dict[str, Any],
    verbose: bool = False,
    take: float = None,
    use_audio_channels: str = "both",
    skip_loudnorm: bool = False,
) -> None:
    """
    Clean up and level the audio track while copying the video stream.

    The audio filter chain is assembled in this fixed order:
        channel_map -> eq_bands -> highpass -> lowpass -> room_eq
        -> dereverb -> denoise -> gate -> compress -> normalize

    When no stage is enabled the input file is copied to the output as is.

    Config options (parsed by parse_audio_normalize_config):
        # Parametric EQ
        eq_bands: list - Bands with freq, gain, q and type
                         ("lowshelf", "highshelf", anything else = peak)
        # Room treatment
        highpass: float - High-pass frequency in Hz (0 = disabled)
        lowpass: float - Low-pass frequency in Hz (0 = disabled)
        room_eq: bool - Enable room resonance EQ cut
        room_eq_freq: float - Center frequency for room cut (default: 300)
        room_eq_gain: float - Gain in dB, negative = cut (default: -4)
        room_eq_width: float - Q/bandwidth (default: 1.5)
        # Gate
        gate: bool - Enable noise gate
        gate_threshold: float - Threshold in dB (default: -35)
        gate_range: float - Attenuation in dB when closed (default: -20)
        gate_attack: float - Attack time in ms (default: 10)
        gate_release: float - Release time in ms (default: 150)
        # Neural de-reverb
        dereverb_model: str - Path to model file (empty = disabled)
        dereverb_mix: float - Mix amount for the de-reverb (default: 0.8)
        # Noise reduction
        denoise: bool - Enable noise reduction (default: True)
        noise_floor: float - Noise floor in dB (default: -25)
        # Compression
        compress: bool - Enable compression (default: True)
        threshold: float - Compression threshold in dB (default: -20)
        ratio: float - Compression ratio (default: 4)
        attack: float - Attack time in ms (default: 5)
        release: float - Release time in ms (default: 50)
        makeup: float - Makeup gain in dB (default: 2)
        # Loudness normalization
        normalize: bool - Enable loudness normalization (default: True)
        target_lufs: float - Target loudness in LUFS (default: -16)
        target_lra: float - Target loudness range (default: 11)
        target_tp: float - Target true peak in dB (default: -1.5)

    Args:
        take: Optional duration limit in seconds.
        use_audio_channels: "left" or "right" duplicates that channel to
            both stereo outputs; any other value adds no channel mapping.
        skip_loudnorm: Leave out the loudnorm stage. Use this for segments
            that will be concatenated, then apply loudnorm once to the
            final output.

    Raises:
        PreprocessError: If FFmpeg exits with a non-zero status.
    """
    cfg = parse_audio_normalize_config(config)
    chain: list[str] = []

    # Stage 0: channel mapping (single channel duplicated to stereo)
    pan_by_channel = {
        "left": "pan=stereo|c0=c0|c1=c0",
        "right": "pan=stereo|c0=c1|c1=c1",
    }
    pan_filter = pan_by_channel.get(use_audio_channels)
    if pan_filter is not None:
        chain.append(pan_filter)

    # Stage 0.5: parametric EQ bands; shelves share one template, every
    # other band type is treated as a peak band
    for band in cfg.eq_bands:
        if band.type in ("lowshelf", "highshelf"):
            chain.append(
                f"{band.type}=f={band.freq:.1f}:g={band.gain:.1f}:t=q:w={band.q:.2f}"
            )
        else:
            chain.append(
                f"equalizer=f={band.freq:.1f}:width_type=q:width={band.q:.2f}:g={band.gain:.1f}"
            )

    # Stage 1: high-pass
    if cfg.highpass > 0:
        chain.append(f"highpass=f={cfg.highpass:.1f}")

    # Stage 2: low-pass
    if cfg.lowpass > 0:
        chain.append(f"lowpass=f={cfg.lowpass:.1f}")

    # Stage 3: room resonance cut
    if cfg.room_eq:
        room_options = [
            f"equalizer=f={cfg.room_eq_freq:.1f}",
            "width_type=q",
            f"width={cfg.room_eq_width:.2f}",
            f"g={cfg.room_eq_gain:.1f}",
        ]
        chain.append(":".join(room_options))

    # Stage 4: neural de-reverb, only when the model file is present
    if cfg.dereverb_model:
        model_path = Path(cfg.dereverb_model)
        if not model_path.exists():
            print(f" Warning: dereverb model not found: {model_path}")
        else:
            chain.append(f"arnndn=m={model_path}:mix={cfg.dereverb_mix:.2f}")

    # Stage 5: noise reduction
    if cfg.denoise:
        chain.append(f"afftdn=nf={cfg.noise_floor:.1f}")

    # Stage 6: noise gate
    if cfg.gate:
        gate_options = [
            f"agate=threshold={cfg.gate_threshold:.1f}dB",
            f"range={cfg.gate_range:.1f}dB",
            f"attack={cfg.gate_attack:.1f}",
            f"release={cfg.gate_release:.1f}",
        ]
        chain.append(":".join(gate_options))

    # Stage 7: compression
    if cfg.compress:
        compressor_options = [
            f"acompressor=threshold={cfg.threshold:.1f}dB",
            f"ratio={cfg.ratio:.1f}",
            f"attack={cfg.attack:.1f}",
            f"release={cfg.release:.1f}",
            f"makeup={cfg.makeup:.1f}dB",
        ]
        chain.append(":".join(compressor_options))

    # Stage 8: loudness normalization, unless deferred by the caller
    if cfg.normalize and not skip_loudnorm:
        loudnorm_options = [
            f"loudnorm=I={cfg.target_lufs:.1f}",
            f"LRA={cfg.target_lra:.1f}",
            f"TP={cfg.target_tp:.1f}",
        ]
        chain.append(":".join(loudnorm_options))

    if not chain:
        # Nothing to do: hand the file through untouched
        shutil.copy2(input_path, output_path)
        return

    audio_filter = ",".join(chain)

    # Video stream is copied; only the audio is re-rendered (as PCM)
    cmd = ["ffmpeg", "-y"]
    if take is not None:
        cmd += ["-t", str(take)]
    cmd += [
        "-i",
        str(input_path),
        "-c:v",
        "copy",
        "-af",
        audio_filter,
        "-c:a",
        "pcm_s16le",
        str(output_path),
    ]

    if verbose:
        print(f" Audio filter: {audio_filter}")
        print(f" Command: {' '.join(cmd)}")

    # Duration drives the progress bar
    duration = take if take is not None else get_video_duration(input_path)
    result = run_ffmpeg_with_progress(cmd, duration, "Audio normalize")
    if result.returncode != 0:
        raise PreprocessError(
            "Audio normalization failed",
            filter_type="audio_normalize",
            command=" ".join(cmd),
            stderr=result.stderr,
        )
def parse_audio_normalize_config(config: dict[str, Any]) -> AudioNormalizeConfig:
    """Convert a raw audio-normalize dictionary into an AudioNormalizeConfig.

    Every key is optional; absent keys take the default listed in the
    table below, and each value is coerced with the listed converter.
    """
    # Parametric EQ bands, each with its own per-key defaults
    eq_bands = [
        EQBand(
            freq=float(raw_band.get("freq", 1000)),
            gain=float(raw_band.get("gain", 0)),
            q=float(raw_band.get("q", 1.0)),
            type=str(raw_band.get("type", "peak")),
        )
        for raw_band in config.get("eq_bands", [])
    ]

    # (key, converter, default), in the order the fields are evaluated
    scalar_fields = (
        # Room treatment
        ("highpass", float, 0.0),
        ("lowpass", float, 0.0),
        ("room_eq", bool, False),
        ("room_eq_freq", float, 300.0),
        ("room_eq_gain", float, -4.0),
        ("room_eq_width", float, 1.5),
        # Gate
        ("gate", bool, False),
        ("gate_threshold", float, -35.0),
        ("gate_range", float, -20.0),
        ("gate_attack", float, 10.0),
        ("gate_release", float, 150.0),
        # Neural de-reverb
        ("dereverb_model", str, ""),
        ("dereverb_mix", float, 0.8),
        # Noise reduction
        ("denoise", bool, True),
        ("noise_floor", float, -25.0),
        # Compression
        ("compress", bool, True),
        ("threshold", float, -20.0),
        ("ratio", float, 4.0),
        ("attack", float, 5.0),
        ("release", float, 50.0),
        ("makeup", float, 2.0),
        # Loudness normalization
        ("normalize", bool, True),
        ("target_lufs", float, -16.0),
        ("target_lra", float, 11.0),
        ("target_tp", float, -1.5),
    )
    settings = {
        key: convert(config.get(key, default))
        for key, convert, default in scalar_fields
    }
    return AudioNormalizeConfig(eq_bands=eq_bands, **settings)
def parse_chroma_key_config(config: dict[str, Any]) -> ChromaKeyConfig:
    """Convert a raw chroma key dictionary into a ChromaKeyConfig.

    Defaults are tuned for aggressive green screen removal:
    - color [0, 255, 0]: used whenever the value is not a 3-element list
    - similarity 0.4: Captures wide range of green shades (lighting variations)
    - blend 0.08: Tight edges with minimal feathering
    - spill 0.1: Light despill to remove green reflections on subject
    - edge_erode 0: No alpha erosion (set 1-3 to remove green fringe)
    - protect_color: Optional RGB color to protect from keying (e.g., yellow
      jumpsuit). Falsy values pass through unchanged; any other value that
      is not a 3-element list becomes None.
    - protect_tolerance: How much variation from protect_color to allow
      (0-1, default 0.15)
    """

    def is_triplet(value) -> bool:
        """Return True for a list holding exactly three items."""
        return isinstance(value, list) and len(value) == 3

    raw_color = config.get("color", [0, 255, 0])
    key_color = tuple(raw_color) if is_triplet(raw_color) else (0, 255, 0)

    protect_color = config.get("protect_color")
    if protect_color:
        protect_color = tuple(protect_color) if is_triplet(protect_color) else None

    return ChromaKeyConfig(
        color=key_color,
        similarity=float(config.get("similarity", 0.4)),
        blend=float(config.get("blend", 0.08)),
        spill=float(config.get("spill", 0.1)),
        edge_erode=int(config.get("edge_erode", 0)),
        protect_color=protect_color,
        protect_tolerance=float(config.get("protect_tolerance", 0.15)),
    )
def get_preprocessed_path(videos_dir: Path, video_source: VideoSource) -> Path:
    """
    Resolve the file that represents this source after preprocessing.

    The source's ``output_file`` is used when it is set (truthy); otherwise
    the original ``source_file`` is used. Either name is resolved relative
    to ``videos_dir``.
    """
    chosen_name = video_source.output_file or video_source.source_file
    return videos_dir / chosen_name
def needs_preprocessing(videos_dir: Path, video_source: VideoSource) -> bool:
    """Tell whether the source still has to go through preprocessing.

    Returns False when the source has no filters, or when its output file
    (or the same path with a ".mov" suffix) already exists in
    ``videos_dir``. Returns True otherwise, including when filters are set
    but no output file is configured.
    """
    if not video_source.filter:
        return False
    if not video_source.output_file:
        return True

    target = videos_dir / video_source.output_file
    # The result may have been written with a ".mov" suffix instead of the
    # configured one, so that variant counts as "already done" too.
    already_done = target.exists() or target.with_suffix(".mov").exists()
    return not already_done
def _stitch_concat_entry(segment_path: Path) -> str:
    """Return one ``file '...'`` line for an ffmpeg concat list."""
    # Inside single quotes ffmpeg takes every character literally, so an
    # apostrophe in the path must close the quote, be backslash-escaped,
    # and reopen the quote.
    escaped = str(segment_path.resolve()).replace("'", "'\\''")
    return f"file '{escaped}'\n"


def _stitch_trim_command(
    source_path: Path,
    trimmed_path: Path,
    skip: float,
    take: Optional[float],
    has_alpha: bool,
) -> list[str]:
    """Build the ffmpeg command that trims one segment and re-encodes it at 30 fps."""
    cmd = ["ffmpeg", "-y"]
    if skip > 0:
        cmd.extend(["-ss", str(skip)])
    cmd.extend(["-i", str(source_path)])
    if take is not None:
        cmd.extend(["-t", str(take)])
    if has_alpha:
        # Preserve alpha with ProRes 4444
        cmd.extend(
            [
                "-vf",
                "fps=30,format=yuva444p10le",
                "-c:v",
                "prores_ks",
                "-profile:v",
                "4",
                "-pix_fmt",
                "yuva444p10le",
                "-c:a",
                "pcm_s16le",
                "-avoid_negative_ts",
                "make_zero",
            ]
        )
    else:
        # No alpha - use fast h264 encoding
        cmd.extend(
            [
                "-vf",
                "fps=30",
                "-c:v",
                "libx264",
                "-preset",
                "fast",
                "-crf",
                "18",
                "-c:a",
                "aac",
                "-b:a",
                "192k",
                "-avoid_negative_ts",
                "make_zero",
                "-movflags",
                "+faststart",
            ]
        )
    cmd.append(str(trimmed_path))
    return cmd


def _stitch_loudnorm_command(
    input_path: Path, normalized_path: Path, loudnorm_config: Optional[dict]
) -> list[str]:
    """Build the ffmpeg command that loudness-normalizes the stitched file."""
    # Build loudnorm filter string from project config (or fall back to defaults)
    cfg = loudnorm_config or {}
    lufs = float(cfg.get("target_lufs", -14))
    lra = float(cfg.get("target_lra", 11))
    tp = float(cfg.get("target_tp", -1.5))
    loudnorm_filter = f"loudnorm=I={lufs:.1f}:LRA={lra:.1f}:TP={tp:.1f}"
    return [
        "ffmpeg",
        "-y",
        "-i",
        str(input_path),
        "-c:v",
        "copy",
        "-af",
        loudnorm_filter,
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-movflags",
        "+faststart",
        str(normalized_path),
    ]


def stitch_narration_segments(
    videos_dir: Path,
    segment_ids: list[str],
    videos: dict[str, VideoSource],
    output_path: Path,
    verbose: bool = False,
    default_end_trim: float = 0.0,
    loudnorm_config: Optional[dict] = None,
) -> Path:
    """
    Stitch multiple narration video segments into a single file.

    Each segment's skip and take values are applied to trim dead video at the
    start/end of each recording. The segments are concatenated in the order
    specified by segment_ids. Every segment is re-encoded to 30 fps first so
    the stream-copy concat sees uniform frame rates and timestamps.

    Args:
        videos_dir: Directory containing video files
        segment_ids: Ordered list of video IDs from videos.json
        videos: Dict of video ID -> VideoSource from videos.json
        output_path: Path for the concatenated output file
        verbose: Enable verbose output
        default_end_trim: Seconds to trim from the end when no explicit end/take is set
        loudnorm_config: Optional dict with ``target_lufs``, ``target_lra`` and
            ``target_tp`` (defaults -14, 11 and -1.5). Only used when at least
            one segment has ``defer_loudnorm`` set.

    Returns:
        Path to the stitched video file. With a single segment nothing is
        encoded and that segment's preprocessed path is returned instead.

    Raises:
        PreprocessError: If segment_ids is empty, an ID is not in ``videos``,
            a segment file is missing, or any ffmpeg step fails.
    """
    if not segment_ids:
        raise PreprocessError(
            "No narration segments to stitch",
            filter_type="concat",
        )

    # Validate IDs before the single-segment shortcut so an unknown ID raises
    # PreprocessError on every path instead of a bare KeyError.
    for video_id in segment_ids:
        if video_id not in videos:
            raise PreprocessError(
                f"Narration segment '{video_id}' not found in videos.json",
                filter_type=None,
            )

    if len(segment_ids) == 1:
        # Single segment - just return its processed path
        return get_preprocessed_path(videos_dir, videos[segment_ids[0]])

    print(f" Concatenating {len(segment_ids)} narration segments...")

    # Create temp directory for trimmed segments
    temp_dir = output_path.parent / "concat_temp"
    temp_dir.mkdir(parents=True, exist_ok=True)
    concat_list = temp_dir / "concat_list.txt"
    trimmed_segments: list[Path] = []

    try:
        for i, video_id in enumerate(segment_ids):
            video_source = videos[video_id]
            source_path = get_preprocessed_path(videos_dir, video_source)
            if not source_path.exists():
                raise PreprocessError(
                    f"Narration segment not found: {source_path}",
                    filter_type=None,
                )

            full_duration = get_video_duration(source_path)
            skip = video_source.skip or 0.0
            take = video_source.take

            # Apply default end trim if no explicit take/end was set. Skip it
            # when the probe reported no duration (0.0), otherwise the
            # segment would be cut down to zero length.
            if take is None and default_end_trim > 0 and full_duration > 0:
                take = max(0.0, full_duration - skip - default_end_trim)

            # Calculate effective duration
            if take is not None:
                effective_duration = min(take, full_duration - skip)
            else:
                effective_duration = full_duration - skip

            if verbose:
                # Compare against None so a legitimate take of 0.0 is not
                # reported as "all".
                take_label = "all" if take is None else take
                print(f" Segment {i+1}: {video_id}")
                print(f" Source: {source_path.name}")
                print(
                    f" Skip: {skip}s, Take: {take_label}s, Duration: {effective_duration:.1f}s"
                )

            # Always re-encode every segment to normalize fps and timestamps.
            # Mixing un-normalized source files (e.g. 60fps camera) with
            # trimmed-and-re-encoded 30fps segments causes cumulative A/V drift
            # in the final concat.
            trimmed_path = temp_dir / f"segment_{i:03d}.mov"
            # Register before encoding so a partial file from a failed run is
            # still removed by the cleanup below.
            trimmed_segments.append(trimmed_path)

            # NOTE(review): the codec is chosen per segment (ProRes 4444 with
            # alpha, H.264 without). The stream-copy concat below presumably
            # needs all segments to share one codec, so mixed alpha/non-alpha
            # inputs may not stitch correctly - confirm.
            has_alpha = _video_has_alpha(source_path)
            cmd = _stitch_trim_command(source_path, trimmed_path, skip, take, has_alpha)
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                raise PreprocessError(
                    f"Failed to trim segment {video_id}",
                    filter_type="concat",
                    command=" ".join(cmd),
                    stderr=result.stderr,
                )

        # Build concat file list
        with open(concat_list, "w", encoding="utf-8") as f:
            f.writelines(_stitch_concat_entry(segment) for segment in trimmed_segments)

        # Concatenate all segments
        print(f" Stitching {len(trimmed_segments)} segments -> {output_path.name}")
        cmd = [
            "ffmpeg",
            "-y",
            "-f",
            "concat",
            "-safe",
            "0",
            "-i",
            str(concat_list),
            "-c:v",
            "copy",
            "-c:a",
            "copy",
            "-movflags",
            "+faststart",
            str(output_path),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise PreprocessError(
                "Segment concatenation failed",
                filter_type="concat",
                command=" ".join(cmd),
                stderr=result.stderr,
            )

        # Apply loudnorm if any segment had defer_loudnorm=True
        needs_loudnorm = any(videos[seg_id].defer_loudnorm for seg_id in segment_ids)
        if needs_loudnorm:
            print(" Applying loudness normalization to stitched output...")
            normalized_path = (
                output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}"
            )
            loudnorm_cmd = _stitch_loudnorm_command(
                output_path, normalized_path, loudnorm_config
            )
            result = subprocess.run(loudnorm_cmd, capture_output=True, text=True)
            if result.returncode != 0:
                # Do not leave a half-written normalized file behind.
                normalized_path.unlink(missing_ok=True)
                raise PreprocessError(
                    "Loudness normalization failed",
                    filter_type="loudnorm",
                    command=" ".join(loudnorm_cmd),
                    stderr=result.stderr,
                )
            # Replace original with normalized version in one step
            normalized_path.replace(output_path)
            print(" Loudness normalization complete.")
    finally:
        # Clean up temp files whether or not stitching succeeded
        for leftover in (*trimmed_segments, concat_list):
            leftover.unlink(missing_ok=True)
        try:
            temp_dir.rmdir()
        except OSError:
            # Directory still holds files this call did not create; leave it.
            pass

    total_duration = get_video_duration(output_path)
    print(f" Stitched duration: {format_time(total_duration)}")
    return output_path