diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py index 442abca..66cb5bc 100644 --- a/gnommo/preprocessor.py +++ b/gnommo/preprocessor.py @@ -19,7 +19,7 @@ from .models import ( from typing import Union, Optional # Number of parallel workers for chunk processing -DEFAULT_CHUNK_WORKERS = 4 +DEFAULT_CHUNK_WORKERS = 1 # Chunk duration in seconds for parallel filter processing (avoids huge intermediate files) CHUNK_DURATION = 60 @@ -770,16 +770,17 @@ def apply_combined_video_filters( # Build FFmpeg command cmd = ["ffmpeg", "-y"] + # Global options before -i (after -i they become output options and don't limit filter threads) + cmd.extend(["-threads", "1", "-filter_threads", "1"]) if take is not None: cmd.extend(["-t", str(take)]) cmd.extend( [ + "-probesize", "50000000", "-analyzeduration", "50000000", "-i", str(input_path), - "-filter_threads", - "1", "-vf", video_filter, "-c:v", @@ -887,7 +888,9 @@ def build_mask_filter(config: dict) -> str: alpha_expr = "+".join(conditions) alpha_expr = f"if({alpha_expr},0,alpha(X,Y))" - return f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'" + # Use r/g/b passthrough so this works in rgba space (as output by gnommokey/color_grade) + # without triggering an rgba→yuv conversion that would spawn 11 more swscaler threads. + return f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'" def build_color_grade_filter(config: dict) -> str: @@ -1140,8 +1143,9 @@ def build_gnommokey_filter(config: dict) -> str: parts.append(f"alphaextract,avgblur=sizeX={radius}:sizeY={radius}[blur]") # This gets complex - for now, skip alpha blur and just use erosion - # Ensure output is in a good format - parts.append("format=yuva444p10le") + # Stay in rgba so downstream filters (color_grade, mask) don't trigger + # a redundant yuva444p10le→rgba round-trip and its 11-thread swscaler call. + # The caller (_process_chunk_to_prores4444) appends format=yuva444p10le at the end. 
 return ",".join(parts) @@ -1353,10 +1357,20 @@ def _process_chunk_to_prores4444( # Build FFmpeg command cmd: list[str] = ["ffmpeg", "-y"] + # Global thread limits MUST be before the first -i. + # After -i they become output-stream options and FFmpeg ignores them for the + # filter graph — each geq stage then spawns one thread per CPU core (11 on M-series), + # causing the N-way RGBA frame buffer explosion that OOM-kills the process. + cmd.extend(["-threads", "1", "-filter_threads", "1"]) + # Seek to start time (before input for fast seeking) if start_time > 0: cmd.extend(["-ss", str(start_time)]) + # Bound input probing: -probesize is in bytes (50 MB), -analyzeduration in microseconds (50 s). + # Without this, FFmpeg scans the entire source file when moov is at the end (common for camera + # recordings), which reads gigabytes of data and triggers OOM when multiple chunk workers run in parallel. + cmd.extend(["-probesize", "50000000", "-analyzeduration", "50000000"]) cmd.extend(["-i", str(input_path)]) # Limit duration @@ -1364,13 +1378,8 @@ def _process_chunk_to_prores4444( if actual_take is not None: cmd.extend(["-t", str(actual_take)]) - # Video encode: ProRes 4444 with alpha - # -filter_threads 1: geq is serial anyway; limiting threads eliminates the N-way - # RGBA frame buffer explosion that causes OOM when chunk workers run in parallel. 
cmd.extend( [ - "-filter_threads", - "1", "-vf", video_filter, "-c:v", @@ -1682,10 +1691,9 @@ def apply_chroma_key( # Build FFmpeg command # ProRes 4444 profile for alpha channel support - cmd = [ - "ffmpeg", - "-y", # Overwrite output - ] + cmd = ["ffmpeg", "-y"] + # Global options before -i + cmd.extend(["-threads", "1", "-filter_threads", "1"]) # Add duration limit if specified (before input for efficiency) if take is not None: @@ -1695,8 +1703,6 @@ def apply_chroma_key( [ "-i", str(input_path), - "-filter_threads", - "1", "-vf", video_filter, "-c:v", @@ -1786,14 +1792,13 @@ def apply_mask( # Using: if(condition, 0, alpha(X,Y)) alpha_expr = f"if({alpha_expr},0,alpha(X,Y))" - # Build the geq filter - preserve luma, chroma, modify alpha - video_filter = f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'" + # Build the geq filter - preserve RGB channels, modify alpha + video_filter = f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'" # Build FFmpeg command - cmd = [ - "ffmpeg", - "-y", # Overwrite output - ] + cmd = ["ffmpeg", "-y"] + # Global options before -i + cmd.extend(["-threads", "1", "-filter_threads", "1"]) if take is not None: cmd.extend(["-t", str(take)]) @@ -1802,8 +1807,6 @@ def apply_mask( [ "-i", str(input_path), - "-filter_threads", - "1", "-vf", video_filter, "-c:v", diff --git a/gnommo/renderer.py b/gnommo/renderer.py index 3f7e76a..a2a864a 100644 --- a/gnommo/renderer.py +++ b/gnommo/renderer.py @@ -303,6 +303,11 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: """Build the complete FFmpeg command as a list of arguments.""" cmd = ["ffmpeg", "-y"] # -y to overwrite output + # Global thread limits before any -i. Without this, each format=rgba conversion + # in the filter graph (one per video layer) spawns one swscaler thread per CPU core, + # causing OOM on Apple Silicon where av_cpu_count() returns 10-11. 
+ cmd.extend(["-threads", "1", "-filter_threads", "1"]) + # Resolve paths to absolute project_path = plan.project_path.resolve() output_path = output_path.resolve()