Performance fixes: limit FFmpeg threads, keep filters in RGBA, reduce chunk workers

2026-05-12 19:49:15 +02:00
parent ff47ffea8f
commit 41d96501b6
2 changed files with 33 additions and 25 deletions
@@ -19,7 +19,7 @@ from .models import (
 from typing import Union, Optional

 # Number of parallel workers for chunk processing
-DEFAULT_CHUNK_WORKERS = 4
+DEFAULT_CHUNK_WORKERS = 1

 # Chunk duration in seconds for parallel filter processing (avoids huge intermediate files)
 CHUNK_DURATION = 60
@@ -770,16 +770,17 @@ def apply_combined_video_filters(

    # Build FFmpeg command
    cmd = ["ffmpeg", "-y"]
+    # Global options before -i (after -i they become output options and don't limit filter threads)
+    cmd.extend(["-threads", "1", "-filter_threads", "1"])

    if take is not None:
        cmd.extend(["-t", str(take)])

    cmd.extend(
        [
+            "-probesize", "50000000", "-analyzeduration", "50000000",
            "-i",
            str(input_path),
-            "-filter_threads",
-            "1",
            "-vf",
            video_filter,
            "-c:v",
@@ -887,7 +888,9 @@ def build_mask_filter(config: dict) -> str:
    alpha_expr = "+".join(conditions)
    alpha_expr = f"if({alpha_expr},0,alpha(X,Y))"

-    return f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'"
+    # Use r/g/b passthrough so this works in rgba space (as output by gnommokey/color_grade)
+    # without triggering an rgba→yuv conversion and its extra swscaler threads (one per CPU core).
+    return f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'"


 def build_color_grade_filter(config: dict) -> str:
@@ -1140,8 +1143,9 @@ def build_gnommokey_filter(config: dict) -> str:
            parts.append(f"alphaextract,avgblur=sizeX={radius}:sizeY={radius}[blur]")
            # This gets complex - for now, skip alpha blur and just use erosion

-    # Ensure output is in a good format
-    parts.append("format=yuva444p10le")
+    # Stay in rgba so downstream filters (color_grade, mask) don't trigger
+    # a redundant yuva444p10le→rgba round-trip and its 11-thread swscaler call.
+    # The caller (_process_chunk_to_prores4444) appends format=yuva444p10le at the end.

    return ",".join(parts)

@@ -1353,10 +1357,20 @@ def _process_chunk_to_prores4444(
    # Build FFmpeg command
    cmd: list[str] = ["ffmpeg", "-y"]

+    # Global thread limits MUST be before the first -i.
+    # After -i they become output-stream options and FFmpeg ignores them for the
+    # filter graph — each geq stage then spawns one thread per CPU core (11 on M-series),
+    # causing the N-way RGBA frame buffer explosion that OOM-kills the process.
+    cmd.extend(["-threads", "1", "-filter_threads", "1"])
+
    # Seek to start time (before input for fast seeking)
    if start_time > 0:
        cmd.extend(["-ss", str(start_time)])

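+    # Set the initial probe budget to 50 MB (-probesize, in bytes) and 50 s
+    # (-analyzeduration, in microseconds). Intended to stop FFmpeg from scanning the entire
+    # source file when moov is at the end (common for camera recordings), which reads
+    # gigabytes of data and triggers OOM when multiple chunk workers run in parallel.
+    # NOTE(review): FFmpeg's documented defaults are 5 MB / 5 s, so these values raise the
+    # probe budget rather than lower it — confirm this is the intended direction.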
+    cmd.extend(["-probesize", "50000000", "-analyzeduration", "50000000"])
    cmd.extend(["-i", str(input_path)])

    # Limit duration
@@ -1364,13 +1378,8 @@ def _process_chunk_to_prores4444(
    if actual_take is not None:
        cmd.extend(["-t", str(actual_take)])

-    # Video encode: ProRes 4444 with alpha
-    # -filter_threads 1: geq is serial anyway; limiting threads eliminates the N-way
-    # RGBA frame buffer explosion that causes OOM when chunk workers run in parallel.
    cmd.extend(
        [
-            "-filter_threads",
-            "1",
            "-vf",
            video_filter,
            "-c:v",
@@ -1682,10 +1691,9 @@ def apply_chroma_key(

    # Build FFmpeg command
    # ProRes 4444 profile for alpha channel support
-    cmd = [
-        "ffmpeg",
-        "-y",  # Overwrite output
-    ]
+    cmd = ["ffmpeg", "-y"]
+    # Global options before -i
+    cmd.extend(["-threads", "1", "-filter_threads", "1"])

    # Add duration limit if specified (before input for efficiency)
    if take is not None:
@@ -1695,8 +1703,6 @@ def apply_chroma_key(
        [
            "-i",
            str(input_path),
-            "-filter_threads",
-            "1",
            "-vf",
            video_filter,
            "-c:v",
@@ -1786,14 +1792,13 @@ def apply_mask(
    # Using: if(condition, 0, alpha(X,Y))
    alpha_expr = f"if({alpha_expr},0,alpha(X,Y))"

-    # Build the geq filter - preserve luma, chroma, modify alpha
-    video_filter = f"geq=lum='lum(X,Y)':cb='cb(X,Y)':cr='cr(X,Y)':a='{alpha_expr}'"
+    # Build the geq filter - preserve RGB channels, modify alpha
+    video_filter = f"geq=r='r(X,Y)':g='g(X,Y)':b='b(X,Y)':a='{alpha_expr}'"

    # Build FFmpeg command
-    cmd = [
-        "ffmpeg",
-        "-y",  # Overwrite output
-    ]
+    cmd = ["ffmpeg", "-y"]
+    # Global options before -i
+    cmd.extend(["-threads", "1", "-filter_threads", "1"])

    if take is not None:
        cmd.extend(["-t", str(take)])
@@ -1802,8 +1807,6 @@ def apply_mask(
        [
            "-i",
            str(input_path),
-            "-filter_threads",
-            "1",
            "-vf",
            video_filter,
            "-c:v",
@@ -303,6 +303,11 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
    """Build the complete FFmpeg command as a list of arguments."""
    cmd = ["ffmpeg", "-y"]  # -y to overwrite output

+    # Global thread limits before any -i. Without this, each format=rgba conversion
+    # in the filter graph (one per video layer) spawns one swscaler thread per CPU core,
+    # causing OOM on Apple Silicon where av_cpu_count() returns 10-11.
+    cmd.extend(["-threads", "1", "-filter_threads", "1"])
+
    # Resolve paths to absolute
    project_path = plan.project_path.resolve()
    output_path = output_path.resolve()