Fixing loudness issue

This commit is contained in:
2026-05-12 00:52:14 +02:00
parent feb4df0506
commit 994a2e0bb6
7 changed files with 191 additions and 76 deletions
+99 -18
View File
@@ -372,10 +372,11 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int:
shared_assets_dir = _find_shared_assets(project_path) shared_assets_dir = _find_shared_assets(project_path)
if shared_assets_dir: if shared_assets_dir:
_import_shared_assets(shared_assets_dir, verbose) _import_shared_assets(shared_assets_dir, verbose)
_import_shared_audio(shared_assets_dir, project_path, config, verbose)
_sync_shared_videos_to_local(project_path, config, shared_assets_dir, verbose) _sync_shared_videos_to_local(project_path, config, shared_assets_dir, verbose)
# Probe and cache audio file durations into audio.json # Probe and cache audio file durations into audio.json
_probe_audio_durations(project_path, config, force, verbose) _probe_audio_durations(project_path, config, force, verbose, shared_assets_dir)
# Probe and cache video metadata (duration, has_audio) into videos.json # Probe and cache video metadata (duration, has_audio) into videos.json
_probe_video_metadata(project_path, config, shared_assets_dir, force, verbose) _probe_video_metadata(project_path, config, shared_assets_dir, force, verbose)
@@ -384,8 +385,71 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int:
return 0 return 0
def _import_shared_audio(
    shared_assets_dir: Path,
    project_path: Path,
    config,
    verbose: bool,
) -> None:
    """Register audio files from shared_assets/media/audio in the project's audio.json.

    Every regular, non-hidden file directly inside
    ``<shared_assets_dir>/media/audio`` whose extension is a known audio type
    gets an entry keyed by its filename stem, with ``"is_shared": True`` and a
    default ``"volume"`` of 1.0. Ids already present in audio.json are left
    untouched. The file is only rewritten when at least one entry was added.

    Args:
        shared_assets_dir: Root of the shared assets tree.
        project_path: Project root; the audio.json location is joined onto it.
        config: Project config or None. When it has a truthy ``audio_path``
            that value is used as the audio.json location, otherwise
            ``media/audio/audio.json`` is used.
        verbose: Print per-file progress and "nothing to do" notes.
    """
    audio_extensions = {".mp3", ".wav", ".aac", ".m4a", ".ogg", ".flac"}

    shared_audio_dir = shared_assets_dir / "media" / "audio"
    # is_dir() rather than exists(): a stray file at this path would otherwise
    # make iterdir() raise NotADirectoryError.
    if not shared_audio_dir.is_dir():
        if verbose:
            print(f" No shared audio dir found at {shared_audio_dir}")
        return

    # Sorted so that the order of new entries in audio.json is deterministic.
    audio_files = sorted(
        candidate
        for candidate in shared_audio_dir.iterdir()
        if candidate.suffix.lower() in audio_extensions
        and not candidate.name.startswith(".")
        and candidate.is_file()
    )
    if not audio_files:
        if verbose:
            print(f" No audio files found in {shared_audio_dir}")
        return

    # Resolve project audio.json path
    if config and config.audio_path:
        audio_json_path = project_path / config.audio_path
    else:
        audio_json_path = project_path / "media" / "audio" / "audio.json"
    audio_json_path.parent.mkdir(parents=True, exist_ok=True)

    existing: dict = _read_json(audio_json_path) if audio_json_path.exists() else {}

    added = 0
    for audio_file in audio_files:
        audio_id = audio_file.stem
        if audio_id in existing:
            # Never overwrite an entry that is already there.
            if verbose:
                print(f" Skipping {audio_id} (already in audio.json)")
            continue
        existing[audio_id] = {
            "file": audio_file.name,
            "is_shared": True,
            "volume": 1.0,
        }
        added += 1
        if verbose:
            print(f" Added shared audio: {audio_id}")

    if added == 0:
        if verbose:
            print(" No new shared audio files to add")
        return

    with open(audio_json_path, "w", encoding="utf-8") as fh:
        json.dump(existing, fh, indent=2)

    # config.audio_path may be absolute or point outside the project, in which
    # case relative_to() raises ValueError. The file is already written at this
    # point, so fall back to the full path instead of failing the import.
    try:
        shown_path = audio_json_path.relative_to(project_path)
    except ValueError:
        shown_path = audio_json_path
    print(f" Updated {shown_path} (+{added} shared audio files)")
def _probe_audio_durations( def _probe_audio_durations(
project_path: Path, config, force: bool, verbose: bool project_path: Path, config, force: bool, verbose: bool,
shared_assets_dir: Optional[Path] = None,
) -> None: ) -> None:
"""Probe and cache audio file durations into audio.json. """Probe and cache audio file durations into audio.json.
@@ -413,6 +477,9 @@ def _probe_audio_durations(
if verbose: if verbose:
print(f" Audio '{audio_id}': cached ({audio_data['duration']:.1f}s)") print(f" Audio '{audio_id}': cached ({audio_data['duration']:.1f}s)")
continue continue
if audio_data.get("is_shared") and shared_assets_dir:
audio_path = shared_assets_dir / "media" / "audio" / audio_data["file"]
else:
audio_path = audio_dir / audio_data["file"] audio_path = audio_dir / audio_data["file"]
if not audio_path.exists(): if not audio_path.exists():
if verbose: if verbose:
@@ -1060,8 +1127,16 @@ _TASKS_VIDEO_PREFIXES = {
"video:": 6, "video:": 6,
"vft:": 4, "vft:": 4,
"vfb:": 4, "vfb:": 4,
"vf2t:": 5,
"vf2b:": 5,
"vst:": 4, "vst:": 4,
"vsb:": 4, "vsb:": 4,
"vftp:": 5,
"vfbp:": 5,
"vf2tp:": 6,
"vf2bp:": 6,
"vstp:": 5,
"vsbp:": 5,
"narration:": 10, "narration:": 10,
} }
@@ -1993,6 +2068,14 @@ def cmd_stitch(
print(f"\n Combined narration exists: {stitch_output.name}") print(f"\n Combined narration exists: {stitch_output.name}")
print(" (use --force to regenerate)") print(" (use --force to regenerate)")
else: else:
# Extract loudnorm config from talkinghead filter so stitch uses
# per-project settings instead of hardcoded defaults.
_loudnorm_cfg = None
if config and config.default_filters:
for _f in (config.default_filters.get("talkinghead") or []):
if isinstance(_f, dict) and _f.get("type") == "audio_normalize":
_loudnorm_cfg = _f
break
stitch_narration_segments( stitch_narration_segments(
narration_dir, narration_dir,
segment_ids, segment_ids,
@@ -2000,6 +2083,7 @@ def cmd_stitch(
stitch_output, stitch_output,
verbose=verbose, verbose=verbose,
default_end_trim=config.default_end_trim if config else 0.0, default_end_trim=config.default_end_trim if config else 0.0,
loudnorm_config=_loudnorm_cfg,
) )
# Run import videos again, because at this point narration_combined might have been created. # Run import videos again, because at this point narration_combined might have been created.
_import_videos(videos_dir, config, verbose) _import_videos(videos_dir, config, verbose)
@@ -2127,14 +2211,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None:
marker_id.startswith(p) marker_id.startswith(p)
for p in ( for p in (
"video:", "video:",
"vft:", "vft:", "vfb:", "vf2t:", "vf2b:",
"vfb:", "vst:", "vsb:",
"vst:", "vftp:", "vfbp:", "vf2tp:", "vf2bp:",
"vsb:", "vstp:", "vsbp:",
"vft:",
"vfbp:",
"vstp:",
"vsbp:",
) )
): ):
aligned_count += 1 aligned_count += 1
@@ -2142,14 +2222,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None:
len(p) len(p)
for p in ( for p in (
"video:", "video:",
"vft:", "vft:", "vfb:", "vf2t:", "vf2b:",
"vfb:", "vst:", "vsb:",
"vst:", "vftp:", "vfbp:", "vf2tp:", "vf2bp:",
"vsb:", "vstp:", "vsbp:",
"vft:",
"vfbp:",
"vstp:",
"vsbp:",
) )
if marker_id.startswith(p) if marker_id.startswith(p)
) )
@@ -3066,6 +3142,11 @@ _RSYNC_EXCLUDES = [
"media/narration/processed/", "media/narration/processed/",
"media/narration/processed/**", "media/narration/processed/**",
"media/videos/narration_combined.mov", "media/videos/narration_combined.mov",
# Low-res preview files (generated locally, not synced)
"media/narration/low/",
"media/narration/low/**",
"media/videos/low/",
"media/videos/low/**",
# Chunk scratch directories # Chunk scratch directories
"**/chunks/", "**/chunks/",
"**/chunks/**", "**/chunks/**",
+2 -1
View File
@@ -337,7 +337,7 @@ class SlideEvent:
class AudioDefinition: class AudioDefinition:
"""Definition of an audio clip from audio.json.""" """Definition of an audio clip from audio.json."""
file: str # Audio filename (relative to audio.json location) file: str # Audio filename (relative to audio.json location, or to shared_assets/media/audio/ if is_shared)
volume: float = 1.0 # Volume multiplier (0.0-1.0) volume: float = 1.0 # Volume multiplier (0.0-1.0)
loop: bool = False # If True, loop for entire duration from trigger point loop: bool = False # If True, loop for entire duration from trigger point
overlap: Optional[float] = None # Crossfade overlap in seconds when looping overlap: Optional[float] = None # Crossfade overlap in seconds when looping
@@ -345,6 +345,7 @@ class AudioDefinition:
False # If True, audio continues playing during narration pauses False # If True, audio continues playing during narration pauses
) )
duration: Optional[float] = None # Pre-probed duration in seconds (set by import) duration: Optional[float] = None # Pre-probed duration in seconds (set by import)
is_shared: bool = False # If True, file is relative to shared_assets/media/audio/
@dataclass @dataclass
+3 -2
View File
@@ -374,6 +374,7 @@ def parse_audio(
overlap=overlap, overlap=overlap,
ignore_pauses=bool(audio_data.get("ignore_pauses", False)), ignore_pauses=bool(audio_data.get("ignore_pauses", False)),
duration=float(raw_duration) if raw_duration is not None else None, duration=float(raw_duration) if raw_duration is not None else None,
is_shared=bool(audio_data.get("is_shared", False)),
) )
return audio, audio_dir return audio, audio_dir
@@ -494,8 +495,8 @@ def parse_videos(
filter_list = filter_value filter_list = filter_value
# Handle skip/take - can use begin/end as user-friendly alternatives # Handle skip/take - can use begin/end as user-friendly alternatives
skip = video_data.get("skip", 0.0) skip = float(video_data.get("skip") or 0.0)
take = video_data.get("take") take = float(video_data["take"]) if video_data.get("take") not in (None, "") else None
# Convert begin/end to skip/take if provided # Convert begin/end to skip/take if provided
if "begin" in video_data and video_data["begin"]: if "begin" in video_data and video_data["begin"]:
+15 -4
View File
@@ -26,7 +26,7 @@ CHUNK_DURATION = 60
# Resolution presets for preview/proxy workflow # Resolution presets for preview/proxy workflow
# Each entry: (width, height, subdir_name) # Each entry: (width, height, subdir_name)
RES_CONFIGS: dict[str, tuple[int, int, str] | None] = { RES_CONFIGS: dict[str, Optional[tuple]] = {
"full": None, # no downscale, no subdir "full": None, # no downscale, no subdir
"low": (490, 270, "low"), "low": (490, 270, "low"),
"tiny": (320, 180, "proxy"), # "proxy" subdir kept for backward compat "tiny": (320, 180, "proxy"), # "proxy" subdir kept for backward compat
@@ -120,8 +120,12 @@ def create_downscaled_video(
"ultrafast", "ultrafast",
"-crf", "-crf",
"28", "28",
"-vsync",
"cfr",
"-c:a", "-c:a",
"copy", "aac", # re-encode audio so both streams share the same PTS origin,
"-ar", # avoiding the lip-sync drift caused by libx264 encoder delay
"48000", # when audio is copied with its original timestamps
str(out_path), str(out_path),
] ]
result = subprocess.run(cmd, capture_output=True, text=True) result = subprocess.run(cmd, capture_output=True, text=True)
@@ -2189,6 +2193,7 @@ def stitch_narration_segments(
output_path: Path, output_path: Path,
verbose: bool = False, verbose: bool = False,
default_end_trim: float = 0.0, default_end_trim: float = 0.0,
loudnorm_config: Optional[dict] = None,
) -> Path: ) -> Path:
""" """
Stitch multiple narration video segments into a single file. Stitch multiple narration video segments into a single file.
@@ -2379,7 +2384,13 @@ def stitch_narration_segments(
output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}" output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}"
) )
# Use EBU R128 loudnorm targeting YouTube's recommended levels # Build loudnorm filter string from project config (or fall back to defaults)
_cfg = loudnorm_config or {}
_lufs = float(_cfg.get("target_lufs", -14))
_lra = float(_cfg.get("target_lra", 11))
_tp = float(_cfg.get("target_tp", -1.5))
loudnorm_filter = f"loudnorm=I={_lufs:.1f}:LRA={_lra:.1f}:TP={_tp:.1f}"
loudnorm_cmd = [ loudnorm_cmd = [
"ffmpeg", "ffmpeg",
"-y", "-y",
@@ -2388,7 +2399,7 @@ def stitch_narration_segments(
"-c:v", "-c:v",
"copy", "copy",
"-af", "-af",
"loudnorm=I=-14:LRA=11:TP=-1.5", loudnorm_filter,
"-c:a", "-c:a",
"aac", "aac",
"-b:a", "-b:a",
+50 -45
View File
@@ -395,7 +395,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
video_path = _resolve_video_path( video_path = _resolve_video_path(
videos_dir, event.video_source, shared_assets_dir, project_path videos_dir, event.video_source, shared_assets_dir, project_path
) )
skip = event.video_source.skip skip = event.video_source.skip or 0.0
if skip > 0: if skip > 0:
cmd.extend(["-ss", f"{skip:.3f}"]) cmd.extend(["-ss", f"{skip:.3f}"])
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"]) cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
@@ -425,7 +425,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
video_path = _resolve_video_path( video_path = _resolve_video_path(
videos_dir, event.video_source, shared_assets_dir, project_path videos_dir, event.video_source, shared_assets_dir, project_path
) )
skip = event.video_source.skip skip = event.video_source.skip or 0.0
if skip > 0: if skip > 0:
cmd.extend(["-ss", f"{skip:.3f}"]) cmd.extend(["-ss", f"{skip:.3f}"])
cmd.extend(["-analyzeduration", "0", "-probesize", "1000"]) cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
@@ -455,6 +455,9 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
for event in plan.audio_events: for event in plan.audio_events:
if event.audio_id not in audio_inputs: if event.audio_id not in audio_inputs:
if event.audio_def.is_shared and plan.shared_assets_dir:
audio_path = plan.shared_assets_dir / "media" / "audio" / event.audio_def.file
else:
audio_path = audio_dir / event.audio_def.file audio_path = audio_dir / event.audio_def.file
audio_path, _ = resolve_with_cache(audio_path, project_path) audio_path, _ = resolve_with_cache(audio_path, project_path)
# Use pre-probed duration from audio.json if available (set by import). # Use pre-probed duration from audio.json if available (set by import).
@@ -802,13 +805,14 @@ def build_filter_complex(
""" """
Build the filter_complex string for FFmpeg. Build the filter_complex string for FFmpeg.
Layer structure: Layer structure (bottom to top):
- Layer 1: Background (solid color, image, or video) - Layer 1: Background (solid color, image, or video)
- Layer 2: Always visible videos (like talking head) in cutouts - Layer 2: "below" triggered videos (vfb/vsb) — behind talking head
- Layer 3: Slides (with time-based enable) - Layer 3: Always visible videos (like talking head) in cutouts
- Layer 4: Triggered videos in cutouts (with time-based enable) - Layer 4: Slides (with time-based enable)
- Layer 5: Camera transform - Layer 5: "above" triggered videos (vft/vst) — in front of slides
- Layer 6: Outro videos (fullscreen, after narration ends) - Layer 6: Camera transform
- Layer 7: Outro videos (fullscreen, after narration ends)
- Audio: Main audio mixed with triggered sound effects and outro audio - Audio: Main audio mixed with triggered sound effects and outro audio
""" """
outro_inputs = outro_inputs or {} outro_inputs = outro_inputs or {}
@@ -835,6 +839,44 @@ def build_filter_complex(
current_label = "bg" current_label = "bg"
# Add "below" triggered video overlays (vfb/vsb) BEFORE the talking head
# so they sit behind it in the composite stack.
for i, event in enumerate(plan.video_events):
if event.layer != "below":
continue
video_idx = video_inputs[i]
cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
event.cutout, width, height
)
duration = event.end_time - event.start_time
if event.video_source.take is not None:
duration = min(duration, event.video_source.take)
effective_end = event.start_time + duration
zoom = event.video_source.zoom
zoomed_width = int(cut_width * zoom)
zoomed_height = int(cut_height * zoom)
video_label = f"tvb{i}"
start_pts = event.start_time
filters.append(
f"[{video_idx}:v]format=yuva444p10le,"
f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
f"format=rgba[{video_label}]"
)
next_label = f"tvbbase{i}"
enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
filters.append(
f"[{current_label}][{video_label}]overlay="
f"x={cut_x}:y={cut_y}:enable={enable_expr}"
f"[{next_label}]"
)
current_label = next_label
# Overlay always_visible videos (like talking head) # Overlay always_visible videos (like talking head)
# If there are narration pauses, we need to segment the video # If there are narration pauses, we need to segment the video
for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos): for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos):
@@ -898,43 +940,6 @@ def build_filter_complex(
) )
current_label = next_label current_label = next_label
# Add "below-slides" triggered video overlays (vfb/vsb or layer="below")
for i, event in enumerate(plan.video_events):
if event.layer != "below":
continue
video_idx = video_inputs[i]
cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
event.cutout, width, height
)
duration = event.end_time - event.start_time
if event.video_source.take is not None:
duration = min(duration, event.video_source.take)
effective_end = event.start_time + duration
zoom = event.video_source.zoom
zoomed_width = int(cut_width * zoom)
zoomed_height = int(cut_height * zoom)
video_label = f"tvb{i}"
start_pts = event.start_time
filters.append(
f"[{video_idx}:v]format=yuva444p10le,"
f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
f"format=rgba[{video_label}]"
)
next_label = f"tvbbase{i}"
enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
filters.append(
f"[{current_label}][{video_label}]overlay="
f"x={cut_x}:y={cut_y}:enable={enable_expr}"
f"[{next_label}]"
)
current_label = next_label
# Add slide overlays with time-based enable # Add slide overlays with time-based enable
for i, event in enumerate(plan.slide_events): for i, event in enumerate(plan.slide_events):
slide_idx = slide_inputs[event.slide_id] slide_idx = slide_inputs[event.slide_id]
+12 -4
View File
@@ -139,10 +139,14 @@ def _is_known_marker(
"narration:", "narration:",
"vft:", "vft:",
"vfb:", "vfb:",
"vf2t:",
"vf2b:",
"vst:", "vst:",
"vsb:", "vsb:",
"vftp:", "vftp:",
"vfbp:", "vfbp:",
"vf2tp:",
"vf2bp:",
"vstp:", "vstp:",
"vsbp:", "vsbp:",
) )
@@ -627,8 +631,8 @@ def build_render_plan(
# Before extracting video events, resolve any referenced videos that are missing # Before extracting video events, resolve any referenced videos that are missing
# from the project's videos.json by looking them up in shared_assets/videos.json. # from the project's videos.json by looking them up in shared_assets/videos.json.
_VIDEO_MARKER_PREFIXES = ( _VIDEO_MARKER_PREFIXES = (
"video:", "narration:", "vft:", "vfb:", "vst:", "vsb:", "video:", "narration:", "vft:", "vfb:", "vf2t:", "vf2b:", "vst:", "vsb:",
"vftp:", "vfbp:", "vstp:", "vsbp:", "vftp:", "vfbp:", "vf2tp:", "vf2bp:", "vstp:", "vsbp:",
) )
missing_video_ids = [ missing_video_ids = [
timing.marker_id[len(prefix):] timing.marker_id[len(prefix):]
@@ -897,8 +901,8 @@ def _extract_slide_events(
events: list[SlideEvent] = [] events: list[SlideEvent] = []
for i, (marker_time, marker_id) in enumerate(resolved): for i, (marker_time, marker_id) in enumerate(resolved):
# Each slide starts at its own marker time # First slide always starts at 0 — it's the opening state of the presentation.
start_time = marker_time start_time = 0.0 if i == 0 else marker_time
# End time is when the NEXT slide's marker appears, or end of video # End time is when the NEXT slide's marker appears, or end of video
if i + 1 < len(resolved): if i + 1 < len(resolved):
@@ -957,10 +961,14 @@ def _extract_video_events(
_SHORTHAND: dict[str, tuple[str, str]] = { _SHORTHAND: dict[str, tuple[str, str]] = {
"vft:": ("fullscreen", "above"), "vft:": ("fullscreen", "above"),
"vfb:": ("fullscreen", "below"), "vfb:": ("fullscreen", "below"),
"vf2t:": ("fullscreen2", "above"),
"vf2b:": ("fullscreen2", "below"),
"vst:": ("square", "above"), "vst:": ("square", "above"),
"vsb:": ("square", "below"), "vsb:": ("square", "below"),
"vftp:": ("fullscreen", "above", "pause_narration"), "vftp:": ("fullscreen", "above", "pause_narration"),
"vfbp:": ("fullscreen", "below", "pause_narration"), "vfbp:": ("fullscreen", "below", "pause_narration"),
"vf2tp:": ("fullscreen2", "above", "pause_narration"),
"vf2bp:": ("fullscreen2", "below", "pause_narration"),
"vstp:": ("square", "above", "pause_narration"), "vstp:": ("square", "above", "pause_narration"),
"vsbp:": ("square", "below", "pause_narration"), "vsbp:": ("square", "below", "pause_narration"),
} }
+8
View File
@@ -66,8 +66,16 @@ def validate_project(
"video:": 6, "video:": 6,
"vft:": 4, "vft:": 4,
"vfb:": 4, "vfb:": 4,
"vf2t:": 5,
"vf2b:": 5,
"vst:": 4, "vst:": 4,
"vsb:": 4, "vsb:": 4,
"vftp:": 5,
"vfbp:": 5,
"vf2tp:": 6,
"vf2bp:": 6,
"vstp:": 5,
"vsbp:": 5,
} }
matched_prefix = next( matched_prefix = next(
(p for p in _VIDEO_PREFIXES if marker.startswith(p)), None (p for p in _VIDEO_PREFIXES if marker.startswith(p)), None