# Patch summary and review notes. Everything in this preamble sits before the
# first "diff --git" header and is not part of any hunk.
#
# NOTE(review): in this copy of the patch the original line breaks and
# indentation appear collapsed into single spaces; the hunk text below is left
# exactly as found.
#
# gnommo/cli.py
#   - cmd_import: calls the new _import_shared_audio() right after
#     _import_shared_assets(), and passes shared_assets_dir through to
#     _probe_audio_durations().
#   - _import_shared_audio (new): lists files in
#     <shared_assets_dir>/media/audio whose lower-cased suffix is one of
#     .mp3/.wav/.aac/.m4a/.ogg/.flac and whose name does not start with ".",
#     then adds one entry per file to the project's audio.json, keyed by the
#     file stem, with "file", "is_shared": True and "volume": 1.0. Ids already
#     present in audio.json are skipped, and the file is rewritten only when at
#     least one entry was added. The audio.json location is
#     project_path / config.audio_path when set, otherwise
#     project_path/media/audio/audio.json.
#   - _probe_audio_durations: new optional shared_assets_dir parameter; entries
#     flagged "is_shared" are looked up under <shared_assets_dir>/media/audio
#     when that directory is given, otherwise under audio_dir as before.
#   - _TASKS_VIDEO_PREFIXES: adds vf2t:/vf2b: and the pause variants
#     vftp:/vfbp:/vf2tp:/vf2bp:/vstp:/vsbp:; every value equals the length of
#     its prefix string.
#   - cmd_stitch: takes the first dict with type == "audio_normalize" from
#     config.default_filters["talkinghead"] and forwards it to
#     stitch_narration_segments() as loudnorm_config.
#   - _print_render_plan_details: both prefix tuples are replaced. The removed
#     tuples listed "vft:" twice and did not contain "vftp:"; the new tuples
#     list "video:" plus all twelve shorthand prefixes.
#   - _RSYNC_EXCLUDES: adds media/narration/low/ and media/videos/low/ (with
#     their "/**" forms).
#
# gnommo/models.py
#   - AudioDefinition: new field is_shared: bool = False; the comment on "file"
#     is extended to mention shared_assets/media/audio/.
#
# gnommo/parser.py
#   - parse_audio: passes is_shared=bool(audio_data.get("is_shared", False)).
#   - parse_videos: "skip" is coerced with float(... or 0.0); "take" is coerced
#     with float() unless it is None or "", in which case it stays None.
#
# gnommo/preprocessor.py
#   - RES_CONFIGS: annotation changes from
#     dict[str, tuple[int, int, str] | None] to dict[str, Optional[tuple]].
#   - create_downscaled_video: adds "-vsync" "cfr" and replaces audio "copy"
#     with "aac" at "-ar" "48000".
#   - stitch_narration_segments: new loudnorm_config parameter; the loudnorm
#     filter string is built from target_lufs / target_lra / target_tp
#     (defaults -14, 11, -1.5), each formatted with one decimal, so the default
#     string becomes "loudnorm=I=-14.0:LRA=11.0:TP=-1.5".
#
# gnommo/renderer.py
#   - build_ffmpeg_command: "event.video_source.skip or 0.0" in two places;
#     audio events whose definition has is_shared resolve their file under
#     plan.shared_assets_dir/media/audio when plan.shared_assets_dir is set.
#   - build_filter_complex: the loop that overlays layer == "below" video
#     events is moved from after the always-visible (narration) video loop to
#     before it; the docstring layer list is rewritten to seven layers.
#
# gnommo/transformer.py
#   - _is_known_marker, _VIDEO_MARKER_PREFIXES, _SHORTHAND: add vf2t:/vf2b:/
#     vf2tp:/vf2bp:; _SHORTHAND maps them to the "fullscreen2" cutout.
#   - _extract_slide_events: the first resolved slide now starts at 0.0 instead
#     of at its own marker time.
#
# gnommo/validator.py
#   - validate_project: _VIDEO_PREFIXES gets the same additions as
#     _TASKS_VIDEO_PREFIXES in cli.py.
#
# NOTE(review): _import_shared_audio keys entries by f.stem, so two shared
#   files that differ only by extension (e.g. a.mp3 and a.wav) collide; the
#   second one in sorted order is reported as "already in audio.json".
# NOTE(review): print(f" No new shared audio files to add") is an f-string
#   with no placeholders.
# NOTE(review): audio_json_path.relative_to(project_path) presumably raises
#   ValueError if config.audio_path is absolute and outside the project --
#   confirm whether absolute audio paths are allowed.
# NOTE(review): the new _probe_audio_durations signature uses Optional[Path];
#   the import block of cli.py is not visible here -- confirm Optional is
#   imported there.
# NOTE(review): "-vsync" is reported as deprecated in newer FFmpeg releases in
#   favour of "-fps_mode" -- TODO confirm against the minimum FFmpeg version
#   this project supports.
# NOTE(review): the same prefix list is now repeated in cli.py, transformer.py
#   and validator.py; the duplicated "vft:" entry removed by this patch shows
#   how these copies can drift. Consider one shared constant.
diff --git a/gnommo/cli.py b/gnommo/cli.py index 56674b0..0521c3d 100644 --- a/gnommo/cli.py +++ b/gnommo/cli.py @@ -372,10 +372,11 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int: shared_assets_dir = _find_shared_assets(project_path) if shared_assets_dir: _import_shared_assets(shared_assets_dir, verbose) + _import_shared_audio(shared_assets_dir, project_path, config, verbose) _sync_shared_videos_to_local(project_path, config, shared_assets_dir, verbose) # Probe and cache audio file durations into audio.json - _probe_audio_durations(project_path, config, force, verbose) + _probe_audio_durations(project_path, config, force, verbose, shared_assets_dir) # Probe and cache video metadata (duration, has_audio) into videos.json _probe_video_metadata(project_path, config, shared_assets_dir, force, verbose) @@ -384,8 +385,71 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int: return 0 +def _import_shared_audio( + shared_assets_dir: Path, + project_path: Path, + config, + verbose: bool, +) -> None: + """Import audio files from shared_assets/media/audio into the project's audio.json.""" + audio_extensions = {".mp3", ".wav", ".aac", ".m4a", ".ogg", ".flac"} + shared_audio_dir = shared_assets_dir / "media" / "audio" + + if not shared_audio_dir.exists(): + if verbose: + print(f" No shared audio dir found at {shared_audio_dir}") + return + + audio_files = sorted( + f + for f in shared_audio_dir.iterdir() + if f.is_file() + and f.suffix.lower() in audio_extensions + and not f.name.startswith(".") + ) + + if not audio_files: + if verbose: + print(f" No audio files found in {shared_audio_dir}") + return + + # Resolve project audio.json path + if config and config.audio_path: + audio_json_path = project_path / config.audio_path + else: + audio_json_path = project_path / "media" / "audio" / "audio.json" + + audio_json_path.parent.mkdir(parents=True, exist_ok=True) + existing: dict = _read_json(audio_json_path) if audio_json_path.exists() 
else {} + + added = 0 + for f in audio_files: + audio_id = f.stem + if audio_id in existing: + if verbose: + print(f" Skipping {audio_id} (already in audio.json)") + continue + existing[audio_id] = { + "file": f.name, + "is_shared": True, + "volume": 1.0, + } + added += 1 + if verbose: + print(f" Added shared audio: {audio_id}") + + if added > 0: + with open(audio_json_path, "w", encoding="utf-8") as fh: + json.dump(existing, fh, indent=2) + print(f" Updated {audio_json_path.relative_to(project_path)} (+{added} shared audio files)") + else: + if verbose: + print(f" No new shared audio files to add") + + def _probe_audio_durations( - project_path: Path, config, force: bool, verbose: bool + project_path: Path, config, force: bool, verbose: bool, + shared_assets_dir: Optional[Path] = None, ) -> None: """Probe and cache audio file durations into audio.json. @@ -413,7 +477,10 @@ def _probe_audio_durations( if verbose: print(f" Audio '{audio_id}': cached ({audio_data['duration']:.1f}s)") continue - audio_path = audio_dir / audio_data["file"] + if audio_data.get("is_shared") and shared_assets_dir: + audio_path = shared_assets_dir / "media" / "audio" / audio_data["file"] + else: + audio_path = audio_dir / audio_data["file"] if not audio_path.exists(): if verbose: print(f" Audio '{audio_id}': file not found, skipping") @@ -1060,8 +1127,16 @@ _TASKS_VIDEO_PREFIXES = { "video:": 6, "vft:": 4, "vfb:": 4, + "vf2t:": 5, + "vf2b:": 5, "vst:": 4, "vsb:": 4, + "vftp:": 5, + "vfbp:": 5, + "vf2tp:": 6, + "vf2bp:": 6, + "vstp:": 5, + "vsbp:": 5, "narration:": 10, } @@ -1993,6 +2068,14 @@ def cmd_stitch( print(f"\n Combined narration exists: {stitch_output.name}") print(" (use --force to regenerate)") else: + # Extract loudnorm config from talkinghead filter so stitch uses + # per-project settings instead of hardcoded defaults. 
+ _loudnorm_cfg = None + if config and config.default_filters: + for _f in (config.default_filters.get("talkinghead") or []): + if isinstance(_f, dict) and _f.get("type") == "audio_normalize": + _loudnorm_cfg = _f + break stitch_narration_segments( narration_dir, segment_ids, @@ -2000,6 +2083,7 @@ def cmd_stitch( stitch_output, verbose=verbose, default_end_trim=config.default_end_trim if config else 0.0, + loudnorm_config=_loudnorm_cfg, ) # Run import videos again, because at this point narration_combined might have been created. _import_videos(videos_dir, config, verbose) @@ -2127,14 +2211,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None: marker_id.startswith(p) for p in ( "video:", - "vft:", - "vfb:", - "vst:", - "vsb:", - "vft:", - "vfbp:", - "vstp:", - "vsbp:", + "vft:", "vfb:", "vf2t:", "vf2b:", + "vst:", "vsb:", + "vftp:", "vfbp:", "vf2tp:", "vf2bp:", + "vstp:", "vsbp:", ) ): aligned_count += 1 @@ -2142,14 +2222,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None: len(p) for p in ( "video:", - "vft:", - "vfb:", - "vst:", - "vsb:", - "vft:", - "vfbp:", - "vstp:", - "vsbp:", + "vft:", "vfb:", "vf2t:", "vf2b:", + "vst:", "vsb:", + "vftp:", "vfbp:", "vf2tp:", "vf2bp:", + "vstp:", "vsbp:", ) if marker_id.startswith(p) ) @@ -3066,6 +3142,11 @@ _RSYNC_EXCLUDES = [ "media/narration/processed/", "media/narration/processed/**", "media/videos/narration_combined.mov", + # Low-res preview files (generated locally, not synced) + "media/narration/low/", + "media/narration/low/**", + "media/videos/low/", + "media/videos/low/**", # Chunk scratch directories "**/chunks/", "**/chunks/**", diff --git a/gnommo/models.py b/gnommo/models.py index f3f264a..c1da839 100644 --- a/gnommo/models.py +++ b/gnommo/models.py @@ -337,7 +337,7 @@ class SlideEvent: class AudioDefinition: """Definition of an audio clip from audio.json.""" - file: str # Audio filename (relative to audio.json location) + file: str # Audio filename 
(relative to audio.json location, or to shared_assets/media/audio/ if is_shared) volume: float = 1.0 # Volume multiplier (0.0-1.0) loop: bool = False # If True, loop for entire duration from trigger point overlap: Optional[float] = None # Crossfade overlap in seconds when looping @@ -345,6 +345,7 @@ class AudioDefinition: False # If True, audio continues playing during narration pauses ) duration: Optional[float] = None # Pre-probed duration in seconds (set by import) + is_shared: bool = False # If True, file is relative to shared_assets/media/audio/ @dataclass diff --git a/gnommo/parser.py b/gnommo/parser.py index cc2a835..2ac4331 100644 --- a/gnommo/parser.py +++ b/gnommo/parser.py @@ -374,6 +374,7 @@ def parse_audio( overlap=overlap, ignore_pauses=bool(audio_data.get("ignore_pauses", False)), duration=float(raw_duration) if raw_duration is not None else None, + is_shared=bool(audio_data.get("is_shared", False)), ) return audio, audio_dir @@ -494,8 +495,8 @@ def parse_videos( filter_list = filter_value # Handle skip/take - can use begin/end as user-friendly alternatives - skip = video_data.get("skip", 0.0) - take = video_data.get("take") + skip = float(video_data.get("skip") or 0.0) + take = float(video_data["take"]) if video_data.get("take") not in (None, "") else None # Convert begin/end to skip/take if provided if "begin" in video_data and video_data["begin"]: diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py index 544c5fe..a2fe31f 100644 --- a/gnommo/preprocessor.py +++ b/gnommo/preprocessor.py @@ -26,7 +26,7 @@ CHUNK_DURATION = 60 # Resolution presets for preview/proxy workflow # Each entry: (width, height, subdir_name) -RES_CONFIGS: dict[str, tuple[int, int, str] | None] = { +RES_CONFIGS: dict[str, Optional[tuple]] = { "full": None, # no downscale, no subdir "low": (490, 270, "low"), "tiny": (320, 180, "proxy"), # "proxy" subdir kept for backward compat @@ -120,8 +120,12 @@ def create_downscaled_video( "ultrafast", "-crf", "28", + "-vsync", + 
"cfr", "-c:a", - "copy", + "aac", # re-encode audio so both streams share the same PTS origin, + "-ar", # avoiding the lip-sync drift caused by libx264 encoder delay + "48000", # when audio is copied with its original timestamps str(out_path), ] result = subprocess.run(cmd, capture_output=True, text=True) @@ -2189,6 +2193,7 @@ def stitch_narration_segments( output_path: Path, verbose: bool = False, default_end_trim: float = 0.0, + loudnorm_config: Optional[dict] = None, ) -> Path: """ Stitch multiple narration video segments into a single file. @@ -2379,7 +2384,13 @@ def stitch_narration_segments( output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}" ) - # Use EBU R128 loudnorm targeting YouTube's recommended levels + # Build loudnorm filter string from project config (or fall back to defaults) + _cfg = loudnorm_config or {} + _lufs = float(_cfg.get("target_lufs", -14)) + _lra = float(_cfg.get("target_lra", 11)) + _tp = float(_cfg.get("target_tp", -1.5)) + loudnorm_filter = f"loudnorm=I={_lufs:.1f}:LRA={_lra:.1f}:TP={_tp:.1f}" + loudnorm_cmd = [ "ffmpeg", "-y", @@ -2388,7 +2399,7 @@ def stitch_narration_segments( "-c:v", "copy", "-af", - "loudnorm=I=-14:LRA=11:TP=-1.5", + loudnorm_filter, "-c:a", "aac", "-b:a", diff --git a/gnommo/renderer.py b/gnommo/renderer.py index 02b0fba..3f7e76a 100644 --- a/gnommo/renderer.py +++ b/gnommo/renderer.py @@ -395,7 +395,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: video_path = _resolve_video_path( videos_dir, event.video_source, shared_assets_dir, project_path ) - skip = event.video_source.skip + skip = event.video_source.skip or 0.0 if skip > 0: cmd.extend(["-ss", f"{skip:.3f}"]) cmd.extend(["-analyzeduration", "0", "-probesize", "1000"]) @@ -425,7 +425,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: video_path = _resolve_video_path( videos_dir, event.video_source, shared_assets_dir, project_path ) - skip = event.video_source.skip + skip = 
event.video_source.skip or 0.0 if skip > 0: cmd.extend(["-ss", f"{skip:.3f}"]) cmd.extend(["-analyzeduration", "0", "-probesize", "1000"]) @@ -455,7 +455,10 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]: for event in plan.audio_events: if event.audio_id not in audio_inputs: - audio_path = audio_dir / event.audio_def.file + if event.audio_def.is_shared and plan.shared_assets_dir: + audio_path = plan.shared_assets_dir / "media" / "audio" / event.audio_def.file + else: + audio_path = audio_dir / event.audio_def.file audio_path, _ = resolve_with_cache(audio_path, project_path) # Use pre-probed duration from audio.json if available (set by import). # For MP3 without Xing/VBRI headers this is critical — FFmpeg otherwise @@ -802,13 +805,14 @@ def build_filter_complex( """ Build the filter_complex string for FFmpeg. - Layer structure: + Layer structure (bottom to top): - Layer 1: Background (solid color, image, or video) - - Layer 2: Always visible videos (like talking head) in cutouts - - Layer 3: Slides (with time-based enable) - - Layer 4: Triggered videos in cutouts (with time-based enable) - - Layer 5: Camera transform - - Layer 6: Outro videos (fullscreen, after narration ends) + - Layer 2: "below" triggered videos (vfb/vsb) — behind talking head + - Layer 3: Always visible videos (like talking head) in cutouts + - Layer 4: Slides (with time-based enable) + - Layer 5: "above" triggered videos (vft/vst) — in front of slides + - Layer 6: Camera transform + - Layer 7: Outro videos (fullscreen, after narration ends) - Audio: Main audio mixed with triggered sound effects and outro audio """ outro_inputs = outro_inputs or {} @@ -835,6 +839,44 @@ def build_filter_complex( current_label = "bg" + # Add "below" triggered video overlays (vfb/vsb) BEFORE the talking head + # so they sit behind it in the composite stack. 
+ for i, event in enumerate(plan.video_events): + if event.layer != "below": + continue + video_idx = video_inputs[i] + cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( + event.cutout, width, height + ) + + duration = event.end_time - event.start_time + if event.video_source.take is not None: + duration = min(duration, event.video_source.take) + effective_end = event.start_time + duration + + zoom = event.video_source.zoom + zoomed_width = int(cut_width * zoom) + zoomed_height = int(cut_height * zoom) + + video_label = f"tvb{i}" + start_pts = event.start_time + filters.append( + f"[{video_idx}:v]format=yuva444p10le," + f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB," + f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," + f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," + f"format=rgba[{video_label}]" + ) + + next_label = f"tvbbase{i}" + enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})" + filters.append( + f"[{current_label}][{video_label}]overlay=" + f"x={cut_x}:y={cut_y}:enable={enable_expr}" + f"[{next_label}]" + ) + current_label = next_label + # Overlay always_visible videos (like talking head) # If there are narration pauses, we need to segment the video for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos): @@ -898,43 +940,6 @@ def build_filter_complex( ) current_label = next_label - # Add "below-slides" triggered video overlays (vfb/vsb or layer="below") - for i, event in enumerate(plan.video_events): - if event.layer != "below": - continue - video_idx = video_inputs[i] - cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position( - event.cutout, width, height - ) - - duration = event.end_time - event.start_time - if event.video_source.take is not None: - duration = min(duration, event.video_source.take) - effective_end = event.start_time + duration - - zoom = event.video_source.zoom - zoomed_width = int(cut_width * zoom) - zoomed_height = 
int(cut_height * zoom) - - video_label = f"tvb{i}" - start_pts = event.start_time - filters.append( - f"[{video_idx}:v]format=yuva444p10le," - f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB," - f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase," - f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2," - f"format=rgba[{video_label}]" - ) - - next_label = f"tvbbase{i}" - enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})" - filters.append( - f"[{current_label}][{video_label}]overlay=" - f"x={cut_x}:y={cut_y}:enable={enable_expr}" - f"[{next_label}]" - ) - current_label = next_label - # Add slide overlays with time-based enable for i, event in enumerate(plan.slide_events): slide_idx = slide_inputs[event.slide_id] diff --git a/gnommo/transformer.py b/gnommo/transformer.py index d80db6b..4634621 100644 --- a/gnommo/transformer.py +++ b/gnommo/transformer.py @@ -139,10 +139,14 @@ def _is_known_marker( "narration:", "vft:", "vfb:", + "vf2t:", + "vf2b:", "vst:", "vsb:", "vftp:", "vfbp:", + "vf2tp:", + "vf2bp:", "vstp:", "vsbp:", ) @@ -627,8 +631,8 @@ def build_render_plan( # Before extracting video events, resolve any referenced videos that are missing # from the project's videos.json by looking them up in shared_assets/videos.json. _VIDEO_MARKER_PREFIXES = ( - "video:", "narration:", "vft:", "vfb:", "vst:", "vsb:", - "vftp:", "vfbp:", "vstp:", "vsbp:", + "video:", "narration:", "vft:", "vfb:", "vf2t:", "vf2b:", "vst:", "vsb:", + "vftp:", "vfbp:", "vf2tp:", "vf2bp:", "vstp:", "vsbp:", ) missing_video_ids = [ timing.marker_id[len(prefix):] @@ -897,8 +901,8 @@ def _extract_slide_events( events: list[SlideEvent] = [] for i, (marker_time, marker_id) in enumerate(resolved): - # Each slide starts at its own marker time - start_time = marker_time + # First slide always starts at 0 — it's the opening state of the presentation. 
+ start_time = 0.0 if i == 0 else marker_time # End time is when the NEXT slide's marker appears, or end of video if i + 1 < len(resolved): @@ -957,10 +961,14 @@ def _extract_video_events( _SHORTHAND: dict[str, tuple[str, str]] = { "vft:": ("fullscreen", "above"), "vfb:": ("fullscreen", "below"), + "vf2t:": ("fullscreen2", "above"), + "vf2b:": ("fullscreen2", "below"), "vst:": ("square", "above"), "vsb:": ("square", "below"), "vftp:": ("fullscreen", "above", "pause_narration"), "vfbp:": ("fullscreen", "below", "pause_narration"), + "vf2tp:": ("fullscreen2", "above", "pause_narration"), + "vf2bp:": ("fullscreen2", "below", "pause_narration"), "vstp:": ("square", "above", "pause_narration"), "vsbp:": ("square", "below", "pause_narration"), } diff --git a/gnommo/validator.py b/gnommo/validator.py index 7e30e8c..242fd67 100644 --- a/gnommo/validator.py +++ b/gnommo/validator.py @@ -66,8 +66,16 @@ def validate_project( "video:": 6, "vft:": 4, "vfb:": 4, + "vf2t:": 5, + "vf2b:": 5, "vst:": 4, "vsb:": 4, + "vftp:": 5, + "vfbp:": 5, + "vf2tp:": 6, + "vf2bp:": 6, + "vstp:": 5, + "vsbp:": 5, } matched_prefix = next( (p for p in _VIDEO_PREFIXES if marker.startswith(p)), None