From feb4df050676c792e752d092e44dc5c55b009b7e Mon Sep 17 00:00:00 2001 From: jenstandstad Date: Mon, 11 May 2026 21:45:30 +0200 Subject: [PATCH] Adding some files --- gnommo/cli.py | 50 +++++++++++++---- gnommo/preprocessor.py | 14 +++++ gnommo/transformer.py | 121 ++++++++++++++++++++++++++++++----------- 3 files changed, 142 insertions(+), 43 deletions(-) diff --git a/gnommo/cli.py b/gnommo/cli.py index b0124fc..56674b0 100644 --- a/gnommo/cli.py +++ b/gnommo/cli.py @@ -234,7 +234,7 @@ Examples: args.res, ) elif action == "trim": - return cmd_trim(project_path, args.verbose, args.force, args.threshold) + return cmd_trim(project_path, args.verbose, args.force, args.threshold, args.res) elif action == "transcode": return cmd_transcode( project_path, @@ -1197,7 +1197,7 @@ def cmd_preprocess( """ from concurrent.futures import ThreadPoolExecutor, as_completed from .parser import parse_project_config, parse_videos - from .preprocessor import preprocess_video + from .preprocessor import preprocess_video, RES_CONFIGS from .models import VideoSource as _VideoSource mode_str = f" ({res.upper()})" if res != "full" else "" @@ -1278,7 +1278,15 @@ def cmd_preprocess( if using_compressed and segment_id.endswith("_compressed"): segment_id = segment_id[: -len("_compressed")] - output_file = f"processed/{segment_id}_processed.mov" + # For non-full res, write into the res subdir so stitch --res low finds the + # files at narration/low/processed/ (narration.json still records the plain + # "processed/..." path; stitch shifts the base dir itself). + _res_cfg = RES_CONFIGS.get(res) if res != "full" else None + if _res_cfg: + _, _, _subdir = _res_cfg + output_file = f"{_subdir}/processed/{segment_id}_processed.mov" + else: + output_file = f"processed/{segment_id}_processed.mov" output_path = narration_dir / output_file if output_path.exists() and not force: @@ -1343,6 +1351,7 @@ def cmd_preprocess( verbose=False, force=force, custom_gnommo_scratch=gnommo_scratch, + res=res, ) return task @@ -1371,6 +1380,7 @@ def cmd_preprocess( verbose, force, gnommo_scratch, + res=res, ) output_path = narration_dir / segment_source.output_file if output_path.exists(): @@ -1396,8 +1406,8 @@ def cmd_preprocess( for key in _PRESERVE_KEYS: if key in existing_entry: entry[key] = existing_entry[key] - # Point source_file to the processed output - entry["source_file"] = segment_source.output_file + # Always record the plain path; stitch shifts the base dir for low/tiny. + entry["source_file"] = f"processed/{segment_id}_processed.mov" entry.setdefault("use_audio_channels", "auto") entry.setdefault("defer_loudnorm", True) existing_narration[segment_id] = entry @@ -1437,7 +1447,7 @@ def cmd_preprocess( continue print(f" Processing: {video_id}") preprocess_video( - videos_dir, video_id, video_source, verbose, force, gnommo_scratch + videos_dir, video_id, video_source, verbose, force, gnommo_scratch, res=res ) print("\nPreprocessing complete.") @@ -1454,6 +1464,7 @@ def cmd_trim( verbose: bool, force: bool = False, threshold_db: float = -40.0, + res: str = "full", ) -> int: """ Auto-detect silence bounds for all narration segments and write skip/take @@ -1482,6 +1493,22 @@ def cmd_trim( print(" Run 'gnommo -p import' first.") return 1 + # Build a lookup of raw source files by segment ID. Raw files give cleaner + # silence detection — loudnorm can introduce early peaks in processed audio. + _video_exts = {".mov", ".mp4", ".avi", ".mkv", ".m4v"} + raw_dir = narration_dir / "raw_mov" + compressed_dir = narration_dir / "raw_mp4" + + raw_lookup: dict[str, Path] = {} + for search_dir in (raw_dir, compressed_dir): + if search_dir.exists(): + for f in search_dir.iterdir(): + if f.is_file() and f.suffix.lower() in _video_exts and not f.name.startswith("."): + stem = f.stem + if stem.endswith("_compressed"): + stem = stem[: -len("_compressed")] + raw_lookup[stem] = f + narration_json_path = narration_dir / "narration.json" raw_data: dict = _read_json(narration_json_path) @@ -1495,14 +1522,15 @@ def cmd_trim( print(f" {seg_id}: already trimmed, skipping (use --force to redo)") continue - # Always analyse the raw source file — it's always present and has the - # same audio as any processed version (processing is video-only). - source_path = narration_dir / seg.source_file + # Prefer raw file; fall back to processed if raw not available. + source_path = raw_lookup.get(seg_id) + if source_path is None: + source_path = narration_dir / seg.source_file if not source_path.exists(): - print(f" {seg_id}: source file not found ({seg.source_file}), skipping") + print(f" {seg_id}: source file not found, skipping") continue - print(f" {seg_id}: analysing...", end="", flush=True) + print(f" {seg_id}: analysing {source_path.parent.name}/{source_path.name}...", end="", flush=True) first_sound, last_sound = detect_silence_bounds( source_path, noise_threshold_db=threshold_db, verbose=verbose ) diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py index 92e09bb..544c5fe 100644 --- a/gnommo/preprocessor.py +++ b/gnommo/preprocessor.py @@ -550,6 +550,7 @@ def preprocess_video( verbose: bool = False, force: bool = False, custom_gnommo_scratch: Optional[Path] = None, + res: str = "full", ) -> Path: """ Apply preprocessing filters to a video source. @@ -562,6 +563,7 @@ def preprocess_video( video_id: ID of the video being processed video_source: VideoSource with source_file, filter, and output_file custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD) + res: Resolution preset — when not "full", source is downscaled before filtering Returns: Path to the final preprocessed output file. @@ -586,6 +588,18 @@ def preprocess_video( filter_type=None, ) + # For non-full res, downscale the raw source first so all subsequent + # filters (chroma key, color grade, etc.) operate on the small file. + if res != "full": + cfg = RES_CONFIGS.get(res) + if cfg: + width, height, _ = cfg + print(f" Downscaling source to {width}x{height} ({res})...") + raw_low_dir = gnommo_scratch / f"raw_{res}" + current_input = create_downscaled_video( + current_input, raw_low_dir, width, height, force + ) + # Resolve channel setting (auto-detect if needed) and sanity check channel = video_source.use_audio_channels if channel == "auto": diff --git a/gnommo/transformer.py b/gnommo/transformer.py index 5b84fbd..d80db6b 100644 --- a/gnommo/transformer.py +++ b/gnommo/transformer.py @@ -182,14 +182,17 @@ def _extract_marker_contexts( slides: dict = None, videos: dict = None, audio: dict = None, -) -> list[tuple[str, str, bool]]: +) -> list[tuple[str, str, bool, str]]: """ Extract known markers and the text immediately following them from manuscript. Unknown markers are filtered out and stripped from following text. Note: [cite:...] markers are already stripped at parse time. - Returns list of (marker_id, following_text, is_borrowed) tuples for known markers only. + Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples. + anchor_type is "before" (default — place before the matched phrase) or + "after" (place at the end of the matched phrase — used for markers that + trail a narration block and have no following text of their own). """ slides = slides or {} videos = videos or {} @@ -227,7 +230,7 @@ def _extract_marker_contexts( for i, (marker_id, following_text) in enumerate(raw_contexts): if following_text: words = following_text.split()[:10] - contexts.append((marker_id, " ".join(words), False)) + contexts.append((marker_id, " ".join(words), False, "before")) else: borrowed = False for j in range(i + 1, len(raw_contexts)): @@ -236,11 +239,24 @@ def _extract_marker_contexts( if next_marker_id in (slides or {}): break words = next_text.split()[:10] - contexts.append((marker_id, " ".join(words), True)) + contexts.append((marker_id, " ".join(words), True, "before")) borrowed = True break if not borrowed: - contexts.append((marker_id, "", False)) + # No following text and blocked by a slide boundary — look + # backward for the tail of the preceding narration block and + # anchor to the END of those words instead of extrapolating. + preceding_text = "" + for k in range(i - 1, -1, -1): + if raw_contexts[k][1]: + preceding_text = raw_contexts[k][1] + break + if preceding_text: + words = preceding_text.split() + tail = " ".join(words[-6:]) + contexts.append((marker_id, tail, False, "after")) + else: + contexts.append((marker_id, "", False, "before")) return contexts @@ -250,13 +266,18 @@ def _fuzzy_match_ratio( transcription: list[TranscribedWord], start_idx: int, window_size: int = 10, + pre_filler: int = 30, + inter_filler: int = 3, ) -> tuple[float, int, int]: """ Calculate how many words from phrase match the transcription at start_idx. - Words are matched sequentially: each phrase word must appear at or after - the position of the previous match. This prevents false matches where - phrase words appear out of order or far into the window. + Words are matched sequentially. Two separate filler tolerances: + - pre_filler: max words before the FIRST phrase word (absorbs ad-libs) + - inter_filler: max words between consecutive phrase words (keeps the + match tight so common words don't stretch the window far + into later text, which would push last_idx past subsequent + markers' positions) Returns (ratio, first_match_offset, last_match_end_offset) where offsets are relative to start_idx. last_match_end_offset points past the last @@ -265,14 +286,13 @@ def _fuzzy_match_ratio( if not phrase_words: return 0.0, 0, 0 - words_to_check = min(len(phrase_words), window_size) - # +30 filler allowance: absorbs ad-libbed words spoken before or between - # the manuscript cue words without breaking the match ratio. - transcript_end = min(start_idx + words_to_check + 30, len(transcription)) - if start_idx >= len(transcription): return 0.0, 0, 0 + words_to_check = min(len(phrase_words), window_size) + # Window only needs to cover pre_filler + phrase words + inter_filler slack + transcript_end = min(start_idx + pre_filler + words_to_check + inter_filler, len(transcription)) + transcript_words = [ _normalize_token(transcription[j].word) for j in range(start_idx, transcript_end) @@ -290,7 +310,14 @@ def _fuzzy_match_ratio( continue words_checked += 1 - for j in range(t_pos, len(transcript_words)): + # First phrase word may be preceded by a long ad-lib; subsequent words + # should appear within a few positions of each other. + if matches == 0: + search_end = min(t_pos + pre_filler + 1, len(transcript_words)) + else: + search_end = min(t_pos + inter_filler + 1, len(transcript_words)) + + for j in range(t_pos, search_end): t_word = transcript_words[j] matched = False if normalized == t_word: @@ -344,7 +371,11 @@ def _find_phrase_timestamp( best_first_offset = first_offset best_end_offset = end_offset - if ratio >= 0.95: + # Sequential alignment: stop at the first position that clears the + # threshold. Continuing to scan the full transcript risks jumping + # to a higher-ratio match much later and skipping over subsequent + # markers' positions entirely. + if best_ratio >= fuzzy_threshold: break if best_ratio >= fuzzy_threshold and best_idx >= 0: @@ -393,8 +424,8 @@ def align_markers_to_transcription( last_idx = 0 last_end_time = 0.0 - for marker_id, following_text, is_borrowed in contexts: - if not following_text.strip(): + for marker_id, anchor_text, is_borrowed, anchor_type in contexts: + if not anchor_text.strip(): marker_time = last_end_time + 1.0 timings.append( MarkerTiming( @@ -408,34 +439,50 @@ def align_markers_to_transcription( continue idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp( - following_text, + anchor_text, transcription, start_from=last_idx, fuzzy_threshold=fuzzy_threshold, ) if idx >= 0: - adjusted_time = max(0.0, timestamp - 0.5) - timings.append( - MarkerTiming( - marker_id=marker_id, - timestamp=adjusted_time, - context=following_text[:50], - confidence=confidence, + if anchor_type == "after": + # Marker trails a narration block — place it at the END of the + # matched phrase (when those words finish being spoken). + end_idx = min(match_end_idx - 1, len(transcription) - 1) + marker_time = transcription[end_idx].end if transcription else 0.0 + timings.append( + MarkerTiming( + marker_id=marker_id, + timestamp=marker_time, + context=f"(end of: {anchor_text[:40]})", + confidence=confidence, + ) ) - ) - if not is_borrowed: last_idx = match_end_idx - if last_idx > 0 and last_idx <= len(transcription): - last_end_time = transcription[last_idx - 1].end - else: - last_end_time = transcription[-1].end if transcription else 0.0 + last_end_time = marker_time + else: + adjusted_time = max(0.0, timestamp - 0.5) + timings.append( + MarkerTiming( + marker_id=marker_id, + timestamp=adjusted_time, + context=anchor_text[:50], + confidence=confidence, + ) + ) + if not is_borrowed: + last_idx = match_end_idx + if last_idx > 0 and last_idx <= len(transcription): + last_end_time = transcription[last_idx - 1].end + else: + last_end_time = transcription[-1].end if transcription else 0.0 else: timings.append( MarkerTiming( marker_id=marker_id, timestamp=-1.0, - context=following_text[:50], + context=anchor_text[:50], confidence=0.0, ) ) @@ -696,6 +743,16 @@ def build_render_plan( # Save narration end time (before outro) narration_end_time = total_duration + # Resolve any outro videos missing from videos.json via shared_assets. + if config.outro: + missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos] + if missing_outro_ids: + found = resolve_missing_videos(missing_outro_ids, project_path, config) + videos.update(found) + still_missing = [vid_id for vid_id in config.outro if vid_id not in videos] + for vid_id in still_missing: + print(f" WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True) + # Build outro events (plays after narration ends) outro_events = _extract_outro_events( config.outro,