Adding some files

This commit is contained in:
2026-05-11 21:45:30 +02:00
parent b9376cd650
commit feb4df0506
3 changed files with 142 additions and 43 deletions
+39 -11
View File
@@ -234,7 +234,7 @@ Examples:
args.res, args.res,
) )
elif action == "trim": elif action == "trim":
return cmd_trim(project_path, args.verbose, args.force, args.threshold) return cmd_trim(project_path, args.verbose, args.force, args.threshold, args.res)
elif action == "transcode": elif action == "transcode":
return cmd_transcode( return cmd_transcode(
project_path, project_path,
@@ -1197,7 +1197,7 @@ def cmd_preprocess(
""" """
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from .parser import parse_project_config, parse_videos from .parser import parse_project_config, parse_videos
from .preprocessor import preprocess_video from .preprocessor import preprocess_video, RES_CONFIGS
from .models import VideoSource as _VideoSource from .models import VideoSource as _VideoSource
mode_str = f" ({res.upper()})" if res != "full" else "" mode_str = f" ({res.upper()})" if res != "full" else ""
@@ -1278,7 +1278,15 @@ def cmd_preprocess(
if using_compressed and segment_id.endswith("_compressed"): if using_compressed and segment_id.endswith("_compressed"):
segment_id = segment_id[: -len("_compressed")] segment_id = segment_id[: -len("_compressed")]
output_file = f"processed/{segment_id}_processed.mov" # For non-full res, write into the res subdir so stitch --res low finds the
# files at narration/low/processed/ (narration.json still records the plain
# "processed/..." path; stitch shifts the base dir itself).
_res_cfg = RES_CONFIGS.get(res) if res != "full" else None
if _res_cfg:
_, _, _subdir = _res_cfg
output_file = f"{_subdir}/processed/{segment_id}_processed.mov"
else:
output_file = f"processed/{segment_id}_processed.mov"
output_path = narration_dir / output_file output_path = narration_dir / output_file
if output_path.exists() and not force: if output_path.exists() and not force:
@@ -1343,6 +1351,7 @@ def cmd_preprocess(
verbose=False, verbose=False,
force=force, force=force,
custom_gnommo_scratch=gnommo_scratch, custom_gnommo_scratch=gnommo_scratch,
res=res,
) )
return task return task
@@ -1371,6 +1380,7 @@ def cmd_preprocess(
verbose, verbose,
force, force,
gnommo_scratch, gnommo_scratch,
res=res,
) )
output_path = narration_dir / segment_source.output_file output_path = narration_dir / segment_source.output_file
if output_path.exists(): if output_path.exists():
@@ -1396,8 +1406,8 @@ def cmd_preprocess(
for key in _PRESERVE_KEYS: for key in _PRESERVE_KEYS:
if key in existing_entry: if key in existing_entry:
entry[key] = existing_entry[key] entry[key] = existing_entry[key]
# Point source_file to the processed output # Always record the plain path; stitch shifts the base dir for low/tiny.
entry["source_file"] = segment_source.output_file entry["source_file"] = f"processed/{segment_id}_processed.mov"
entry.setdefault("use_audio_channels", "auto") entry.setdefault("use_audio_channels", "auto")
entry.setdefault("defer_loudnorm", True) entry.setdefault("defer_loudnorm", True)
existing_narration[segment_id] = entry existing_narration[segment_id] = entry
@@ -1437,7 +1447,7 @@ def cmd_preprocess(
continue continue
print(f" Processing: {video_id}") print(f" Processing: {video_id}")
preprocess_video( preprocess_video(
videos_dir, video_id, video_source, verbose, force, gnommo_scratch videos_dir, video_id, video_source, verbose, force, gnommo_scratch, res=res
) )
print("\nPreprocessing complete.") print("\nPreprocessing complete.")
@@ -1454,6 +1464,7 @@ def cmd_trim(
verbose: bool, verbose: bool,
force: bool = False, force: bool = False,
threshold_db: float = -40.0, threshold_db: float = -40.0,
res: str = "full",
) -> int: ) -> int:
""" """
Auto-detect silence bounds for all narration segments and write skip/take Auto-detect silence bounds for all narration segments and write skip/take
@@ -1482,6 +1493,22 @@ def cmd_trim(
print(" Run 'gnommo -p <project> import' first.") print(" Run 'gnommo -p <project> import' first.")
return 1 return 1
# Build a lookup of raw source files by segment ID. Raw files give cleaner
# silence detection — loudnorm can introduce early peaks in processed audio.
_video_exts = {".mov", ".mp4", ".avi", ".mkv", ".m4v"}
raw_dir = narration_dir / "raw_mov"
compressed_dir = narration_dir / "raw_mp4"
raw_lookup: dict[str, Path] = {}
for search_dir in (raw_dir, compressed_dir):
if search_dir.exists():
for f in search_dir.iterdir():
if f.is_file() and f.suffix.lower() in _video_exts and not f.name.startswith("."):
stem = f.stem
if stem.endswith("_compressed"):
stem = stem[: -len("_compressed")]
raw_lookup[stem] = f
narration_json_path = narration_dir / "narration.json" narration_json_path = narration_dir / "narration.json"
raw_data: dict = _read_json(narration_json_path) raw_data: dict = _read_json(narration_json_path)
@@ -1495,14 +1522,15 @@ def cmd_trim(
print(f" {seg_id}: already trimmed, skipping (use --force to redo)") print(f" {seg_id}: already trimmed, skipping (use --force to redo)")
continue continue
# Always analyse the raw source file — it's always present and has the # Prefer raw file; fall back to processed if raw not available.
# same audio as any processed version (processing is video-only). source_path = raw_lookup.get(seg_id)
source_path = narration_dir / seg.source_file if source_path is None:
source_path = narration_dir / seg.source_file
if not source_path.exists(): if not source_path.exists():
print(f" {seg_id}: source file not found ({seg.source_file}), skipping") print(f" {seg_id}: source file not found, skipping")
continue continue
print(f" {seg_id}: analysing...", end="", flush=True) print(f" {seg_id}: analysing {source_path.parent.name}/{source_path.name}...", end="", flush=True)
first_sound, last_sound = detect_silence_bounds( first_sound, last_sound = detect_silence_bounds(
source_path, noise_threshold_db=threshold_db, verbose=verbose source_path, noise_threshold_db=threshold_db, verbose=verbose
) )
+14
View File
@@ -550,6 +550,7 @@ def preprocess_video(
verbose: bool = False, verbose: bool = False,
force: bool = False, force: bool = False,
custom_gnommo_scratch: Optional[Path] = None, custom_gnommo_scratch: Optional[Path] = None,
res: str = "full",
) -> Path: ) -> Path:
""" """
Apply preprocessing filters to a video source. Apply preprocessing filters to a video source.
@@ -562,6 +563,7 @@ def preprocess_video(
video_id: ID of the video being processed video_id: ID of the video being processed
video_source: VideoSource with source_file, filter, and output_file video_source: VideoSource with source_file, filter, and output_file
custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD) custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD)
res: Resolution preset — when not "full", source is downscaled before filtering
Returns: Returns:
Path to the final preprocessed output file. Path to the final preprocessed output file.
@@ -586,6 +588,18 @@ def preprocess_video(
filter_type=None, filter_type=None,
) )
# For non-full res, downscale the raw source first so all subsequent
# filters (chroma key, color grade, etc.) operate on the small file.
if res != "full":
cfg = RES_CONFIGS.get(res)
if cfg:
width, height, _ = cfg
print(f" Downscaling source to {width}x{height} ({res})...")
raw_low_dir = gnommo_scratch / f"raw_{res}"
current_input = create_downscaled_video(
current_input, raw_low_dir, width, height, force
)
# Resolve channel setting (auto-detect if needed) and sanity check # Resolve channel setting (auto-detect if needed) and sanity check
channel = video_source.use_audio_channels channel = video_source.use_audio_channels
if channel == "auto": if channel == "auto":
+89 -32
View File
@@ -182,14 +182,17 @@ def _extract_marker_contexts(
slides: dict = None, slides: dict = None,
videos: dict = None, videos: dict = None,
audio: dict = None, audio: dict = None,
) -> list[tuple[str, str, bool]]: ) -> list[tuple[str, str, bool, str]]:
""" """
Extract known markers and the text immediately following them from manuscript. Extract known markers and the text immediately following them from manuscript.
Unknown markers are filtered out and stripped from following text. Unknown markers are filtered out and stripped from following text.
Note: [cite:...] markers are already stripped at parse time. Note: [cite:...] markers are already stripped at parse time.
Returns list of (marker_id, following_text, is_borrowed) tuples for known markers only. Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples.
anchor_type is "before" (default — place before the matched phrase) or
"after" (place at the end of the matched phrase — used for markers that
trail a narration block and have no following text of their own).
""" """
slides = slides or {} slides = slides or {}
videos = videos or {} videos = videos or {}
@@ -227,7 +230,7 @@ def _extract_marker_contexts(
for i, (marker_id, following_text) in enumerate(raw_contexts): for i, (marker_id, following_text) in enumerate(raw_contexts):
if following_text: if following_text:
words = following_text.split()[:10] words = following_text.split()[:10]
contexts.append((marker_id, " ".join(words), False)) contexts.append((marker_id, " ".join(words), False, "before"))
else: else:
borrowed = False borrowed = False
for j in range(i + 1, len(raw_contexts)): for j in range(i + 1, len(raw_contexts)):
@@ -236,11 +239,24 @@ def _extract_marker_contexts(
if next_marker_id in (slides or {}): if next_marker_id in (slides or {}):
break break
words = next_text.split()[:10] words = next_text.split()[:10]
contexts.append((marker_id, " ".join(words), True)) contexts.append((marker_id, " ".join(words), True, "before"))
borrowed = True borrowed = True
break break
if not borrowed: if not borrowed:
contexts.append((marker_id, "", False)) # No following text and blocked by a slide boundary — look
# backward for the tail of the preceding narration block and
# anchor to the END of those words instead of extrapolating.
preceding_text = ""
for k in range(i - 1, -1, -1):
if raw_contexts[k][1]:
preceding_text = raw_contexts[k][1]
break
if preceding_text:
words = preceding_text.split()
tail = " ".join(words[-6:])
contexts.append((marker_id, tail, False, "after"))
else:
contexts.append((marker_id, "", False, "before"))
return contexts return contexts
@@ -250,13 +266,18 @@ def _fuzzy_match_ratio(
transcription: list[TranscribedWord], transcription: list[TranscribedWord],
start_idx: int, start_idx: int,
window_size: int = 10, window_size: int = 10,
pre_filler: int = 30,
inter_filler: int = 3,
) -> tuple[float, int, int]: ) -> tuple[float, int, int]:
""" """
Calculate how many words from phrase match the transcription at start_idx. Calculate how many words from phrase match the transcription at start_idx.
Words are matched sequentially: each phrase word must appear at or after Words are matched sequentially. Two separate filler tolerances:
the position of the previous match. This prevents false matches where - pre_filler: max words before the FIRST phrase word (absorbs ad-libs)
phrase words appear out of order or far into the window. - inter_filler: max words between consecutive phrase words (keeps the
match tight so common words don't stretch the window far
into later text, which would push last_idx past subsequent
markers' positions)
Returns (ratio, first_match_offset, last_match_end_offset) where offsets Returns (ratio, first_match_offset, last_match_end_offset) where offsets
are relative to start_idx. last_match_end_offset points past the last are relative to start_idx. last_match_end_offset points past the last
@@ -265,14 +286,13 @@ def _fuzzy_match_ratio(
if not phrase_words: if not phrase_words:
return 0.0, 0, 0 return 0.0, 0, 0
words_to_check = min(len(phrase_words), window_size)
# +30 filler allowance: absorbs ad-libbed words spoken before or between
# the manuscript cue words without breaking the match ratio.
transcript_end = min(start_idx + words_to_check + 30, len(transcription))
if start_idx >= len(transcription): if start_idx >= len(transcription):
return 0.0, 0, 0 return 0.0, 0, 0
words_to_check = min(len(phrase_words), window_size)
# Window only needs to cover pre_filler + phrase words + inter_filler slack
transcript_end = min(start_idx + pre_filler + words_to_check + inter_filler, len(transcription))
transcript_words = [ transcript_words = [
_normalize_token(transcription[j].word) _normalize_token(transcription[j].word)
for j in range(start_idx, transcript_end) for j in range(start_idx, transcript_end)
@@ -290,7 +310,14 @@ def _fuzzy_match_ratio(
continue continue
words_checked += 1 words_checked += 1
for j in range(t_pos, len(transcript_words)): # First phrase word may be preceded by a long ad-lib; subsequent words
# should appear within a few positions of each other.
if matches == 0:
search_end = min(t_pos + pre_filler + 1, len(transcript_words))
else:
search_end = min(t_pos + inter_filler + 1, len(transcript_words))
for j in range(t_pos, search_end):
t_word = transcript_words[j] t_word = transcript_words[j]
matched = False matched = False
if normalized == t_word: if normalized == t_word:
@@ -344,7 +371,11 @@ def _find_phrase_timestamp(
best_first_offset = first_offset best_first_offset = first_offset
best_end_offset = end_offset best_end_offset = end_offset
if ratio >= 0.95: # Sequential alignment: stop at the first position that clears the
# threshold. Continuing to scan the full transcript risks jumping
# to a higher-ratio match much later and skipping over subsequent
# markers' positions entirely.
if best_ratio >= fuzzy_threshold:
break break
if best_ratio >= fuzzy_threshold and best_idx >= 0: if best_ratio >= fuzzy_threshold and best_idx >= 0:
@@ -393,8 +424,8 @@ def align_markers_to_transcription(
last_idx = 0 last_idx = 0
last_end_time = 0.0 last_end_time = 0.0
for marker_id, following_text, is_borrowed in contexts: for marker_id, anchor_text, is_borrowed, anchor_type in contexts:
if not following_text.strip(): if not anchor_text.strip():
marker_time = last_end_time + 1.0 marker_time = last_end_time + 1.0
timings.append( timings.append(
MarkerTiming( MarkerTiming(
@@ -408,34 +439,50 @@ def align_markers_to_transcription(
continue continue
idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp( idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp(
following_text, anchor_text,
transcription, transcription,
start_from=last_idx, start_from=last_idx,
fuzzy_threshold=fuzzy_threshold, fuzzy_threshold=fuzzy_threshold,
) )
if idx >= 0: if idx >= 0:
adjusted_time = max(0.0, timestamp - 0.5) if anchor_type == "after":
timings.append( # Marker trails a narration block — place it at the END of the
MarkerTiming( # matched phrase (when those words finish being spoken).
marker_id=marker_id, end_idx = min(match_end_idx - 1, len(transcription) - 1)
timestamp=adjusted_time, marker_time = transcription[end_idx].end if transcription else 0.0
context=following_text[:50], timings.append(
confidence=confidence, MarkerTiming(
marker_id=marker_id,
timestamp=marker_time,
context=f"(end of: {anchor_text[:40]})",
confidence=confidence,
)
) )
)
if not is_borrowed:
last_idx = match_end_idx last_idx = match_end_idx
if last_idx > 0 and last_idx <= len(transcription): last_end_time = marker_time
last_end_time = transcription[last_idx - 1].end else:
else: adjusted_time = max(0.0, timestamp - 0.5)
last_end_time = transcription[-1].end if transcription else 0.0 timings.append(
MarkerTiming(
marker_id=marker_id,
timestamp=adjusted_time,
context=anchor_text[:50],
confidence=confidence,
)
)
if not is_borrowed:
last_idx = match_end_idx
if last_idx > 0 and last_idx <= len(transcription):
last_end_time = transcription[last_idx - 1].end
else:
last_end_time = transcription[-1].end if transcription else 0.0
else: else:
timings.append( timings.append(
MarkerTiming( MarkerTiming(
marker_id=marker_id, marker_id=marker_id,
timestamp=-1.0, timestamp=-1.0,
context=following_text[:50], context=anchor_text[:50],
confidence=0.0, confidence=0.0,
) )
) )
@@ -696,6 +743,16 @@ def build_render_plan(
# Save narration end time (before outro) # Save narration end time (before outro)
narration_end_time = total_duration narration_end_time = total_duration
# Resolve any outro videos missing from videos.json via shared_assets.
if config.outro:
missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos]
if missing_outro_ids:
found = resolve_missing_videos(missing_outro_ids, project_path, config)
videos.update(found)
still_missing = [vid_id for vid_id in config.outro if vid_id not in videos]
for vid_id in still_missing:
print(f" WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True)
# Build outro events (plays after narration ends) # Build outro events (plays after narration ends)
outro_events = _extract_outro_events( outro_events = _extract_outro_events(
config.outro, config.outro,