Adding some files

This commit is contained in:
2026-05-11 21:45:30 +02:00
parent b9376cd650
commit feb4df0506
3 changed files with 142 additions and 43 deletions
+39 -11
View File
@@ -234,7 +234,7 @@ Examples:
args.res,
)
elif action == "trim":
return cmd_trim(project_path, args.verbose, args.force, args.threshold)
return cmd_trim(project_path, args.verbose, args.force, args.threshold, args.res)
elif action == "transcode":
return cmd_transcode(
project_path,
@@ -1197,7 +1197,7 @@ def cmd_preprocess(
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from .parser import parse_project_config, parse_videos
from .preprocessor import preprocess_video
from .preprocessor import preprocess_video, RES_CONFIGS
from .models import VideoSource as _VideoSource
mode_str = f" ({res.upper()})" if res != "full" else ""
@@ -1278,7 +1278,15 @@ def cmd_preprocess(
if using_compressed and segment_id.endswith("_compressed"):
segment_id = segment_id[: -len("_compressed")]
output_file = f"processed/{segment_id}_processed.mov"
# For non-full res, write into the res subdir so stitch --res low finds the
# files at narration/low/processed/ (narration.json still records the plain
# "processed/..." path; stitch shifts the base dir itself).
_res_cfg = RES_CONFIGS.get(res) if res != "full" else None
if _res_cfg:
_, _, _subdir = _res_cfg
output_file = f"{_subdir}/processed/{segment_id}_processed.mov"
else:
output_file = f"processed/{segment_id}_processed.mov"
output_path = narration_dir / output_file
if output_path.exists() and not force:
@@ -1343,6 +1351,7 @@ def cmd_preprocess(
verbose=False,
force=force,
custom_gnommo_scratch=gnommo_scratch,
res=res,
)
return task
@@ -1371,6 +1380,7 @@ def cmd_preprocess(
verbose,
force,
gnommo_scratch,
res=res,
)
output_path = narration_dir / segment_source.output_file
if output_path.exists():
@@ -1396,8 +1406,8 @@ def cmd_preprocess(
for key in _PRESERVE_KEYS:
if key in existing_entry:
entry[key] = existing_entry[key]
# Point source_file to the processed output
entry["source_file"] = segment_source.output_file
# Always record the plain path; stitch shifts the base dir for low/tiny.
entry["source_file"] = f"processed/{segment_id}_processed.mov"
entry.setdefault("use_audio_channels", "auto")
entry.setdefault("defer_loudnorm", True)
existing_narration[segment_id] = entry
@@ -1437,7 +1447,7 @@ def cmd_preprocess(
continue
print(f" Processing: {video_id}")
preprocess_video(
videos_dir, video_id, video_source, verbose, force, gnommo_scratch
videos_dir, video_id, video_source, verbose, force, gnommo_scratch, res=res
)
print("\nPreprocessing complete.")
@@ -1454,6 +1464,7 @@ def cmd_trim(
verbose: bool,
force: bool = False,
threshold_db: float = -40.0,
res: str = "full",
) -> int:
"""
Auto-detect silence bounds for all narration segments and write skip/take
@@ -1482,6 +1493,22 @@ def cmd_trim(
print(" Run 'gnommo -p <project> import' first.")
return 1
# Build a lookup of raw source files by segment ID. Raw files give cleaner
# silence detection — loudnorm can introduce early peaks in processed audio.
_video_exts = {".mov", ".mp4", ".avi", ".mkv", ".m4v"}
raw_dir = narration_dir / "raw_mov"
compressed_dir = narration_dir / "raw_mp4"
raw_lookup: dict[str, Path] = {}
for search_dir in (raw_dir, compressed_dir):
if search_dir.exists():
for f in search_dir.iterdir():
if f.is_file() and f.suffix.lower() in _video_exts and not f.name.startswith("."):
stem = f.stem
if stem.endswith("_compressed"):
stem = stem[: -len("_compressed")]
raw_lookup[stem] = f
narration_json_path = narration_dir / "narration.json"
raw_data: dict = _read_json(narration_json_path)
@@ -1495,14 +1522,15 @@ def cmd_trim(
print(f" {seg_id}: already trimmed, skipping (use --force to redo)")
continue
# Always analyse the raw source file — it's always present and has the
# same audio as any processed version (processing is video-only).
source_path = narration_dir / seg.source_file
# Prefer raw file; fall back to processed if raw not available.
source_path = raw_lookup.get(seg_id)
if source_path is None:
source_path = narration_dir / seg.source_file
if not source_path.exists():
print(f" {seg_id}: source file not found ({seg.source_file}), skipping")
print(f" {seg_id}: source file not found, skipping")
continue
print(f" {seg_id}: analysing...", end="", flush=True)
print(f" {seg_id}: analysing {source_path.parent.name}/{source_path.name}...", end="", flush=True)
first_sound, last_sound = detect_silence_bounds(
source_path, noise_threshold_db=threshold_db, verbose=verbose
)
+14
View File
@@ -550,6 +550,7 @@ def preprocess_video(
verbose: bool = False,
force: bool = False,
custom_gnommo_scratch: Optional[Path] = None,
res: str = "full",
) -> Path:
"""
Apply preprocessing filters to a video source.
@@ -562,6 +563,7 @@ def preprocess_video(
video_id: ID of the video being processed
video_source: VideoSource with source_file, filter, and output_file
custom_gnommo_scratch: Optional external directory for intermediate files (e.g., SSD)
res: Resolution preset — when not "full" and listed in RES_CONFIGS, the source is downscaled before filtering; unrecognised presets skip the downscale
Returns:
Path to the final preprocessed output file.
@@ -586,6 +588,18 @@ def preprocess_video(
filter_type=None,
)
# For non-full res, downscale the raw source first so all subsequent
# filters (chroma key, color grade, etc.) operate on the small file.
if res != "full":
cfg = RES_CONFIGS.get(res)
if cfg:
width, height, _ = cfg
print(f" Downscaling source to {width}x{height} ({res})...")
raw_low_dir = gnommo_scratch / f"raw_{res}"
current_input = create_downscaled_video(
current_input, raw_low_dir, width, height, force
)
# Resolve channel setting (auto-detect if needed) and sanity check
channel = video_source.use_audio_channels
if channel == "auto":
+89 -32
View File
@@ -182,14 +182,17 @@ def _extract_marker_contexts(
slides: dict = None,
videos: dict = None,
audio: dict = None,
) -> list[tuple[str, str, bool]]:
) -> list[tuple[str, str, bool, str]]:
"""
Extract known markers and the text immediately following them from manuscript.
Unknown markers are filtered out and stripped from following text.
Note: [cite:...] markers are already stripped at parse time.
Returns list of (marker_id, following_text, is_borrowed) tuples for known markers only.
Returns list of (marker_id, anchor_text, is_borrowed, anchor_type) tuples.
anchor_type is "before" (default — place before the matched phrase) or
"after" (place at the end of the matched phrase — used for markers that
trail a narration block and have no following text of their own).
"""
slides = slides or {}
videos = videos or {}
@@ -227,7 +230,7 @@ def _extract_marker_contexts(
for i, (marker_id, following_text) in enumerate(raw_contexts):
if following_text:
words = following_text.split()[:10]
contexts.append((marker_id, " ".join(words), False))
contexts.append((marker_id, " ".join(words), False, "before"))
else:
borrowed = False
for j in range(i + 1, len(raw_contexts)):
@@ -236,11 +239,24 @@ def _extract_marker_contexts(
if next_marker_id in (slides or {}):
break
words = next_text.split()[:10]
contexts.append((marker_id, " ".join(words), True))
contexts.append((marker_id, " ".join(words), True, "before"))
borrowed = True
break
if not borrowed:
contexts.append((marker_id, "", False))
# No following text and nothing could be borrowed (a slide boundary
# intervened, or the scan ran out of later markers) — look backward
# for the last six words of the nearest preceding narration text and
# anchor to the END of those words instead of extrapolating.
preceding_text = ""
for k in range(i - 1, -1, -1):
if raw_contexts[k][1]:
preceding_text = raw_contexts[k][1]
break
if preceding_text:
words = preceding_text.split()
tail = " ".join(words[-6:])
contexts.append((marker_id, tail, False, "after"))
else:
contexts.append((marker_id, "", False, "before"))
return contexts
@@ -250,13 +266,18 @@ def _fuzzy_match_ratio(
transcription: list[TranscribedWord],
start_idx: int,
window_size: int = 10,
pre_filler: int = 30,
inter_filler: int = 3,
) -> tuple[float, int, int]:
"""
Calculate how many words from phrase match the transcription at start_idx.
Words are matched sequentially: each phrase word must appear at or after
the position of the previous match. This prevents false matches where
phrase words appear out of order or far into the window.
Words are matched sequentially. Two separate filler tolerances:
- pre_filler: max words before the first MATCHED phrase word (absorbs ad-libs)
- inter_filler: max words between consecutive phrase words (keeps the
match tight so common words don't stretch the window far
into later text, which would push last_idx past subsequent
markers' positions)
Returns (ratio, first_match_offset, last_match_end_offset) where offsets
are relative to start_idx. last_match_end_offset points past the last
@@ -265,14 +286,13 @@ def _fuzzy_match_ratio(
if not phrase_words:
return 0.0, 0, 0
words_to_check = min(len(phrase_words), window_size)
# +30 filler allowance: absorbs ad-libbed words spoken before or between
# the manuscript cue words without breaking the match ratio.
transcript_end = min(start_idx + words_to_check + 30, len(transcription))
if start_idx >= len(transcription):
return 0.0, 0, 0
words_to_check = min(len(phrase_words), window_size)
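# Window covers pre_filler + phrase words + a single inter_filler of slack.
# NOTE(review): inter_filler is allowed per gap in the search below but is
# budgeted only once here, so a long lead-in combined with several gaps can
# run past the window — confirm this is intended.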
transcript_end = min(start_idx + pre_filler + words_to_check + inter_filler, len(transcription))
transcript_words = [
_normalize_token(transcription[j].word)
for j in range(start_idx, transcript_end)
@@ -290,7 +310,14 @@ def _fuzzy_match_ratio(
continue
words_checked += 1
for j in range(t_pos, len(transcript_words)):
# First phrase word may be preceded by a long ad-lib; subsequent words
# should appear within a few positions of each other.
if matches == 0:
search_end = min(t_pos + pre_filler + 1, len(transcript_words))
else:
search_end = min(t_pos + inter_filler + 1, len(transcript_words))
for j in range(t_pos, search_end):
t_word = transcript_words[j]
matched = False
if normalized == t_word:
@@ -344,7 +371,11 @@ def _find_phrase_timestamp(
best_first_offset = first_offset
best_end_offset = end_offset
if ratio >= 0.95:
# Sequential alignment: stop at the first position that clears the
# threshold. Continuing to scan the full transcript risks jumping
# to a higher-ratio match much later and skipping over subsequent
# markers' positions entirely.
if best_ratio >= fuzzy_threshold:
break
if best_ratio >= fuzzy_threshold and best_idx >= 0:
@@ -393,8 +424,8 @@ def align_markers_to_transcription(
last_idx = 0
last_end_time = 0.0
for marker_id, following_text, is_borrowed in contexts:
if not following_text.strip():
for marker_id, anchor_text, is_borrowed, anchor_type in contexts:
if not anchor_text.strip():
marker_time = last_end_time + 1.0
timings.append(
MarkerTiming(
@@ -408,34 +439,50 @@ def align_markers_to_transcription(
continue
idx, timestamp, confidence, match_end_idx = _find_phrase_timestamp(
following_text,
anchor_text,
transcription,
start_from=last_idx,
fuzzy_threshold=fuzzy_threshold,
)
if idx >= 0:
adjusted_time = max(0.0, timestamp - 0.5)
timings.append(
MarkerTiming(
marker_id=marker_id,
timestamp=adjusted_time,
context=following_text[:50],
confidence=confidence,
if anchor_type == "after":
# Marker trails a narration block — place it at the END of the
# matched phrase (when those words finish being spoken).
end_idx = min(match_end_idx - 1, len(transcription) - 1)
marker_time = transcription[end_idx].end if transcription else 0.0
timings.append(
MarkerTiming(
marker_id=marker_id,
timestamp=marker_time,
context=f"(end of: {anchor_text[:40]})",
confidence=confidence,
)
)
)
if not is_borrowed:
last_idx = match_end_idx
if last_idx > 0 and last_idx <= len(transcription):
last_end_time = transcription[last_idx - 1].end
else:
last_end_time = transcription[-1].end if transcription else 0.0
last_end_time = marker_time
else:
adjusted_time = max(0.0, timestamp - 0.5)
timings.append(
MarkerTiming(
marker_id=marker_id,
timestamp=adjusted_time,
context=anchor_text[:50],
confidence=confidence,
)
)
if not is_borrowed:
last_idx = match_end_idx
if last_idx > 0 and last_idx <= len(transcription):
last_end_time = transcription[last_idx - 1].end
else:
last_end_time = transcription[-1].end if transcription else 0.0
else:
timings.append(
MarkerTiming(
marker_id=marker_id,
timestamp=-1.0,
context=following_text[:50],
context=anchor_text[:50],
confidence=0.0,
)
)
@@ -696,6 +743,16 @@ def build_render_plan(
# Save narration end time (before outro)
narration_end_time = total_duration
# Resolve any outro videos missing from videos.json via shared_assets.
if config.outro:
missing_outro_ids = [vid_id for vid_id in config.outro if vid_id not in videos]
if missing_outro_ids:
found = resolve_missing_videos(missing_outro_ids, project_path, config)
videos.update(found)
still_missing = [vid_id for vid_id in config.outro if vid_id not in videos]
for vid_id in still_missing:
print(f" WARNING: outro video '{vid_id}' not found in videos.json or shared_assets — skipped", flush=True)
# Build outro events (plays after narration ends)
outro_events = _extract_outro_events(
config.outro,