diff --git a/gnommo/cli.py b/gnommo/cli.py
index 56674b0..0521c3d 100644
--- a/gnommo/cli.py
+++ b/gnommo/cli.py
@@ -372,10 +372,11 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int:
     shared_assets_dir = _find_shared_assets(project_path)
     if shared_assets_dir:
         _import_shared_assets(shared_assets_dir, verbose)
+        _import_shared_audio(shared_assets_dir, project_path, config, verbose)
         _sync_shared_videos_to_local(project_path, config, shared_assets_dir, verbose)
 
     # Probe and cache audio file durations into audio.json
-    _probe_audio_durations(project_path, config, force, verbose)
+    _probe_audio_durations(project_path, config, force, verbose, shared_assets_dir)
 
     # Probe and cache video metadata (duration, has_audio) into videos.json
     _probe_video_metadata(project_path, config, shared_assets_dir, force, verbose)
@@ -384,8 +385,71 @@ def cmd_import(project_path: Path, force: bool, verbose: bool) -> int:
     return 0
 
 
+def _import_shared_audio(
+    shared_assets_dir: Path,
+    project_path: Path,
+    config,
+    verbose: bool,
+) -> None:
+    """Import audio files from shared_assets/media/audio into the project's audio.json.
+
+    Files whose stem is already a key in audio.json are skipped, so existing
+    entries are never modified; audio.json is rewritten only when one was added.
+    """
+    audio_extensions = {".mp3", ".wav", ".aac", ".m4a", ".ogg", ".flac"}
+    shared_audio_dir = shared_assets_dir / "media" / "audio"
+    if not shared_audio_dir.is_dir():
+        if verbose:
+            print(f"  No shared audio dir found at {shared_audio_dir}")
+        return
+
+    audio_files = sorted(
+        f
+        for f in shared_audio_dir.iterdir()
+        if f.is_file()
+        and f.suffix.lower() in audio_extensions
+        and not f.name.startswith(".")
+    )
+    if not audio_files:
+        if verbose:
+            print(f"  No audio files found in {shared_audio_dir}")
+        return
+
+    # Resolve project audio.json path (config override, else the default location)
+    configured = config.audio_path if config else None
+    audio_json_path = project_path / (configured or "media/audio/audio.json")
+    existing: dict = _read_json(audio_json_path) if audio_json_path.exists() else {}
+
+    added = 0
+    for f in audio_files:
+        audio_id = f.stem
+        if audio_id in existing:
+            if verbose:
+                print(f"    Skipping {audio_id} (already in audio.json)")
+            continue
+        existing[audio_id] = {"file": f.name, "is_shared": True, "volume": 1.0}
+        added += 1
+        if verbose:
+            print(f"    Added shared audio: {audio_id}")
+
+    if not added:
+        if verbose:
+            print("  No new shared audio files to add")
+        return
+
+    audio_json_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(audio_json_path, "w", encoding="utf-8") as fh:
+        json.dump(existing, fh, indent=2)
+    try:
+        shown = audio_json_path.relative_to(project_path)
+    except ValueError:  # audio_path may resolve outside project_path
+        shown = audio_json_path
+    print(f"  Updated {shown} (+{added} shared audio files)")
+
+
 def _probe_audio_durations(
-    project_path: Path, config, force: bool, verbose: bool
+    project_path: Path, config, force: bool, verbose: bool,
+    shared_assets_dir: Optional[Path] = None,
 ) -> None:
     """Probe and cache audio file durations into audio.json.
 
@@ -413,7 +477,10 @@ def _probe_audio_durations(
             if verbose:
                 print(f"  Audio '{audio_id}': cached ({audio_data['duration']:.1f}s)")
             continue
-        audio_path = audio_dir / audio_data["file"]
+        if audio_data.get("is_shared") and shared_assets_dir:
+            audio_path = shared_assets_dir / "media" / "audio" / audio_data["file"]
+        else:
+            audio_path = audio_dir / audio_data["file"]
         if not audio_path.exists():
             if verbose:
                 print(f"  Audio '{audio_id}': file not found, skipping")
@@ -1060,8 +1127,16 @@ _TASKS_VIDEO_PREFIXES = {
     "video:": 6,
     "vft:": 4,
     "vfb:": 4,
+    "vf2t:": 5,
+    "vf2b:": 5,
     "vst:": 4,
     "vsb:": 4,
+    "vftp:": 5,
+    "vfbp:": 5,
+    "vf2tp:": 6,
+    "vf2bp:": 6,
+    "vstp:": 5,
+    "vsbp:": 5,
     "narration:": 10,
 }
 
@@ -1993,6 +2068,14 @@ def cmd_stitch(
         print(f"\n  Combined narration exists: {stitch_output.name}")
         print("  (use --force to regenerate)")
     else:
+        # Extract loudnorm config from talkinghead filter so stitch uses
+        # per-project settings instead of hardcoded defaults.
+        _loudnorm_cfg = None
+        if config and config.default_filters:
+            for _f in (config.default_filters.get("talkinghead") or []):
+                if isinstance(_f, dict) and _f.get("type") == "audio_normalize":
+                    _loudnorm_cfg = _f
+                    break
         stitch_narration_segments(
             narration_dir,
             segment_ids,
@@ -2000,6 +2083,7 @@ def cmd_stitch(
             stitch_output,
             verbose=verbose,
             default_end_trim=config.default_end_trim if config else 0.0,
+            loudnorm_config=_loudnorm_cfg,
         )
         # Run import videos again, because at this point narration_combined might have been created.
         _import_videos(videos_dir, config, verbose)
@@ -2127,14 +2211,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None:
                 marker_id.startswith(p)
                 for p in (
                     "video:",
-                    "vft:",
-                    "vfb:",
-                    "vst:",
-                    "vsb:",
-                    "vft:",
-                    "vfbp:",
-                    "vstp:",
-                    "vsbp:",
+                    "vft:", "vfb:", "vf2t:", "vf2b:",
+                    "vst:", "vsb:",
+                    "vftp:", "vfbp:", "vf2tp:", "vf2bp:",
+                    "vstp:", "vsbp:",
                 )
             ):
                 aligned_count += 1
@@ -2142,14 +2222,10 @@ def _print_render_plan_details(plan, marker_timings, slides: dict) -> None:
                     len(p)
                     for p in (
                         "video:",
-                        "vft:",
-                        "vfb:",
-                        "vst:",
-                        "vsb:",
-                        "vft:",
-                        "vfbp:",
-                        "vstp:",
-                        "vsbp:",
+                        "vft:", "vfb:", "vf2t:", "vf2b:",
+                        "vst:", "vsb:",
+                        "vftp:", "vfbp:", "vf2tp:", "vf2bp:",
+                        "vstp:", "vsbp:",
                     )
                     if marker_id.startswith(p)
                 )
@@ -3066,6 +3142,11 @@ _RSYNC_EXCLUDES = [
     "media/narration/processed/",
     "media/narration/processed/**",
     "media/videos/narration_combined.mov",
+    # Low-res preview files (generated locally, not synced)
+    "media/narration/low/",
+    "media/narration/low/**",
+    "media/videos/low/",
+    "media/videos/low/**",
     # Chunk scratch directories
     "**/chunks/",
     "**/chunks/**",
diff --git a/gnommo/models.py b/gnommo/models.py
index f3f264a..c1da839 100644
--- a/gnommo/models.py
+++ b/gnommo/models.py
@@ -337,7 +337,7 @@ class SlideEvent:
 class AudioDefinition:
     """Definition of an audio clip from audio.json."""
 
-    file: str  # Audio filename (relative to audio.json location)
+    file: str  # Audio filename (relative to audio.json location, or to shared_assets/media/audio/ if is_shared)
     volume: float = 1.0  # Volume multiplier (0.0-1.0)
     loop: bool = False  # If True, loop for entire duration from trigger point
     overlap: Optional[float] = None  # Crossfade overlap in seconds when looping
@@ -345,6 +345,7 @@ class AudioDefinition:
         False  # If True, audio continues playing during narration pauses
     )
     duration: Optional[float] = None  # Pre-probed duration in seconds (set by import)
+    is_shared: bool = False  # If True, file is relative to shared_assets/media/audio/
 
 
 @dataclass
diff --git a/gnommo/parser.py b/gnommo/parser.py
index cc2a835..2ac4331 100644
--- a/gnommo/parser.py
+++ b/gnommo/parser.py
@@ -374,6 +374,7 @@ def parse_audio(
             overlap=overlap,
             ignore_pauses=bool(audio_data.get("ignore_pauses", False)),
             duration=float(raw_duration) if raw_duration is not None else None,
+            is_shared=bool(audio_data.get("is_shared", False)),
         )
 
     return audio, audio_dir
@@ -494,8 +495,8 @@ def parse_videos(
             filter_list = filter_value
 
         # Handle skip/take - can use begin/end as user-friendly alternatives
-        skip = video_data.get("skip", 0.0)
-        take = video_data.get("take")
+        skip = float(video_data.get("skip") or 0.0)
+        take = float(video_data["take"]) if video_data.get("take") not in (None, "") else None
 
         # Convert begin/end to skip/take if provided
         if "begin" in video_data and video_data["begin"]:
diff --git a/gnommo/preprocessor.py b/gnommo/preprocessor.py
index 544c5fe..a2fe31f 100644
--- a/gnommo/preprocessor.py
+++ b/gnommo/preprocessor.py
@@ -26,7 +26,7 @@ CHUNK_DURATION = 60
 
 # Resolution presets for preview/proxy workflow
 # Each entry: (width, height, subdir_name)
-RES_CONFIGS: dict[str, tuple[int, int, str] | None] = {
+RES_CONFIGS: dict[str, Optional[tuple]] = {
     "full": None,  # no downscale, no subdir
     "low": (490, 270, "low"),
     "tiny": (320, 180, "proxy"),  # "proxy" subdir kept for backward compat
@@ -120,8 +120,12 @@ def create_downscaled_video(
         "ultrafast",
         "-crf",
         "28",
+        "-vsync",
+        "cfr",
         "-c:a",
-        "copy",
+        "aac",      # re-encode audio so both streams share the same PTS origin,
+        "-ar",      # avoiding the lip-sync drift caused by libx264 encoder delay
+        "48000",    # when audio is copied with its original timestamps
         str(out_path),
     ]
     result = subprocess.run(cmd, capture_output=True, text=True)
@@ -2189,6 +2193,7 @@ def stitch_narration_segments(
     output_path: Path,
     verbose: bool = False,
     default_end_trim: float = 0.0,
+    loudnorm_config: Optional[dict] = None,
 ) -> Path:
     """
     Stitch multiple narration video segments into a single file.
@@ -2379,7 +2384,13 @@ def stitch_narration_segments(
             output_path.parent / f"{output_path.stem}_normalized{output_path.suffix}"
         )
 
-        # Use EBU R128 loudnorm targeting YouTube's recommended levels
+        # Build loudnorm filter string from project config (or fall back to defaults)
+        _cfg = loudnorm_config or {}
+        _lufs = float(_cfg.get("target_lufs", -14))
+        _lra  = float(_cfg.get("target_lra",  11))
+        _tp   = float(_cfg.get("target_tp",   -1.5))
+        loudnorm_filter = f"loudnorm=I={_lufs:.1f}:LRA={_lra:.1f}:TP={_tp:.1f}"
+
         loudnorm_cmd = [
             "ffmpeg",
             "-y",
@@ -2388,7 +2399,7 @@ def stitch_narration_segments(
             "-c:v",
             "copy",
             "-af",
-            "loudnorm=I=-14:LRA=11:TP=-1.5",
+            loudnorm_filter,
             "-c:a",
             "aac",
             "-b:a",
diff --git a/gnommo/renderer.py b/gnommo/renderer.py
index 02b0fba..3f7e76a 100644
--- a/gnommo/renderer.py
+++ b/gnommo/renderer.py
@@ -395,7 +395,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
         video_path = _resolve_video_path(
             videos_dir, event.video_source, shared_assets_dir, project_path
         )
-        skip = event.video_source.skip
+        skip = event.video_source.skip or 0.0
         if skip > 0:
             cmd.extend(["-ss", f"{skip:.3f}"])
         cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
@@ -425,7 +425,7 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
         video_path = _resolve_video_path(
             videos_dir, event.video_source, shared_assets_dir, project_path
         )
-        skip = event.video_source.skip
+        skip = event.video_source.skip or 0.0
         if skip > 0:
             cmd.extend(["-ss", f"{skip:.3f}"])
         cmd.extend(["-analyzeduration", "0", "-probesize", "1000"])
@@ -455,7 +455,10 @@ def build_ffmpeg_command(plan: RenderPlan, output_path: Path) -> list[str]:
 
     for event in plan.audio_events:
         if event.audio_id not in audio_inputs:
-            audio_path = audio_dir / event.audio_def.file
+            if event.audio_def.is_shared and plan.shared_assets_dir:
+                audio_path = plan.shared_assets_dir / "media" / "audio" / event.audio_def.file
+            else:
+                audio_path = audio_dir / event.audio_def.file
             audio_path, _ = resolve_with_cache(audio_path, project_path)
             # Use pre-probed duration from audio.json if available (set by import).
             # For MP3 without Xing/VBRI headers this is critical — FFmpeg otherwise
@@ -802,13 +805,14 @@ def build_filter_complex(
     """
     Build the filter_complex string for FFmpeg.
 
-    Layer structure:
+    Layer structure (bottom to top):
     - Layer 1: Background (solid color, image, or video)
-    - Layer 2: Always visible videos (like talking head) in cutouts
-    - Layer 3: Slides (with time-based enable)
-    - Layer 4: Triggered videos in cutouts (with time-based enable)
-    - Layer 5: Camera transform
-    - Layer 6: Outro videos (fullscreen, after narration ends)
+    - Layer 2: "below" triggered videos (vfb/vsb) — behind talking head
+    - Layer 3: Always visible videos (like talking head) in cutouts
+    - Layer 4: Slides (with time-based enable)
+    - Layer 5: "above" triggered videos (vft/vst) — in front of slides
+    - Layer 6: Camera transform
+    - Layer 7: Outro videos (fullscreen, after narration ends)
     - Audio: Main audio mixed with triggered sound effects and outro audio
     """
     outro_inputs = outro_inputs or {}
@@ -835,6 +839,44 @@ def build_filter_complex(
 
     current_label = "bg"
 
+    # Add "below" triggered video overlays (vfb/vsb) BEFORE the talking head
+    # so they sit behind it in the composite stack.
+    for i, event in enumerate(plan.video_events):
+        if event.layer != "below":
+            continue
+        video_idx = video_inputs[i]
+        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
+            event.cutout, width, height
+        )
+
+        duration = event.end_time - event.start_time
+        if event.video_source.take is not None:
+            duration = min(duration, event.video_source.take)
+        effective_end = event.start_time + duration
+
+        zoom = event.video_source.zoom
+        zoomed_width = int(cut_width * zoom)
+        zoomed_height = int(cut_height * zoom)
+
+        video_label = f"tvb{i}"
+        start_pts = event.start_time
+        filters.append(
+            f"[{video_idx}:v]format=yuva444p10le,"
+            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
+            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
+            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
+            f"format=rgba[{video_label}]"
+        )
+
+        next_label = f"tvbbase{i}"
+        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
+        filters.append(
+            f"[{current_label}][{video_label}]overlay="
+            f"x={cut_x}:y={cut_y}:enable={enable_expr}"
+            f"[{next_label}]"
+        )
+        current_label = next_label
+
     # Overlay always_visible videos (like talking head)
     # If there are narration pauses, we need to segment the video
     for i, (video_id, video_source, cutout) in enumerate(plan.narration_videos):
@@ -898,43 +940,6 @@ def build_filter_complex(
                 )
                 current_label = next_label
 
-    # Add "below-slides" triggered video overlays (vfb/vsb or layer="below")
-    for i, event in enumerate(plan.video_events):
-        if event.layer != "below":
-            continue
-        video_idx = video_inputs[i]
-        cut_x, cut_y, cut_width, cut_height = _calculate_cutout_position(
-            event.cutout, width, height
-        )
-
-        duration = event.end_time - event.start_time
-        if event.video_source.take is not None:
-            duration = min(duration, event.video_source.take)
-        effective_end = event.start_time + duration
-
-        zoom = event.video_source.zoom
-        zoomed_width = int(cut_width * zoom)
-        zoomed_height = int(cut_height * zoom)
-
-        video_label = f"tvb{i}"
-        start_pts = event.start_time
-        filters.append(
-            f"[{video_idx}:v]format=yuva444p10le,"
-            f"setpts=PTS-STARTPTS+{start_pts:.3f}/TB,"
-            f"scale={zoomed_width}:{zoomed_height}:force_original_aspect_ratio=increase,"
-            f"crop={cut_width}:{cut_height}:(iw-{cut_width})/2:(ih-{cut_height})/2,"
-            f"format=rgba[{video_label}]"
-        )
-
-        next_label = f"tvbbase{i}"
-        enable_expr = f"between(t\\,{event.start_time:.3f}\\,{effective_end:.3f})"
-        filters.append(
-            f"[{current_label}][{video_label}]overlay="
-            f"x={cut_x}:y={cut_y}:enable={enable_expr}"
-            f"[{next_label}]"
-        )
-        current_label = next_label
-
     # Add slide overlays with time-based enable
     for i, event in enumerate(plan.slide_events):
         slide_idx = slide_inputs[event.slide_id]
diff --git a/gnommo/transformer.py b/gnommo/transformer.py
index d80db6b..4634621 100644
--- a/gnommo/transformer.py
+++ b/gnommo/transformer.py
@@ -139,10 +139,14 @@ def _is_known_marker(
         "narration:",
         "vft:",
         "vfb:",
+        "vf2t:",
+        "vf2b:",
         "vst:",
         "vsb:",
         "vftp:",
         "vfbp:",
+        "vf2tp:",
+        "vf2bp:",
         "vstp:",
         "vsbp:",
     )
@@ -627,8 +631,8 @@ def build_render_plan(
     # Before extracting video events, resolve any referenced videos that are missing
     # from the project's videos.json by looking them up in shared_assets/videos.json.
     _VIDEO_MARKER_PREFIXES = (
-        "video:", "narration:", "vft:", "vfb:", "vst:", "vsb:",
-        "vftp:", "vfbp:", "vstp:", "vsbp:",
+        "video:", "narration:", "vft:", "vfb:", "vf2t:", "vf2b:", "vst:", "vsb:",
+        "vftp:", "vfbp:", "vf2tp:", "vf2bp:", "vstp:", "vsbp:",
     )
     missing_video_ids = [
         timing.marker_id[len(prefix):]
@@ -897,8 +901,8 @@ def _extract_slide_events(
 
     events: list[SlideEvent] = []
     for i, (marker_time, marker_id) in enumerate(resolved):
-        # Each slide starts at its own marker time
-        start_time = marker_time
+        # First slide always starts at 0 — it's the opening state of the presentation.
+        start_time = 0.0 if i == 0 else marker_time
 
         # End time is when the NEXT slide's marker appears, or end of video
         if i + 1 < len(resolved):
@@ -957,10 +961,14 @@ def _extract_video_events(
     _SHORTHAND: dict[str, tuple[str, str]] = {
         "vft:": ("fullscreen", "above"),
         "vfb:": ("fullscreen", "below"),
+        "vf2t:": ("fullscreen2", "above"),
+        "vf2b:": ("fullscreen2", "below"),
         "vst:": ("square", "above"),
         "vsb:": ("square", "below"),
         "vftp:": ("fullscreen", "above", "pause_narration"),
         "vfbp:": ("fullscreen", "below", "pause_narration"),
+        "vf2tp:": ("fullscreen2", "above", "pause_narration"),
+        "vf2bp:": ("fullscreen2", "below", "pause_narration"),
         "vstp:": ("square", "above", "pause_narration"),
         "vsbp:": ("square", "below", "pause_narration"),
     }
diff --git a/gnommo/validator.py b/gnommo/validator.py
index 7e30e8c..242fd67 100644
--- a/gnommo/validator.py
+++ b/gnommo/validator.py
@@ -66,8 +66,16 @@ def validate_project(
             "video:": 6,
             "vft:": 4,
             "vfb:": 4,
+            "vf2t:": 5,
+            "vf2b:": 5,
             "vst:": 4,
             "vsb:": 4,
+            "vftp:": 5,
+            "vfbp:": 5,
+            "vf2tp:": 6,
+            "vf2bp:": 6,
+            "vstp:": 5,
+            "vsbp:": 5,
         }
         matched_prefix = next(
             (p for p in _VIDEO_PREFIXES if marker.startswith(p)), None