# gnommo/gnommo/parser.py

"""Extract stage: parse all input files."""

import json
import re
from pathlib import Path
from typing import Any, Optional

from .cache import resolve_with_cache
from .errors import ParseError
from .models import (
    Attribution,
    AudioDefinition,
    Citation,
    CutoutDefinition,
    ProjectConfig,
    SlideDefinition,
    VideoMetadata,
    VideoSource,
)


def _read_json(path: Path) -> Any:
    """Load JSON from ``path``; an empty or whitespace-only file yields ``{}``."""
    content = path.read_text(encoding="utf-8").strip()
    if not content:
        # Nothing to decode: treat a blank file like an empty JSON object.
        return {}
    return json.loads(content)


def parse_manuscript(
    project_path: Path,
) -> tuple[str, list[str], list[tuple[int, str]], list[Citation]]:
    """
    Read manuscript.txt and pull out its text, markers, and citations.

    [cite:...], [marker:...] and [cue:...] tags are removed from the returned
    text so they never end up in alignment contexts. Citations are collected
    (via parse_citations) before removal; marker/cue tags are personal
    recording notes and are dropped without being returned.

    Returns:
        Tuple of (stripped text, marker IDs found in the stripped text,
        malformed markers as (line number, matched text), citations).
        Line numbers are 1-based and refer to the stripped text.

    Raises:
        ParseError: if manuscript.txt does not exist in ``project_path``.
    """
    manuscript_path = project_path / "manuscript.txt"
    if not manuscript_path.exists():
        raise ParseError("manuscript.txt not found", manuscript_path)

    raw_text = manuscript_path.read_text(encoding="utf-8")

    # Citations have to be read from the raw text, before their tags are removed.
    citations = parse_citations(raw_text)

    # Remove the non-slide tags one kind at a time, in this fixed order.
    text = raw_text
    for strip_pattern in (
        r"\[cite:[^\]]+\]",
        r"\[marker:[^\]]+\]",
        r"\[cue:[^\]]+\]",
    ):
        text = re.sub(strip_pattern, "", text)

    # Well-formed markers such as [S1], [video:demo], [Zoom2]. The "." is allowed
    # so that markers carrying a file extension are still reported to the validator.
    markers = re.findall(r"\[([A-Za-z0-9_:.]+)\]", text)

    # "[S<digits>" that may or may not be closed; closure is checked per match below.
    unclosed_re = re.compile(r"\[S\d+")
    # Slide markers with stray whitespace: [ S1], [S1 ] or [S 1].
    spaced_re = re.compile(r"\[\s+S\d+\s*\]|\[S\d+\s+\]|\[S\s+\d+\]")

    malformed: list[tuple[int, str]] = []
    for line_num, line in enumerate(text.split("\n"), start=1):
        for hit in unclosed_re.finditer(line):
            # The character right after the digits must be "]"; at end of line
            # the slice is empty, which also counts as unclosed.
            if line[hit.end() : hit.end() + 1] != "]":
                malformed.append((line_num, hit.group()))
        malformed.extend((line_num, bad) for bad in spaced_re.findall(line))

    return text, markers, malformed, citations


def parse_citations(manuscript_text: str) -> list[Citation]:
    """
    Collect every [cite:...] marker found in the manuscript text.

    The text after 'cite:' (with surrounding whitespace trimmed) is the
    literal reference that should appear in the video description.

    Returns:
        List of Citation objects, in order of appearance, each with the
        reference, a 'cite:<reference>' marker ID, and a short context
        snippet taken from the text that follows the marker.
    """
    # Anything except "]" is accepted inside the marker, so multi-word
    # references work.
    cite_re = re.compile(r"\[cite:([^\]]+)\]")
    # Context runs up to the next "[", a blank line, or the end of the window.
    context_re = re.compile(r"([^\[]*?)(?:\[|\n\n|$)")

    found: list[Citation] = []
    for hit in cite_re.finditer(manuscript_text):
        reference = hit.group(1).strip()

        # Look at no more than 150 characters following the marker.
        tail_start = hit.end()
        window = manuscript_text[tail_start : tail_start + 150]

        lead = context_re.match(window)
        context = lead.group(1).strip() if lead else ""

        # Keep the snippet short for display: at most 50 characters,
        # ending in "..." when it had to be cut.
        if len(context) > 50:
            context = context[:47] + "..."

        found.append(
            Citation(
                reference=reference,
                marker_id=f"cite:{reference}",
                context=context,
            )
        )

    return found


def save_citations(citations: list[Citation], path: Path) -> None:
    """Write each citation's reference and context to ``path`` as a JSON array."""
    records = [
        {"reference": entry.reference, "context": entry.context}
        for entry in citations
    ]
    serialized = json.dumps(records, indent=2)
    path.write_text(serialized, encoding="utf-8")


def load_citations(path: Path) -> list[Citation]:
    """
    Read citations back from a JSON file.

    Returns an empty list when ``path`` does not exist. Each entry must carry
    a "reference" key; "context" is optional and defaults to "".
    """
    if not path.exists():
        return []

    loaded: list[Citation] = []
    for entry in _read_json(path):
        loaded.append(
            Citation(
                reference=entry["reference"],
                # The marker ID is not stored in the file; rebuild it from the reference.
                marker_id=f"cite:{entry['reference']}",
                context=entry.get("context", ""),
            )
        )
    return loaded


def parse_project_config(project_path: Path) -> ProjectConfig:
    """
    Load project.json from the project directory and build a ProjectConfig.

    Optional keys that are absent fall back to the defaults written inline
    below.

    Raises:
        ParseError: if project.json is missing, is not valid JSON, or its
            "resolution" is not a two-element list.
    """
    config_path = project_path / "project.json"
    if not config_path.exists():
        raise ParseError("project.json not found", config_path)

    try:
        data = _read_json(config_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", config_path)

    # Cutouts are named zones for video placement. Each dimension is parsed
    # into a (pixels, percent) pair by _parse_dimension.
    cutouts: dict[str, CutoutDefinition] = {}
    for cutout_name, spec in data.get("cutouts", {}).items():
        x, x_pct = _parse_dimension(spec.get("x", 0))
        y, y_pct = _parse_dimension(spec.get("y", 0))
        raw_height = spec.get("height", 200)
        height, height_pct = _parse_dimension(raw_height)
        # An omitted width mirrors the height, giving a square cutout.
        width, width_pct = _parse_dimension(spec.get("width", raw_height))
        cutouts[cutout_name] = CutoutDefinition(
            x=x,
            y=y,
            height=height,
            width=width,
            x_percent=x_pct,
            y_percent=y_pct,
            height_percent=height_pct,
            width_percent=width_pct,
        )

    resolution = data.get("resolution", [1920, 1080])
    if not (isinstance(resolution, list) and len(resolution) == 2):
        raise ParseError("resolution must be [width, height]", config_path)

    return ProjectConfig(
        resolution=tuple(resolution),
        fps=data.get("fps", 30),
        default_slide_type=data.get("defaultSlideType", "square"),
        cutouts=cutouts,
        # Named filter presets that videos/narration can reference by name.
        default_filters=data.get("default_filters", {}),
        background=data.get("background", ""),
        background_video=data.get("background_video", ""),  # Deprecated
        slides_path=data.get("slides", "slides.json"),
        videos_path=data.get("videos", "videos.json"),
        audio_path=data.get("audio", "audio.json"),
        audio_source=data.get("audio_source"),
        main_video=data.get("main_video"),
        gnommo_scratch=data.get("gnommo_scratch"),
        default_begin=float(data.get("default_begin", 0.0)),
        default_end_trim=float(data.get("default_end_trim", 0.0)),
        outro=data.get("outro", []),
        description=data.get("description", ""),
        footer=data.get("footer", ""),
        output_video=data.get("output_video", ""),
    )


def _parse_dimension(value: Any) -> tuple[int, float]:
    """
    Parse a dimension value into a (pixels, percentage) pair.

    Accepted inputs:
    - int: used as a pixel count.
    - float: rounded to the nearest pixel (JSON numbers such as 10.0 decode
      to float and previously fell through to the default).
    - str: '50%' style strings become a percentage fraction; any other
      string is parsed as an integer pixel count. Surrounding whitespace
      is ignored.
    - anything else (e.g. None): the default of 200 pixels.

    Returns:
        Tuple of (pixels, percentage). If pixels is -1, use percentage
        (expressed as a fraction, so '50%' gives 0.5).

    Raises:
        ValueError: if a string is neither an integer nor a '<number>%'
            value, or a float is NaN.
        OverflowError: if a float is infinite.
    """
    if isinstance(value, int):
        return value, 0.0
    if isinstance(value, float):
        return round(value), 0.0
    if isinstance(value, str):
        text = value.strip()
        if text.endswith("%"):
            # -1 signals "no pixel value; use the percentage instead".
            return -1, float(text[:-1]) / 100.0
        return int(text), 0.0
    return 200, 0.0  # default


def parse_slides(
    project_path: Path, config: Optional[ProjectConfig] = None
) -> dict[str, SlideDefinition]:
    """
    Parse the slides JSON file into slide definitions.

    The file is ``config.slides_path`` relative to the project directory when
    a config with a non-empty slides_path is given, otherwise ``slides.json``.
    The path actually read is whatever resolve_with_cache returns for it.

    Returns:
        Mapping of slide ID to SlideDefinition.

    Raises:
        ParseError: if the file is missing, is not valid JSON, or a slide
            entry is not an object or lacks the required 'image' field.
    """
    if config and config.slides_path:
        local_slides_path = project_path / config.slides_path
    else:
        local_slides_path = project_path / "slides.json"

    # Try cache fallback for reading JSON
    slides_path, _ = resolve_with_cache(local_slides_path, project_path)
    if not slides_path.exists():
        raise ParseError(f"slides file not found: {local_slides_path}", local_slides_path)

    try:
        data = _read_json(slides_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", slides_path) from e

    slides: dict[str, SlideDefinition] = {}
    for slide_id, slide_data in data.items():
        # A bare string or list here would otherwise surface as a confusing
        # TypeError (or a misleading "missing image" error) further down.
        if not isinstance(slide_data, dict):
            raise ParseError(
                f"Slide '{slide_id}' must be a JSON object with an 'image' field",
                slides_path,
            )
        if "image" not in slide_data:
            raise ParseError(
                f"Slide '{slide_id}' missing required field 'image'", slides_path
            )
        # NOTE(review): config.default_slide_type is not consulted here; the
        # fallback type is always "square" - confirm that this is intended.
        slides[slide_id] = SlideDefinition(
            image=slide_data["image"],
            type=slide_data.get("type", "square"),
        )

    return slides


def parse_audio(
    project_path: Path, config: Optional[ProjectConfig] = None
) -> tuple[dict[str, AudioDefinition], Path]:
    """
    Load the audio JSON file and build an AudioDefinition for every entry.

    The file is ``config.audio_path`` relative to the project directory when
    a config with a non-empty audio_path is given, otherwise ``audio.json``.
    Audio is optional: a missing file yields an empty dict rather than an
    error.

    Returns:
        Tuple of (audio dict, audio_dir) where audio_dir is the directory of
        the local audio JSON path (for resolving relative file paths).

    Raises:
        ParseError: if the file is not valid JSON or an entry lacks 'file'.
    """
    audio_file = config.audio_path if config and config.audio_path else "audio.json"
    local_audio_path = project_path / audio_file

    # File lookups stay relative to the local directory, whatever path
    # resolve_with_cache hands back for the JSON itself.
    audio_dir = local_audio_path.parent

    audio_path, _ = resolve_with_cache(local_audio_path, project_path)
    if not audio_path.exists():
        return {}, audio_dir

    try:
        data = _read_json(audio_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", audio_path)

    audio = {}
    for audio_id, audio_data in data.items():
        if "file" not in audio_data:
            raise ParseError(
                f"Audio '{audio_id}' missing required field 'file'", audio_path
            )

        # "overlap" is a timestamp string like "10s"; absent or empty means None.
        has_overlap = "overlap" in audio_data and audio_data["overlap"]
        overlap = parse_timestamp(audio_data["overlap"]) if has_overlap else None

        audio[audio_id] = AudioDefinition(
            file=audio_data["file"],
            volume=float(audio_data.get("volume", 1.0)),
            loop=bool(audio_data.get("loop", False)),
            overlap=overlap,
            ignore_pauses=bool(audio_data.get("ignore_pauses", False)),
        )

    return audio, audio_dir


def parse_timestamp(value: Any) -> float:
    """
    Parse a timestamp into seconds.

    Supported formats:
    - "3.5s" or "3.5" → 3.5 seconds
    - "2:54" → 2 minutes 54 seconds (174.0)
    - "1:23:45" → 1 hour 23 minutes 45 seconds
    - "2:54.5" → 2 minutes 54.5 seconds
    - a bare number (int or float, as JSON may supply) → that many seconds

    Empty values (None, "", 0, whitespace-only strings) give 0.0.

    Returns:
        Time in seconds as a float.

    Raises:
        ParseError: if a colon-separated value has more than three parts.
        ValueError: if a component is not a valid number.
    """
    if not value:
        return 0.0

    # JSON authors may write numbers directly (e.g. "begin": 3.5); these have
    # no .strip() and are already in seconds.
    if isinstance(value, (int, float)):
        return float(value)

    value = value.strip()
    if not value:
        # Whitespace-only input is treated the same as the empty string.
        return 0.0

    # Remove trailing 's' if present (e.g., "3.5s")
    if value.endswith("s"):
        value = value[:-1]

    # Check for colon-separated format (MM:SS or HH:MM:SS)
    if ":" in value:
        parts = value.split(":")
        if len(parts) == 2:
            minutes, seconds = parts
            return float(minutes) * 60 + float(seconds)
        if len(parts) == 3:
            hours, minutes, seconds = parts
            return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
        raise ParseError(f"Invalid timestamp format: {value}", None)

    # Plain number (seconds)
    return float(value)


def parse_videos(
    project_path: Path, config: Optional[ProjectConfig] = None
) -> tuple[dict[str, VideoSource], Path]:
    """
    Load the videos JSON file and build a VideoSource for every entry.

    The file is ``config.videos_path`` relative to the project directory when
    a config with a non-empty videos_path is given, otherwise ``videos.json``.

    A video's "filter" is either:
    - a list of filter configs (inline definition), or
    - a string naming a preset in config.default_filters.

    Trim points are either:
    - skip/take: raw values in seconds, or
    - begin/end: timestamp strings like "3.5s", "2:54", "1:23:45", which are
      converted to skip/take (take = end - skip) and win over skip/take.

    Returns:
        Tuple of (videos dict, videos_dir) where videos_dir is the directory
        of the local videos JSON path (for resolving relative file paths).

    Raises:
        ParseError: if the file is missing or not valid JSON, an entry lacks
            'source_file', or an entry names an unknown filter preset.
    """
    videos_file = config.videos_path if config and config.videos_path else "videos.json"
    local_videos_path = project_path / videos_file

    # File lookups stay relative to the local directory, whatever path
    # resolve_with_cache hands back for the JSON itself.
    videos_dir = local_videos_path.parent

    videos_path, _ = resolve_with_cache(local_videos_path, project_path)
    if not videos_path.exists():
        raise ParseError(f"videos.json not found: {local_videos_path}", local_videos_path)

    try:
        data = _read_json(videos_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", videos_path)

    # Presets that string-valued "filter" fields are looked up in.
    default_filters = config.default_filters if config else {}

    videos = {}
    for video_id, video_data in data.items():
        if "source_file" not in video_data:
            raise ParseError(
                f"Video '{video_id}' missing required field 'source_file'", videos_path
            )

        attribution = None
        if "attribution" in video_data:
            credit = video_data["attribution"]
            attribution = Attribution(
                source=credit.get("source", "unknown"),
                creator=credit.get("creator", "Unknown"),
                url=credit.get("url"),
            )

        # A string is a preset name; anything else is taken as the inline list.
        filter_value = video_data.get("filter", [])
        is_preset_name = isinstance(filter_value, str)
        if is_preset_name and filter_value not in default_filters:
            raise ParseError(
                f"Video '{video_id}' references unknown filter preset '{filter_value}'. "
                f"Available presets: {list(default_filters.keys())}",
                videos_path,
            )
        filter_list = default_filters[filter_value] if is_preset_name else filter_value

        # A non-empty begin/end overrides skip/take; "end" is an absolute
        # position, so the duration is measured from the effective skip.
        begin_raw = video_data.get("begin")
        end_raw = video_data.get("end")
        skip = parse_timestamp(begin_raw) if begin_raw else video_data.get("skip", 0.0)
        take = (parse_timestamp(end_raw) - skip) if end_raw else video_data.get("take")

        videos[video_id] = VideoSource(
            source_file=video_data["source_file"],
            filter=filter_list,
            output_file=video_data.get("output_file"),
            take=take,
            skip=skip,
            zoom=video_data.get("zoom", 1.0),
            cutout=video_data.get("cutout"),
            always_visible=video_data.get("always_visible", False),
            is_shared=video_data.get("is_shared", False),
            pause_narration=float(video_data.get("pause_narration", 0)),
            attribution=attribution,
            use_audio_channels=video_data.get("use_audio_channels", "both"),
            defer_loudnorm=video_data.get("defer_loudnorm", False),
            volume=float(video_data.get("volume", 1.0)),
        )

    return videos, videos_dir


def parse_narration(
    project_path: Path, config: Optional[ProjectConfig] = None
) -> tuple[dict[str, VideoSource], Path]:
    """
    Load media/narration/narration.json and build a VideoSource per segment.

    Narration is optional: a missing file yields an empty dict rather than an
    error.

    A segment's "filter" is either:
    - a list of filter configs (inline definition), or
    - a string naming a preset in config.default_filters.

    Trim points are either:
    - skip/take: raw values in seconds; a missing skip falls back to
      config.default_begin (0.0 without a config), or
    - begin/end: timestamp strings like "3.5s", "2:54", "1:23:45", which are
      converted to skip/take (take = end - skip) and win over skip/take.

    Returns:
        Tuple of (narration dict, narration_dir) where narration_dir is the
        local media/narration directory (for resolving relative file paths).

    Raises:
        ParseError: if the file is not valid JSON, a segment lacks
            'source_file', or a segment names an unknown filter preset.
    """
    # Narration always lives in media/narration/; file lookups stay relative
    # to this local directory, whatever resolve_with_cache returns for the JSON.
    narration_dir = project_path / "media" / "narration"
    narration_path, _ = resolve_with_cache(narration_dir / "narration.json", project_path)

    if not narration_path.exists():
        return {}, narration_dir

    try:
        data = _read_json(narration_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", narration_path)

    # Project-level settings used while building each segment.
    default_filters = config.default_filters if config else {}
    default_begin = config.default_begin if config else 0.0

    narration = {}
    for segment_id, segment_data in data.items():
        if "source_file" not in segment_data:
            raise ParseError(
                f"Narration segment '{segment_id}' missing required field 'source_file'",
                narration_path,
            )

        # A string is a preset name; anything else is taken as the inline list.
        filter_value = segment_data.get("filter", [])
        is_preset_name = isinstance(filter_value, str)
        if is_preset_name and filter_value not in default_filters:
            raise ParseError(
                f"Narration segment '{segment_id}' references unknown filter preset '{filter_value}'. "
                f"Available presets: {list(default_filters.keys())}",
                narration_path,
            )
        filter_list = default_filters[filter_value] if is_preset_name else filter_value

        # A non-empty begin/end overrides skip/take and the project default;
        # "end" is an absolute position, so duration is measured from skip.
        begin_raw = segment_data.get("begin")
        end_raw = segment_data.get("end")
        skip = (
            parse_timestamp(begin_raw)
            if begin_raw
            else segment_data.get("skip", default_begin)
        )
        take = (parse_timestamp(end_raw) - skip) if end_raw else segment_data.get("take")

        narration[segment_id] = VideoSource(
            source_file=segment_data["source_file"],
            filter=filter_list,
            output_file=segment_data.get("output_file"),
            take=take,
            skip=skip,
            zoom=segment_data.get("zoom", 1.0),
            cutout=segment_data.get("cutout"),
            always_visible=segment_data.get("always_visible", False),
            use_audio_channels=segment_data.get("use_audio_channels", "both"),
            defer_loudnorm=segment_data.get("defer_loudnorm", False),
            volume=float(segment_data.get("volume", 1.0)),
        )

    return narration, narration_dir


def get_video_duration(video_path: Path) -> float:
    """
    Get the duration of a video file, in seconds, using ffprobe.

    Runs ffprobe asking only for the container's ``format=duration`` value
    and parses the single value it prints.

    Raises:
        ParseError: if ffprobe exits with a non-zero status, or prints
            something that is not a number (for example empty output when
            it cannot determine a duration).
    """
    import subprocess

    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(video_path),
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        raise ParseError(f"Failed to get duration: {result.stderr}", video_path)

    raw_duration = result.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError as e:
        # A zero exit status does not guarantee a numeric value on stdout;
        # report it the same way as every other probing failure.
        raise ParseError(
            f"Failed to get duration: unexpected ffprobe output {raw_duration!r}",
            video_path,
        ) from e


def parse_video_metadata(metadata_path: Path) -> VideoMetadata:
    """
    Load a video metadata JSON file into a VideoMetadata.

    Expected format:
    {
        "source_file": "talking_head.mov",
        "preprocess": [
            {"type": "chroma_key", "color": [0, 255, 0], "similarity": 0.15}
        ],
        "output": {
            "file": "intermediate/talking_head_rgba.mov",
            "colorspace": "rgba",
            "alpha": "straight"
        }
    }

    Only "source_file" is required; "preprocess" defaults to an empty list
    and "output" to None.

    Raises:
        ParseError: if the file is missing, is not valid JSON, or has no
            'source_file' field.
    """
    if not metadata_path.exists():
        raise ParseError(f"Video metadata not found: {metadata_path}", metadata_path)

    try:
        data = _read_json(metadata_path)
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", metadata_path)

    has_source = "source_file" in data
    if not has_source:
        raise ParseError(
            "Video metadata missing required field 'source_file'", metadata_path
        )

    preprocess_steps = data.get("preprocess", [])
    output_spec = data.get("output")
    return VideoMetadata(
        source_file=data["source_file"],
        preprocess=preprocess_steps,
        output=output_spec,
    )


def resolve_video_file(
    project_path: Path, file_ref: str
) -> tuple[Path, Optional[VideoMetadata]]:
    """
    Turn a video file reference into the path that should actually be used.

    ``file_ref`` is relative to ``project_path`` and is either:
    1. A direct path to a video file, or
    2. A path to an existing metadata JSON file (name ends in ".json").

    For a metadata file, the output file it names is preferred when that file
    exists on disk; otherwise its source file is used. Both are resolved
    relative to the metadata file's directory.

    Returns:
        Tuple of (actual video path to use, metadata if JSON file was used,
        else None).
    """
    ref_path = project_path / file_ref

    # Anything that is not an existing .json file is a direct video reference.
    is_metadata_ref = file_ref.endswith(".json") and ref_path.exists()
    if not is_metadata_ref:
        return ref_path, None

    metadata = parse_video_metadata(ref_path)
    base_dir = ref_path.parent

    output_name = metadata.output.get("file") if metadata.output else None
    if output_name:
        candidate = base_dir / output_name
        if candidate.exists():
            return candidate, metadata

    # No usable output file: fall back to the source file.
    return base_dir / metadata.source_file, metadata