# gnommo/gnommo/parser.py

"""Extract stage: parse all input files."""

import csv
import json
import re
from pathlib import Path
from typing import Any

from .errors import ParseError
from .models import (
    ProjectConfig,
    SlideDefinition,
    TalkingHeadConfig,
    TimedWord,
    VideoSource,
)


def parse_manuscript(project_path: Path) -> tuple[str, list[str], list[tuple[int, str]]]:
    """
    Parse manuscript.txt and extract text content and slide markers.

    A valid marker is a bracketed identifier made of letters, digits and
    underscores, e.g. ``[S1]`` or ``[S1_intro]``. Two kinds of malformed
    markers are reported, line by line (unclosed ones first within a line):

    - unclosed: ``[S`` plus digits that is neither completed into a valid
      marker nor closed after whitespace, e.g. ``[S1`` followed by text or
      the end of the line;
    - spaced: whitespace inside the brackets, e.g. ``[S 1]``, ``[S1 ]``
      or ``[ S1]``.

    Each malformed marker is reported exactly once.

    Args:
        project_path: Directory containing manuscript.txt.

    Returns:
        Tuple of (full text, list of marker IDs found, list of malformed markers as (line_num, text))

    Raises:
        ParseError: If manuscript.txt does not exist.
    """
    manuscript_path = project_path / "manuscript.txt"

    if not manuscript_path.exists():
        raise ParseError("manuscript.txt not found", manuscript_path)

    text = manuscript_path.read_text(encoding="utf-8")

    # Extract all valid slide markers like [S1], [S2], etc.
    markers = re.findall(r"\[([A-Za-z0-9_]+)\]", text)

    # Start of a potential marker: "[S" followed by (greedily all) digits.
    marker_prefix = re.compile(r"\[S\d+")
    # Continuations after the prefix that must NOT be reported as unclosed:
    # - more identifier characters then "]": a valid marker such as [S1a]
    # - whitespace then "]": already reported by the spaced pattern below
    completed = re.compile(r"[A-Za-z0-9_]*\]|\s+\]")
    # Markers with internal/leading/trailing spaces: [ S1], [S1 ], [S 1]
    spaced = re.compile(r"\[\s+S\d+\s*\]|\[S\d+\s+\]|\[S\s+\d+\]")

    malformed: list[tuple[int, str]] = []

    for line_num, line in enumerate(text.split("\n"), start=1):
        for match in marker_prefix.finditer(line):
            # Look at what follows the prefix on the same line.
            if completed.match(line, match.end()) is None:
                malformed.append((line_num, match.group()))

        malformed.extend((line_num, found) for found in spaced.findall(line))

    return text, markers, malformed


def parse_transcript(project_path: Path) -> list[TimedWord]:
    """
    Parse transcript.csv into a list of timed words.

    Expected format:
        t,word
        0.00,This
        0.42,is
        ...

    Args:
        project_path: Directory containing transcript.csv.

    Returns:
        One TimedWord per data row, in file order. Words are stripped of
        surrounding whitespace.

    Raises:
        ParseError: If the file is missing, the header lacks the ``t`` or
            ``word`` column, a row is missing one of those values, or a
            timestamp is not a valid float. Row errors carry the line number.
    """
    transcript_path = project_path / "transcript.csv"

    if not transcript_path.exists():
        raise ParseError("transcript.csv not found", transcript_path)

    timed_words: list[TimedWord] = []

    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with open(transcript_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)

        fieldnames = reader.fieldnames
        if fieldnames is None or "t" not in fieldnames or "word" not in fieldnames:
            raise ParseError(
                "transcript.csv must have columns: t, word",
                transcript_path
            )

        for row in reader:
            # DictReader skips blank lines, so counting rows would drift;
            # reader.line_num is the physical line just consumed.
            line_num = reader.line_num
            raw_time = row["t"]
            raw_word = row["word"]

            # Rows shorter than the header get None for the missing columns.
            if raw_time is None or raw_word is None:
                raise ParseError(
                    "Invalid row: missing value for 't' or 'word'",
                    transcript_path,
                    line_num
                )

            try:
                timestamp = float(raw_time)
            except ValueError as e:
                raise ParseError(
                    f"Invalid row: {e}",
                    transcript_path,
                    line_num
                ) from e

            timed_words.append(TimedWord(time=timestamp, word=raw_word.strip()))

    return timed_words


def parse_project_config(project_path: Path) -> ProjectConfig:
    """
    Parse project.json into ProjectConfig.

    Every key is optional; missing keys fall back to the defaults visible
    below (1920x1080 at 30 fps, talking head at (100, 100) with height 200,
    slide type "square", slides file "slides.json").

    Args:
        project_path: Directory containing project.json.

    Returns:
        The populated ProjectConfig.

    Raises:
        ParseError: If the file is missing, is not valid JSON, is not a JSON
            object, or has a malformed ``talkinghead``, ``targetheight`` or
            ``resolution`` entry.
    """
    config_path = project_path / "project.json"

    if not config_path.exists():
        raise ParseError("project.json not found", config_path)

    try:
        data = json.loads(config_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", config_path) from e

    # Valid JSON can still be a list/number/string; the .get() calls below
    # need an object.
    if not isinstance(data, dict):
        raise ParseError("project.json must contain a JSON object", config_path)

    # Parse talking head config
    th_data = data.get("talkinghead", {})
    if not isinstance(th_data, dict):
        raise ParseError("talkinghead must be a JSON object", config_path)

    try:
        th_height, th_height_pct = _parse_dimension(th_data.get("targetheight", 200))
    except (ValueError, OverflowError) as e:
        # e.g. "abc" or "12px": surface it as a config error, not a crash.
        raise ParseError(f"Invalid talkinghead.targetheight: {e}", config_path) from e

    talking_head = TalkingHeadConfig(
        x=th_data.get("x", 100),
        y=th_data.get("y", 100),
        target_height=th_height,
        target_height_percent=th_height_pct,
    )

    # Parse resolution
    resolution = data.get("resolution", [1920, 1080])
    if not isinstance(resolution, list) or len(resolution) != 2:
        raise ParseError("resolution must be [width, height]", config_path)

    return ProjectConfig(
        resolution=tuple(resolution),
        fps=data.get("fps", 30),
        talking_head=talking_head,
        default_slide_type=data.get("defaultSlideType", "square"),
        background_video=data.get("background_video", ""),
        slides_path=data.get("slides", "slides.json"),
        audio_source=data.get("audio_source"),
    )


def _parse_dimension(value: Any) -> tuple[int, float]:
    """
    Parse a dimension value (can be int or string like '100%').

    Returns:
        Tuple of (pixels, percentage). If pixels is -1, use percentage.
    """
    if isinstance(value, int):
        return value, 0.0
    if isinstance(value, str):
        if value.endswith("%"):
            pct = float(value[:-1]) / 100.0
            return -1, pct
        return int(value), 0.0
    return 200, 0.0  # default


def parse_slides(project_path: Path, config: ProjectConfig = None) -> dict[str, SlideDefinition]:
    """
    Parse the slides file into slide definitions.

    Args:
        project_path: Project directory.
        config: Optional project config. When given and ``slides_path`` is
            set, that path (relative to ``project_path``) is read; otherwise
            ``slides.json`` is used.

    Returns:
        Mapping of slide ID to SlideDefinition.

    Raises:
        ParseError: If the file is missing, is not valid JSON, is not a JSON
            object of objects, or a slide lacks the required ``image`` field.
    """
    if config and config.slides_path:
        slides_path = project_path / config.slides_path
    else:
        slides_path = project_path / "slides.json"

    if not slides_path.exists():
        raise ParseError(f"slides file not found: {slides_path}", slides_path)

    try:
        data = json.loads(slides_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", slides_path) from e

    # Valid JSON may still be a list or scalar; .items() needs an object.
    if not isinstance(data, dict):
        raise ParseError("slides file must contain a JSON object", slides_path)

    slides: dict[str, SlideDefinition] = {}
    for slide_id, slide_data in data.items():
        if not isinstance(slide_data, dict):
            raise ParseError(
                f"Slide '{slide_id}' must be a JSON object",
                slides_path
            )
        if "image" not in slide_data:
            raise ParseError(
                f"Slide '{slide_id}' missing required field 'image'",
                slides_path
            )
        # NOTE(review): the fallback type is hard-coded to "square" and does
        # not consult config.default_slide_type — confirm this is intended.
        slides[slide_id] = SlideDefinition(
            image=slide_data["image"],
            type=slide_data.get("type", "square"),
        )

    return slides


def parse_videos(project_path: Path) -> dict[str, VideoSource]:
    """
    Parse videos.json into video source definitions.

    Args:
        project_path: Directory containing videos.json.

    Returns:
        Mapping of video ID to VideoSource. ``preprocess`` defaults to an
        empty list when a video does not specify it.

    Raises:
        ParseError: If the file is missing, is not valid JSON, is not a JSON
            object of objects, or a video lacks the required ``file`` field.
    """
    videos_path = project_path / "videos.json"

    if not videos_path.exists():
        raise ParseError("videos.json not found", videos_path)

    try:
        data = json.loads(videos_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        raise ParseError(f"Invalid JSON: {e}", videos_path) from e

    # Valid JSON may still be a list or scalar; .items() needs an object.
    if not isinstance(data, dict):
        raise ParseError("videos.json must contain a JSON object", videos_path)

    videos: dict[str, VideoSource] = {}
    for video_id, video_data in data.items():
        if not isinstance(video_data, dict):
            raise ParseError(
                f"Video '{video_id}' must be a JSON object",
                videos_path
            )
        if "file" not in video_data:
            raise ParseError(
                f"Video '{video_id}' missing required field 'file'",
                videos_path
            )
        videos[video_id] = VideoSource(
            file=video_data["file"],
            preprocess=video_data.get("preprocess", []),
        )

    return videos


def get_video_duration(video_path: Path) -> float:
    """
    Get duration of a video file using ffprobe.

    Args:
        video_path: Path to the video file to probe.

    Returns:
        The container duration reported by ffprobe, as a float.

    Raises:
        ParseError: If ffprobe cannot be launched, exits with a non-zero
            status, or prints something that is not a number.
    """
    import subprocess

    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(video_path)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:
        # Typically FileNotFoundError when ffprobe is not installed / on PATH.
        raise ParseError(f"Failed to get duration: could not run ffprobe ({e})", video_path) from e

    if result.returncode != 0:
        raise ParseError(f"Failed to get duration: {result.stderr}", video_path)

    output = result.stdout.strip()
    try:
        return float(output)
    except ValueError as e:
        # ffprobe can succeed yet print nothing or "N/A" when no duration is known.
        raise ParseError(
            f"Failed to get duration: unexpected ffprobe output {output!r}",
            video_path
        ) from e