Source code for camtasia.audiate.transcript

"""Word-level transcript with timestamps, parsed from Audiate keyframes or WhisperX."""

from __future__ import annotations

import re

import json
from dataclasses import dataclass

from camtasia.timing import EDIT_RATE


@dataclass
class Word:
    """One transcribed word together with its timing information.

    Attributes:
        text: The word text.
        start: Start time in seconds.
        end: End time in seconds, or None if unavailable.
        word_id: Unique identifier for this word.
    """

    # Field order is part of the positional-construction interface — keep it.
    text: str
    start: float
    end: float | None
    word_id: str
class Transcript:
    """Word-level transcript with search and range queries.

    Args:
        words: List of Word objects comprising the transcript.
    """

    def __init__(self, words: list[Word]) -> None:
        self._words = words

    @property
    def words(self) -> list[Word]:
        """All words in the transcript."""
        return self._words

    @property
    def full_text(self) -> str:
        """All words joined by spaces."""
        return " ".join(w.text for w in self._words)

    @property
    def duration(self) -> float:
        """Time of the last word's end (or its start if end is None)."""
        if not self._words:
            return 0.0
        last = self._words[-1]
        return last.end if last.end is not None else last.start

    def find_phrase(self, phrase: str) -> Word | None:
        """Find the first word matching the start of a phrase.

        Matching is case-insensitive and ignores punctuation on both the
        transcript words and the query, so "hello world" matches the
        transcript words "Hello," "world!".

        Args:
            phrase: Phrase to search for (case-insensitive).

        Returns:
            The first Word where the phrase begins, or None.
        """
        def _normalize(s: str) -> str:
            # Drop everything except word characters and whitespace.
            return re.sub(r"[^\w\s]", "", s.lower()).strip()

        targets = [_normalize(tok) for tok in phrase.split()]
        if not targets:
            return None

        # BUG FIX: the original normalized only the FIRST phrase word;
        # subsequent words were compared against raw lowercased text, so
        # any punctuation on a later word ("world!") broke multi-word
        # matches. Normalize every word once, then slide a window.
        normalized = [_normalize(w.text) for w in self._words]
        span = len(targets)
        for i in range(len(normalized) - span + 1):
            if normalized[i:i + span] == targets:
                return self._words[i]
        return None

    def words_in_range(self, start_seconds: float, end_seconds: float) -> list[Word]:
        """Return words whose start time falls within [start, end].

        Args:
            start_seconds: Range start in seconds.
            end_seconds: Range end in seconds.

        Returns:
            List of words whose start lies in the range (inclusive ends).
        """
        return [w for w in self._words if start_seconds <= w.start <= end_seconds]

    @classmethod
    def from_audiate_keyframes(cls, keyframes: list[dict]) -> Transcript:
        """Parse Audiate transcription keyframes into a Transcript.

        Each keyframe has a ``time`` in editRate ticks and a JSON-encoded
        ``value`` containing ``id`` and ``text`` fields.

        Args:
            keyframes: Raw keyframe dicts from
                ``tracks[0].parameters.transcription.keyframes``.

        Returns:
            A Transcript instance.
        """
        words: list[Word] = []
        for i, kf in enumerate(keyframes):
            parsed = json.loads(kf["value"])
            start = kf["time"] / EDIT_RATE
            # Use the next keyframe's time as this word's end, if available;
            # the final word has no successor, so its end is unknown.
            end = keyframes[i + 1]["time"] / EDIT_RATE if i + 1 < len(keyframes) else None
            words.append(Word(
                text=parsed["text"],
                start=start,
                end=end,
                word_id=parsed["id"],
            ))
        return cls(words)

    @classmethod
    def from_whisperx_result(cls, result: dict) -> Transcript:
        """Parse a WhisperX alignment result into a Transcript.

        Expected format::

            result['segments'][*]['words'][*] = {
                'word': str, 'start': float, 'end': float
            }

        Missing ``start`` values default to 0.0 and missing ``end``
        values to None (WhisperX can omit timings for unaligned words —
        presumably numbers/symbols; verify against the aligner in use).

        Args:
            result: WhisperX result dict with ``segments``.

        Returns:
            A Transcript instance.
        """
        words: list[Word] = []
        for seg in result.get("segments", []):
            # (Original used enumerate() here but never read the index.)
            for w in seg.get("words", []):
                words.append(Word(
                    text=w["word"],
                    start=w.get("start", 0.0),
                    end=w.get("end"),
                    word_id=f"wx-{len(words)}",
                ))
        return cls(words)