Source code for camtasia.audiate.transcript
"""Word-level transcript with timestamps, parsed from Audiate keyframes or WhisperX."""
from __future__ import annotations
import re
import json
from dataclasses import dataclass
from camtasia.timing import EDIT_RATE
[docs]
@dataclass
class Word:
    """One transcribed word together with its timing.

    Attributes:
        text: The word as it was transcribed.
        start: Time at which the word begins, in seconds.
        end: Time at which the word finishes, in seconds; ``None`` when
            no end time is available.
        word_id: Unique identifier for this word.
    """

    text: str
    start: float
    end: float | None
    word_id: str
[docs]
class Transcript:
    """Word-level transcript with search and range queries.

    Args:
        words: List of Word objects comprising the transcript. The list is
            stored as-is (not copied).
    """

    def __init__(self, words: list[Word]) -> None:
        self._words = words

    @property
    def words(self) -> list[Word]:
        """All words in the transcript (the underlying list, not a copy)."""
        return self._words

    @property
    def full_text(self) -> str:
        """All word texts joined by single spaces."""
        return " ".join(w.text for w in self._words)

    @property
    def duration(self) -> float:
        """End time of the last word in the list, in seconds.

        Falls back to that word's start time when its end is ``None`` and
        returns ``0.0`` for an empty transcript.
        """
        if not self._words:
            return 0.0
        last = self._words[-1]
        return last.end if last.end is not None else last.start

    def find_phrase(self, phrase: str) -> Word | None:
        """Find the first word at which a phrase begins.

        Matching is case-insensitive and ignores punctuation on *every*
        word of both the phrase and the transcript, so ``"hello world"``
        matches the transcript words ``"Hello,"`` ``"world."``.

        Args:
            phrase: Whitespace-separated phrase to search for.

        Returns:
            The Word where the first occurrence of the phrase begins, or
            None if the phrase is empty or does not occur.
        """

        def _normalize(s: str) -> str:
            # Lower-case, drop everything that is neither a word character
            # nor whitespace, then trim surrounding whitespace.
            return re.sub(r"[^\w\s]", "", s.lower()).strip()

        targets = [_normalize(token) for token in phrase.split()]
        if not targets:
            return None

        # Normalize the transcript once so each candidate window is a
        # plain list comparison.
        normalized = [_normalize(w.text) for w in self._words]
        span = len(targets)
        for i in range(len(normalized) - span + 1):
            if normalized[i:i + span] == targets:
                return self._words[i]
        return None

    def words_in_range(self, start_seconds: float, end_seconds: float) -> list[Word]:
        """Return words whose start time falls within [start, end].

        Both bounds are inclusive; only each word's ``start`` is examined.

        Args:
            start_seconds: Range start in seconds.
            end_seconds: Range end in seconds.

        Returns:
            List of words in the time range, in transcript order.
        """
        return [w for w in self._words if start_seconds <= w.start <= end_seconds]

    @classmethod
    def from_audiate_keyframes(cls, keyframes: list[dict]) -> Transcript:
        """Parse Audiate transcription keyframes into a Transcript.

        Each keyframe has a ``time`` in editRate ticks and a JSON-encoded
        ``value`` containing ``id`` and ``text`` fields.

        Args:
            keyframes: Raw keyframe dicts from
                ``tracks[0].parameters.transcription.keyframes``.

        Returns:
            A Transcript instance. Each word ends where the next keyframe
            starts; the final word has ``end=None``.
        """
        words: list[Word] = []
        last_index = len(keyframes) - 1
        for i, kf in enumerate(keyframes):
            parsed = json.loads(kf["value"])
            # Keyframes carry no explicit end, so use the next keyframe's
            # time as this word's end when there is one.
            end = keyframes[i + 1]["time"] / EDIT_RATE if i < last_index else None
            words.append(Word(
                text=parsed["text"],
                start=kf["time"] / EDIT_RATE,
                end=end,
                word_id=parsed["id"],
            ))
        return cls(words)

    @classmethod
    def from_whisperx_result(cls, result: dict) -> Transcript:
        """Parse a WhisperX alignment result into a Transcript.

        Expected format::

            result['segments'][*]['words'][*] = {
                'word': str, 'start': float, 'end': float
            }

        A word without ``start`` gets ``0.0``; a word without ``end`` gets
        ``None``. Word ids are ``"wx-<n>"`` where ``n`` is the word's
        running index across all segments.

        Args:
            result: WhisperX result dict with ``segments``.

        Returns:
            A Transcript instance.
        """
        words: list[Word] = []
        for seg in result.get("segments", []):
            for w in seg.get("words", []):
                words.append(Word(
                    text=w["word"],
                    start=w.get("start", 0.0),
                    end=w.get("end"),
                    # len(words) is the global index of the word being added.
                    word_id=f"wx-{len(words)}",
                ))
        return cls(words)