Source code for camtasia.operations.sync
"""Audio-video sync from transcript and timeline markers.
Implements the V3 labeled-markers workflow: given markers on a screen
recording and a word-level transcript, calculate per-segment speed
adjustments to align video with audio.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from fractions import Fraction
from typing import TYPE_CHECKING
from camtasia.timing import EDIT_RATE
from camtasia.audiate.transcript import Word
if TYPE_CHECKING:
from camtasia.timeline.clips.group import Group
[docs]
@dataclass
class SyncSegment:
    """One stretch of video between two sync points, paired with its audio span.

    Attributes:
        video_start_ticks: Where the segment begins on the video timeline,
            in ticks.
        video_end_ticks: Where the segment ends on the video timeline,
            in ticks.
        audio_start_seconds: Audio timestamp matched to the segment start,
            in seconds.
        audio_end_seconds: Audio timestamp matched to the segment end,
            in seconds.
        scalar: Camtasia scalar for the segment, i.e. the video duration
            divided by the audio duration, both measured in ticks.
    """

    # Field order defines the positional __init__ signature; keep it stable.
    video_start_ticks: int
    video_end_ticks: int
    audio_start_seconds: float
    audio_end_seconds: float
    scalar: Fraction
[docs]
def match_marker_to_transcript(
    label: str,
    words: list[Word],
) -> float | None:
    """Fuzzy-match a marker label to words in a transcript.

    Both the label and every transcript word are lower-cased and stripped
    of punctuation before comparison, so a label such as
    ``"Hello, world"`` matches the spoken words ``"Hello"`` ``"world!"``.
    The normalised label is first searched for as a phrase inside the
    space-joined transcript; if that fails, the first label token is
    looked up on its own.

    Matching is plain substring search, so a phrase may begin or end in
    the middle of a transcript word.

    Args:
        label: Marker label text (e.g. "Selecting a recent batch run").
        words: Transcript words, in spoken order. Each item must expose
            a ``text`` attribute (the word) and a ``start`` attribute
            (the value returned on a match).

    Returns:
        The ``start`` of the transcript word in which the first phrase
        match begins; otherwise the ``start`` of the first word that
        contains the first label token; ``None`` when nothing matches,
        when ``words`` is empty, or when the label has no word
        characters left after normalisation.
    """

    def _normalize(text: str) -> str:
        # One normalisation for label and transcript alike, so that
        # punctuation on either side cannot block a match.
        return re.sub(r"[^\w\s]", "", text.lower()).strip()

    label_tokens = _normalize(label).split()
    if not label_tokens or not words:
        # Also covers punctuation-only labels: an empty search target
        # would otherwise "match" at offset 0 of any transcript.
        return None

    # Build running text for substring search.
    texts = [_normalize(w.text) for w in words]
    full = " ".join(texts)
    target = " ".join(label_tokens)

    idx = full.find(target)
    if idx != -1:
        # Each word contributes exactly one joining space, so the number
        # of spaces before the hit is the index of the word it starts in.
        word_idx = full[:idx].count(" ")
        return words[min(word_idx, len(words) - 1)].start

    # Fallback: match the first word of the label on its own.
    first = label_tokens[0]
    for w, text in zip(words, texts):
        if first in text:
            return w.start
    return None
[docs]
def plan_sync(
    markers: list[tuple[str, int]],
    transcript_words: list[Word],
    edit_rate: int = EDIT_RATE,
) -> list[SyncSegment]:
    """Work out the speed scalar for each stretch between adjacent markers.

    Every marker label is looked up in the transcript with
    ``match_marker_to_transcript``; markers that cannot be located are
    dropped. The remaining markers are ordered by video position and
    each neighbouring pair becomes one segment whose scalar is the video
    duration divided by the audio duration, both in ticks.

    Args:
        markers: ``(label, video_time_ticks)`` pairs.
        transcript_words: Transcript words, passed through unchanged to
            ``match_marker_to_transcript``.
        edit_rate: Ticks per second used to convert the audio duration
            from seconds to ticks (defaults to ``EDIT_RATE``).

    Returns:
        One ``SyncSegment`` per usable pair of neighbouring markers. The
        list is empty when fewer than two markers are supplied or fewer
        than two can be matched; pairs whose video or audio duration is
        not positive are left out.
    """
    if len(markers) < 2:
        return []

    # Pair each marker's video position with its matched audio timestamp.
    anchors: list[tuple[int, float]] = []
    for label, video_ticks in markers:
        audio_time = match_marker_to_transcript(label, transcript_words)
        if audio_time is None:
            continue
        anchors.append((video_ticks, audio_time))

    if len(anchors) < 2:
        return []

    # Order by video position only; ties keep their original marker order.
    anchors = sorted(anchors, key=lambda anchor: anchor[0])

    plan: list[SyncSegment] = []
    for (v_start, a_start), (v_end, a_end) in zip(anchors, anchors[1:]):
        video_span = v_end - v_start
        audio_span = round(float(a_end - a_start) * edit_rate)
        # A zero or negative span cannot be expressed as a speed change.
        if video_span <= 0 or audio_span <= 0:
            continue
        plan.append(
            SyncSegment(
                video_start_ticks=v_start,
                video_end_ticks=v_end,
                audio_start_seconds=a_start,
                audio_end_seconds=a_end,
                scalar=Fraction(video_span, audio_span),
            )
        )
    return plan
[docs]
def apply_sync(
    group: 'Group',
    segments: list[SyncSegment],
) -> None:
    """Push planned sync segments onto a Group.

    Each ``SyncSegment`` is turned into the ``(source_start, source_end,
    timeline_duration)`` tuple that ``group.set_internal_segment_speeds``
    expects: the segment's audio start and end (seconds) fill the first
    two slots, and the video tick span converted with
    ``ticks_to_seconds`` fills the third.

    Args:
        group: Target whose ``set_internal_segment_speeds`` is called
            once with the full list of tuples.
        segments: Segments to apply, in the order they should be passed.
    """
    from camtasia.timing import ticks_to_seconds

    speed_specs = [
        (
            segment.audio_start_seconds,
            segment.audio_end_seconds,
            ticks_to_seconds(segment.video_end_ticks - segment.video_start_ticks),
        )
        for segment in segments
    ]
    group.set_internal_segment_speeds(speed_specs)