Source code for sktalk.corpus.utterance

import re
import warnings
from dataclasses import asdict
from dataclasses import dataclass
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import Optional


@dataclass
class Utterance:
    """A single utterance with optional speaker and timing information.

    Attributes:
        utterance: The utterance text. ``__post_init__`` replaces it with
            its cleaned form (see ``_clean_utterance``).
        participant: Identifier of the speaker; compared by
            ``same_speaker``.
        time: Two-element list ``[begin, end]``. The values are treated as
            milliseconds when timestamps are derived. Reset to ``None``
            when it fails ``_validate_time``.
        begin: Start time; filled from ``time[0]`` when missing.
        begin_timestamp: ``HH:MM:SS.mmm`` rendering of ``begin``.
        end: End time; filled from ``time[1]`` when missing.
        end_timestamp: ``HH:MM:SS.mmm`` rendering of ``end``.
        utterance_raw: The text as supplied, before cleaning. A value
            passed in explicitly is left untouched.
        utterance_list: The cleaned utterance split on whitespace.
        n_words: Number of items in ``utterance_list``.
        n_characters: Total number of characters over ``utterance_list``
            (whitespace excluded).
        FTO: Not computed by this class.
            NOTE(review): presumably the floor transfer offset, assigned
            by calling code -- confirm.
        metadata: Free-form extra fields; not interpreted by this class.
    """

    utterance: str
    participant: Optional[str] = None
    time: Optional[list] = None
    begin: Optional[int] = None
    begin_timestamp: Optional[str] = None
    end: Optional[int] = None
    end_timestamp: Optional[str] = None
    utterance_raw: Optional[str] = None
    utterance_list: Optional[list[str]] = None
    n_words: Optional[int] = None
    n_characters: Optional[int] = None
    FTO: Optional[int] = None
    metadata: Optional[dict[str, Any]] = None

    def __post_init__(self):
        """Clean the text, derive word statistics and normalise timing."""
        if self.utterance_raw is None:
            # When reading in existing data the raw utterance is already
            # present and must not be overwritten.
            self.utterance_raw = self.utterance
        self.utterance = self._clean_utterance(self.utterance)
        self.utterance_list = self.utterance.split()
        self.n_words = len(self.utterance_list)
        self.n_characters = sum(len(word) for word in self.utterance_list)

        self._validate_time()
        # Truthiness test: a begin/end of 0 counts as "missing" as well and
        # is re-derived from ``time``.
        if (not self.begin or not self.end) and self.time:
            self.begin, self.end = self.time[0], self.time[1]
            self.begin_timestamp = self._to_timestamp(self.begin)
            self.end_timestamp = self._to_timestamp(self.end)

    def get_audio(self):
        """Placeholder; currently does nothing and returns ``None``."""

    def asdict(self) -> dict:
        """Return the fields of this utterance as a dictionary."""
        # Inside a method body the bare name resolves to the module-level
        # ``dataclasses.asdict``, not to this method.
        return asdict(self)

    @classmethod
    def _fromdict(cls, fields):
        """Build an instance from a mapping of field names to values.

        Uses ``cls`` so that a subclass gets an instance of itself back.
        """
        return cls(**fields)

    def until(self, other: "Utterance") -> Optional[float]:
        """Return the time from the end of this utterance to the start of *other*.

        The result is negative when *other* starts before this utterance
        ends. Returns ``None`` when either utterance has no valid time,
        matching the other timing methods of this class.
        """
        if not self.time or not other.time:
            return None
        return other.time[0] - self.time[1]

    def overlap(self, other: "Utterance") -> Optional[bool]:
        """Return whether this utterance overlaps *other* in time.

        See ``window_overlap`` for the exact semantics.
        """
        return self.window_overlap(other.time)

    def window_overlap(self, time: Optional[list]) -> Optional[bool]:
        """Return whether this utterance overlaps the window *time*.

        Args:
            time: ``[begin, end]`` of the window.

        Returns:
            ``None`` when this utterance or the window has no time;
            otherwise ``True`` if the two closed intervals intersect
            (windows that merely touch count as overlapping).
        """
        if not self.time or not time:
            return None
        return self.time[1] >= time[0] and self.time[0] <= time[1]

    def overlap_duration(self, other: "Utterance") -> Optional[float]:
        """Return for how long this utterance overlaps *other*.

        See ``window_overlap_duration`` for the exact semantics.
        """
        return self.window_overlap_duration(other.time)

    def window_overlap_duration(self, time: Optional[list]) -> Optional[float]:
        """Return the length of the overlap with the window *time*.

        Returns:
            ``None`` when timing is missing, ``0`` when there is no
            overlap, otherwise the length of the shared interval.
        """
        overlap = self.window_overlap(time)
        if overlap is None:
            return None
        if not overlap:
            return 0
        return min(self.time[1], time[1]) - max(self.time[0], time[0])

    def overlap_percentage(self, other: "Utterance") -> Optional[float]:
        """Return the share of this utterance that overlaps *other*, in percent.

        See ``window_overlap_percentage`` for the exact semantics.
        """
        return self.window_overlap_percentage(other.time)

    def window_overlap_percentage(self, time: Optional[list]) -> Optional[float]:
        """Return the share of this utterance covered by the window *time*.

        Returns:
            ``None`` when timing is missing, ``0`` when the overlap has no
            duration, otherwise the overlap duration as a percentage of
            this utterance's own duration.
        """
        overlap_duration = self.window_overlap_duration(time)
        if not overlap_duration:
            # Either None (no timing) or 0 (no overlap); pass it through.
            return overlap_duration
        utterance_duration = self.time[1] - self.time[0]
        return overlap_duration / utterance_duration * 100

    def same_speaker(self, other: "Utterance") -> Optional[bool]:
        """Return whether both utterances have the same participant.

        Returns ``None`` when either participant is missing or empty.
        """
        if not self.participant or not other.participant:
            return None
        return self.participant == other.participant

    def precede_with_buffer(self, other: "Utterance",
                            planning_buffer=200) -> Optional[bool]:
        """Return whether this utterance starts late enough after *other* starts.

        Args:
            other: The utterance to compare against.
            planning_buffer: Minimum distance between the two start times,
                in the same unit as ``time``.

        Returns:
            ``None`` when either utterance has no time; otherwise ``True``
            if this utterance starts at least ``planning_buffer`` after
            the start of *other*.
        """
        if not self.time or not other.time:
            return None
        return self.time[0] - planning_buffer >= other.time[0]

    def _validate_time(self):
        """Reset ``time`` to ``None`` unless it is a valid ``[begin, end]`` pair.

        Valid means: a list of exactly two ints/floats, each at least 0
        and below 86399999 (one millisecond short of the 86 400 000 ms in
        a day), with ``begin < end``. A warning is issued when a
        non-``None`` value is rejected.
        """
        # ``and`` short-circuits, so later checks only run on values that
        # passed the earlier ones (e.g. no indexing into a non-list).
        valid = (
            isinstance(self.time, list)
            and len(self.time) == 2
            and all(isinstance(time, (float, int)) for time in self.time)
            and all(0 <= time < 86399999 for time in self.time)
            and self.time[0] < self.time[1]
        )
        if valid:
            return
        if self.time is not None:
            warnings.warn(
                f"utterance {self.utterance} has invalid time {self.time}")
        self.time = None

    @staticmethod
    def _to_timestamp(time_ms):
        """Format a time in milliseconds as ``HH:MM:SS.mmm``."""
        time_dt = datetime.fromtimestamp(time_ms / 1000, tz=timezone.utc)
        # ``%f`` yields microseconds (6 digits); drop the last three to
        # keep millisecond precision.
        return time_dt.strftime("%H:%M:%S.%f")[:-3]

    @staticmethod
    def _clean_utterance(utterance):
        """Return *utterance* stripped of annotations, punctuation and numbers.

        The patterns are applied in order, each match being deleted:
        single-token bracketed content, punctuation (apostrophes are
        kept), stand-alone numbers, and surplus whitespace.
        """
        bracketed_content = r'[\[<]\w*[\]>]'  # e.g. [laugh] or <laugh>
        punctuation = r"[^\w\s']"  # except apostrophe
        numbers = r'\b\d+\b'  # only as a single word, not when inside a word
        # Removes all but the last character of every whitespace run.
        multiple_spaces = r'\s+(?=\s{1})'
        clean_utterance = str(utterance).strip()
        for regex in (bracketed_content, punctuation, numbers, multiple_spaces):
            clean_utterance = re.sub(regex, '', clean_utterance)
        return clean_utterance.strip()