Source code for sktalk.corpus.conversation

import json
import warnings
from typing import Optional
import pandas as pd
from .parsing.cha import ChaFile
from .parsing.eaf import EafFile
from .utterance import Utterance
from .write.writer import Writer


class Conversation(Writer):
    """Representation of a transcribed conversation.

    A conversation holds an ordered list of Utterance objects together with a
    metadata dictionary. Pandas views of both are built lazily and cached;
    the caches are reset by `remove` and `_update`, the two methods in this
    class that change the utterances or the metadata.
    """

    def __init__(
        self,
        utterances: list["Utterance"],
        metadata: Optional[dict] = None,
        suppress_warnings: bool = False,
    ) -> None:
        """Create a conversation from a list of utterances.

        Args:
            utterances (list[Utterance]): A list of Utterance objects representing
                the utterances in the conversation. Other iterables are converted
                to a list.
            metadata (dict, optional): Additional metadata associated with the
                conversation. Defaults to None, in which case (as for an empty
                dict) `{"source": "unknown"}` is used.
            suppress_warnings (bool, optional): If True, no warning is issued
                when the conversation is empty. Defaults to False.

        Raises:
            TypeError: If `utterances` cannot be turned into a list, or if any
                element is not an Utterance.
        """
        self._metadata = metadata or {"source": "unknown"}
        self._utterances = utterances

        # Input utterances should be a list of type Utterance
        errormsg = "All utterances in a conversation should be of type Utterance"
        if not isinstance(self._utterances, list):
            try:
                self._utterances = list(self._utterances)
            except TypeError as e:
                raise TypeError(errormsg) from e
        for utterance in self._utterances:
            if not isinstance(utterance, Utterance):
                raise TypeError(errormsg)

        # The list can be empty. This would be weird and the user needs to be warned.
        if not self._utterances and not suppress_warnings:
            warnings.warn(
                "This conversation appears to be empty: no Utterances are read.")

        # Lazily built dataframe caches, see `metadata_df` and `utterance_df`.
        self._metadata_df = None
        self._utterance_df = None

    @property
    def utterances(self):
        """
        Get the list of utterances in the conversation.

        Returns:
            list[Utterance]: A list of Utterance objects representing the
            utterances in the conversation.
        """
        return self._utterances

    @property
    def metadata(self):
        """
        Get the metadata associated with the conversation.

        Returns:
            dict: Additional metadata associated with the conversation.
        """
        return self._metadata

    def __len__(self):
        """
        Get the number of utterances in the conversation.

        Returns:
            int: The number of utterances in the conversation.
        """
        return len(self._utterances)

    @property
    def participants(self):
        """
        Get the participants in the conversation.

        Returns:
            set[str]: A set of unique participant names.
        """
        return {u.participant for u in self._utterances}

    @classmethod
    def from_cha(cls, path):
        """Parse conversation file in Cha format

        Args:
            path (str): Path to the Cha file

        Returns:
            Conversation: A Conversation object representing the conversation in the file.
        """
        utterances, metadata = ChaFile(path).parse()
        return cls(utterances, metadata)

    @classmethod
    def from_eaf(cls, path: str, tiers: Optional[list[str]] = None):
        """Parse conversation file in ELAN format

        Args:
            path (str): Path to the ELAN file
            tiers (Optional[list[str]], optional): List of tiers to parse.
                Defaults to None, in which case all tiers are parsed.
                If an empty list is passed, all tiers are parsed, but a warning is issued.

        Raises:
            KeyError if tiers are named that are not found in the file.

        Returns:
            Conversation: A Conversation object representing the conversation in the file.
        """
        utterances, metadata = EafFile(path, tiers).parse()
        return cls(utterances, metadata)

    @classmethod
    def from_json(cls, path):
        """Parse conversation file in JSON format

        Args:
            path (str): Path to the JSON file, read as UTF-8.

        Returns:
            Conversation: A Conversation object representing the conversation in the file.
        """
        with open(path, encoding='utf-8') as f:
            json_in = json.load(f)
        return cls._fromdict(json_in)

    @classmethod
    def _fromdict(cls, fields):
        """Build a conversation from a dictionary.

        The entry under "Utterances" is converted to Utterance objects; all
        other entries become the conversation metadata. The input dictionary
        is left unmodified.

        Args:
            fields (dict): dictionary with an "Utterances" entry and optional
                metadata entries.

        Raises:
            TypeError: If a KeyError occurs while reading the utterances,
                e.g. because the "Utterances" entry is missing.

        Returns:
            Conversation: the conversation described by `fields`, constructed
            via `cls`.
        """
        try:
            utterances = [Utterance._fromdict(u) for u in fields["Utterances"]]
        except KeyError as e:
            raise TypeError(
                "This object cannot be imported as a Conversation.") from e
        # Build a new metadata dict rather than deleting from the caller's object.
        metadata = {key: value for key,
                    value in fields.items() if key != "Utterances"}
        return cls(utterances, metadata=metadata)

    def get_utterance(self, index) -> "Utterance":  # noqa: F821
        """Placeholder for retrieving a single utterance; not implemented.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def summary(self, n=10, **fields):
        """
        Print the first n lines of a conversation.

        Utterances whose timing is missing or does not consist of exactly two
        values are printed with "(no timing information)".

        Args:
            n (int, optional): Number of lines to print. Defaults to 10.
            fields (dict): key-value pairs with which specific utterances can be selected
        """
        selected = self.select(**fields)
        for u in selected.utterances[:n]:
            # `time` may be None or empty; check that before taking its length.
            if not u.time or len(u.time) != 2:
                time = "(no timing information)"
            else:
                time = f"({u.time[0]} - {u.time[1]})"
            print(f"{time} {u.participant}: '{u.utterance}'")

    def select(self, **fields):
        """Select utterances based on content in specific fields

        An utterance is selected when every given field equals the given value.
        Without any fields, all utterances are selected.

        Args:
            fields (dict): key-value pairs with which specific utterances can be selected

        Returns:
            Conversation: Conversation object without metadata, containing a reduced set of utterances
        """
        utterances = [
            utterance for utterance in self._utterances
            if all(getattr(utterance, key) == value for key, value in fields.items())
        ]
        return Conversation(utterances, suppress_warnings=True)

    def remove(self, **fields):
        """Remove utterances based on content in specific fields

        The utterances that `select` would return for the same fields are
        dropped from this conversation, in place. Without any fields, all
        utterances are removed.

        Args:
            fields (dict): key-value pairs with which specific utterances can be selected
        """
        self._utterances = [
            u for u in self._utterances
            if not all(getattr(u, key) == value for key, value in fields.items())
        ]
        # The cached dataframe was built from the previous utterance list.
        self._utterance_df = None

    def asdict(self):
        """
        Return the Conversation as a dictionary

        Returns:
            dict: dictionary containing Conversation metadata and Utterances
        """
        return self._metadata | {"Utterances": [u.asdict() for u in self._utterances]}

    @property
    def metadata_df(self):
        """Return the conversation metadata as a pandas dataframe.

        The dataframe is built on first access with `_metadata_to_df`
        (not defined in this class; presumably inherited from Writer) and
        cached afterwards.
        """
        if self._metadata_df is None:
            self._metadata_df = self._metadata_to_df(self._metadata)
        return self._metadata_df

    @property
    def utterance_df(self):
        """Return the conversation utterances as a pandas dataframe.

        The dataframe is built on first access and cached afterwards. A
        "source" column is inserted as first column, filled with the "source"
        entry of the metadata, or "unknown" if the metadata has no such entry.
        """
        if self._utterance_df is None:
            self._utterance_df = pd.DataFrame(self._utterances)
            # Metadata supplied by the user need not contain a "source" key.
            self._utterance_df.insert(
                loc=0, column="source", value=self._metadata.get("source", "unknown"))
        return self._utterance_df

    def _subconversation_by_index(self,
                                  index: int,
                                  before: int = 0,
                                  after: Optional[int] = None) -> "Conversation":
        """Select utterances to provide context as a sub-conversation

        Args:
            index (int): The index of the utterance for which to provide context
            before (int, optional): The number of utterances prior to indicated utterance.
                Defaults to 0.
            after (int, optional): The number of utterances after the indicated utterance.
                Defaults to None, which then assumes the same value as `before`.

        Raises:
            IndexError: Index provided must be within range of utterances

        Returns:
            Conversation: Conversation object without metadata, containing a reduced set of utterances
        """
        if index < 0 or index >= len(self._utterances):
            raise IndexError("Utterance index out of range")
        if after is None:
            after = before
        # Clamp the requested context to the boundaries of the conversation.
        left_bound = max(index - before, 0)
        right_bound = min(index + after + 1, len(self._utterances))
        return Conversation(utterances=self._utterances[left_bound:right_bound], suppress_warnings=True)

    def _subconversation_by_time(self,
                                 index: int,
                                 before: int = 0,
                                 after: int = 0,
                                 exclude_utterance_overlap: bool = False) -> "Conversation":
        """Select utterances to provide context as a sub-conversation

        Args:
            index (int): The index of the utterance for which to provide context
            before (int, optional): The time in ms preceding the utterance's begin. Defaults to 0.
            after (int, optional): The time in ms following the utterance's end. Defaults to 0
            exclude_utterance_overlap (bool, optional): If True, the duration of the
                utterance itself is not used to identify overlapping utterances, and only
                the window before or after the utterance is used. Defaults to False.
                If True, only one of `before` or `after` can be more than 0, as the window
                for overlap will be limited to the window preceding or following the utterance.

        Raises:
            IndexError: Index provided must be within range of utterances
            ValueError: If `exclude_utterance_overlap` is True and both `before`
                and `after` are more than 0.

        Returns:
            Conversation: Conversation object without metadata, containing a reduced set
            of utterances. The conversation is empty if timing information is missing.
        """
        if index < 0 or index >= len(self._utterances):
            raise IndexError("Utterance index out of range")
        if exclude_utterance_overlap and before > 0 and after > 0:
            raise ValueError(
                "When utterance is excluded from overlap window, only one of before or after can be more than 0")
        try:
            target = self._utterances[index]
            begin = target.time[0] - before
            end = target.time[1] + after
            left_bound, right_bound = None, None
            if exclude_utterance_overlap and before == 0:
                # only overlap with window following utterance
                begin = target.time[1]
                left_bound = index
            elif exclude_utterance_overlap and after == 0:
                # only overlap with window preceding utterance
                end = target.time[0]
                right_bound = index + 1
            indices = [i for i, u in enumerate(self._utterances)
                       if u.window_overlap([begin, end])]
            # Compare against None: a forced left bound of 0 is a valid bound.
            # If nothing overlaps the window, fall back to the utterance itself.
            if left_bound is None:
                left_bound = min(indices, default=index)
            if right_bound is None:
                right_bound = max(indices, default=index) + 1
            returned_utterances = self._utterances[left_bound:right_bound]
        except (TypeError, IndexError):
            # if the utterance's timing is None, a TypeError is raised
            # if the utterance has no time[0] or time[1], an IndexError is raised
            # In both cases, there is missing timing information, so no data can be returned.
            returned_utterances = []
        return Conversation(utterances=returned_utterances, suppress_warnings=True)

    def count_participants(self, except_none: bool = False) -> int:
        """Count the number of participants in a conversation

        Importantly: if one of the utterances has no participant, it is counted
        as a separate participant (None). If you want to exclude these, set
        `except_none` to True.

        Args:
            except_none (bool, optional): if `True`, utterances without a participant
                are not counted. Defaults to `False`.

        Returns:
            int: number of participants
        """
        participants = self.participants
        if except_none:
            participants = {p for p in participants if p is not None}
        return len(participants)

    def _update(self, field: str, values: list, **kwargs):
        """
        Update all utterances in the conversation with calculated values

        This function also stores relevant arguments in the Conversation metadata,
        under the "Calculations" entry, and resets the cached dataframes.

        Args:
            field (str): field of the Utterance to update
            values (list): list of values to update each utterance with
            kwargs (dict): information about the calculation to store in the Conversation metadata

        Raises:
            ValueError: If the number of values differs from the number of utterances.
        """
        if len(values) != len(self.utterances):
            raise ValueError(
                "The number of values must match the number of utterances")
        metadata = {field: kwargs}
        try:
            self._metadata["Calculations"].update(metadata)
        except KeyError:
            # No calculations stored yet: create the entry in a new metadata dict.
            self._metadata = self._metadata | {"Calculations": metadata}
        for utterance, value in zip(self.utterances, values):
            setattr(utterance, field, value)
        # Both utterances and metadata changed, so the cached views are stale.
        self._metadata_df = None
        self._utterance_df = None

    def calculate_FTO(self, window: int = 10000, planning_buffer: int = 200, n_participants: int = 2):
        """Calculate Floor Transfer Offset (FTO) per utterance

        FTO is defined as the difference between the time that a turn starts and the
        end of the most relevant prior turn by the other participant, which is not
        necessarily the prior utterance.

        An utterance does not receive an FTO if there are preceding utterances
        within the window that do not have timing information, or if it lacks
        timing information itself.

        Args:
            window (int, optional): the time in ms prior to utterance in which a
                relevant preceding utterance can be found. Defaults to 10000.
            planning_buffer (int, optional): minimum speaking time in ms to allow for a response.
                Defaults to 200.
            n_participants (int, optional): maximum number of participants overlapping with
                the utterance and preceding window. Defaults to 2.
        """
        values = []
        for index, utterance in enumerate(self.utterances):
            relevant = self.relevant_prior_utterance(
                index, window, planning_buffer, n_participants)
            # No relevant prior utterance means no FTO for this utterance.
            values.append(relevant.until(utterance)
                          if relevant is not None else None)
        self._update("FTO", values,
                     window=window,
                     planning_buffer=planning_buffer,
                     n_participants=n_participants)

    def relevant_prior_utterance(self, index, window=10000, planning_buffer=200, n_participants=2):
        """Determine the most relevant prior utterance for a given utterance

        To be a relevant prior turn, the following conditions must be met, respective to utterance U:
        - the utterance must be by another speaker than U
        - the utterance by the other speaker must be the most recent utterance by that speaker
        - the utterance must have started before utterance U, more than `planning_buffer` ms before.
        - the utterance must be partly or entirely within the context window (`window` ms prior
          to the start of utterance U)
        - within the context window, there must be a maximum of `n_participants` speakers.

        Args:
            index (int): index of the utterance to assess
            window (int, optional): the time in ms prior to utterance in which a
                relevant preceding utterance can be found. Defaults to 10000.
            planning_buffer (int, optional): minimum speaking time in ms to allow for a response.
                Defaults to 200.
            n_participants (int, optional): maximum number of participants overlapping with
                the utterance and preceding window. Defaults to 2.

        Returns:
            Utterance: the most relevant prior utterance, or None, if no relevant
            prior utterance can be identified
        """
        utterance_u = self._utterances[index]
        if not utterance_u.time or not utterance_u.participant:
            return None
        sub = self._subconversation_by_time(
            index=index, before=window, after=0, exclude_utterance_overlap=True)
        if not 2 <= sub.count_participants() <= n_participants:
            return None
        must_overlap = []
        # Walk backwards so that the most recent candidates are assessed first.
        for prior in reversed(sub.utterances):
            # if timing or participant information is missing, stop looking for relevant utterances
            if not prior.time or not prior.participant:
                break
            if prior == utterance_u:
                continue
            # if the utterance is by the same speaker, it is not relevant,
            # but must overlap with potential relevant utterance
            if utterance_u.same_speaker(prior):
                must_overlap.append(prior)
                continue
            # the relevant utterance must precede utterance U more than planning buffer
            if not utterance_u.precede_with_buffer(prior, planning_buffer):
                continue
            # verify that all utterances in must_overlap do so
            if all(utt.overlap_percentage(prior) == 100 for utt in must_overlap):
                return prior
        return None