Source code for sktalk.corpus.parsing.cha

import re
import pylangacq
from ..utterance import Utterance
from .parser import InputFile


[docs] class ChaFile(InputFile): # participant may not be repeated on each line, hence the ?
[docs] PARTICIPANT_REGEX = r"(^\*(?P<participant>[^:]+)\:)?"
[docs] UTTERANCE_REGEX = r"\s+(?P<utterance>.*)\s+"
[docs] TIMING_REGEX = r"\D(?P<timing>\d{1,9}_\d{1,9})\D"
[docs] LINE_REGEX = PARTICIPANT_REGEX + UTTERANCE_REGEX + TIMING_REGEX
[docs] SPACER_REGEX = r"\((?P<spacer>[\d.]+)\)"
[docs] def _pla_reader(self) -> pylangacq.Reader: return pylangacq.read_chat(self._path)
[docs] def _extract_metadata(self): return self._pla_reader().headers()[0]
[docs] def _extract_utterances(self): with open(self._path, encoding="utf-8") as f: lines = f.readlines() utterance_info = [self._extract_info( line) for line in lines if not line.startswith("@")] # collect all utterance info in a terrible, terrible loop collection = [] participant = None for info in utterance_info: if info["utterance"] is None: continue if info["participant"] is not None: participant = info["participant"] complete_utterance = Utterance( participant=participant, time=info["time"], utterance=info["utterance"]) collection.append(complete_utterance) return collection
@staticmethod
[docs] def _extract_info(line): default_return = {"utterance": None} extract_re = re.search(ChaFile.LINE_REGEX, line) if not extract_re: return default_return try: utterance = ChaFile._clean_utterance(extract_re["utterance"]) except TypeError: return default_return if utterance is not None: timing = ChaFile._clean_timing(extract_re["timing"]) return ({ "participant": extract_re["participant"], "time": timing, "utterance": utterance }) return default_return
@staticmethod
[docs] def _clean_utterance(utterance): if re.match(ChaFile.SPACER_REGEX, utterance): return None return str(utterance).strip()
@staticmethod
[docs] def _clean_timing(timing): timing = timing.split("_") return [int(t) for t in timing] if len(timing) == 2 else None