# Source code for nicetoolbox.connectors.elan.labeling_from_elan

"""Convert flat ELAN tier data into hierarchical multiperson labeling structure."""

from collections import defaultdict
from typing import Optional

from .elan_data import ElanData, Interval
from .labeling_data import LabeledInterval, MultipersonHierarchicalData


def parse_tier_name(tier_name: str) -> tuple[str, str]:
    """Split a ``'<subject> <category>'`` tier name at its first space.

    Args:
        tier_name: Tier name to split. Everything before the first space is
            the subject; everything after it is the category, which may
            itself contain further spaces.

    Returns:
        The ``(subject, category)`` pair.

    Raises:
        ValueError: If the name contains no space, or if the text on either
            side of the first space is empty.
    """
    subject, _, category = tier_name.partition(" ")
    # partition() leaves ``category`` empty when there is no space at all, so
    # this single check covers a missing separator as well as an empty half.
    if subject and category:
        return subject, category
    raise ValueError(
        f"Tier name '{tier_name}' does not follow the expected '<subject> <category>' convention"
    )
def parse_labels(annotation: str) -> frozenset[str]:
    """Turn a comma-separated annotation string into a set of labels.

    Args:
        annotation: Annotation text with labels separated by commas.

    Returns:
        The distinct labels with surrounding whitespace removed. Pieces that
        are empty after stripping are left out, so an empty or
        whitespace-only annotation gives an empty frozenset.
    """
    stripped_pieces = map(str.strip, annotation.split(","))
    # filter(None, ...) drops the pieces that stripped down to "".
    return frozenset(filter(None, stripped_pieces))
def elan_data_to_hierarchical(elan_data: ElanData) -> MultipersonHierarchicalData:
    """Convert flat ELAN tiers into a ``subject -> category -> intervals`` mapping.

    Each tier name is split into ``(subject, category)`` with
    ``parse_tier_name`` and each interval's annotation is turned into a label
    set with ``parse_labels``.

    All subjects must carry exactly the same categories. The check is made
    against the union of every subject's categories, so the outcome does not
    depend on the order in which the tiers are listed.

    Args:
        elan_data: Parsed ELAN data whose ``tiers`` each expose ``tier_name``
            and ``intervals``.

    Returns:
        ``MultipersonHierarchicalData`` whose ``data`` maps every subject to
        every category to its list of ``LabeledInterval``. Subjects and
        categories appear in order of first appearance in ``elan_data.tiers``.

    Raises:
        ValueError: If a tier name does not follow the
            ``'<subject> <category>'`` convention, if a subject lacks a
            category that another subject has, or if there are no tiers.
    """
    # Dicts are used as insertion-ordered sets: iterating a ``set`` of strings
    # would make the key order of the result vary between interpreter runs.
    categories_per_subject: defaultdict[str, dict[str, None]] = defaultdict(dict)
    all_categories: dict[str, None] = {}
    raw_intervals: dict[tuple[str, str], list[Interval]] = {}

    for tier in elan_data.tiers:
        subject, category = parse_tier_name(tier.tier_name)
        categories_per_subject[subject][category] = None
        all_categories[category] = None
        # NOTE: a later tier with the same (subject, category) replaces the
        # intervals stored for the earlier one.
        raw_intervals[(subject, category)] = tier.intervals

    if not all_categories:
        raise ValueError("Data doesn't contain any categories!")

    # Compare every subject against the union of categories. Checking only
    # against the first subject would let a later subject's extra categories
    # pass validation and then be silently left out of the result.
    for subject, subject_categories in categories_per_subject.items():
        missing = {c for c in all_categories if c not in subject_categories}
        if missing:
            raise ValueError(f"Subject '{subject}' has missing categories: {missing}")

    data: dict[str, dict[str, list[LabeledInterval]]] = {
        subject: {
            category: [
                LabeledInterval(iv.start_sec, iv.end_sec, parse_labels(iv.annotation))
                for iv in raw_intervals[(subject, category)]
            ]
            for category in all_categories
        }
        for subject in categories_per_subject
    }
    return MultipersonHierarchicalData(data=data)