Source code for ursa.participant

"""Participant name → catalog-safe slug.

The Ursa catalog requires every identifier to match
``[A-Za-z0-9_-]+`` (:data:`ursa.catalog.schemas.ID_PATTERN`). Operator
input from data-engine's dashboard is free-form (``"Alice"``,
``"José Smith"``, etc.); this module is the single place that converts
it into the canonical ``participant_id`` written to the
``participants`` table and into ``RecordingRow.participant_ids``.

Ownership rationale (v0.1.7): pre-v0.1.7, the data-engine uploader
slugified before calling ``ingest()`` and ursa trusted the input.
That left "where does slugification happen" ambiguous — two
implementations of the same rule, two opportunities for them to drift.
v0.1.7 moves the rule into ursa so every catalog write goes through one
codepath, and ``DataInterface.register_participant`` and
``DataInterface.ingest(participant=...)`` both accept raw display names.
"""

from __future__ import annotations

import re
import unicodedata

# Names exported by a star-import of this module (the module's public API).
__all__ = ["UNKNOWN_SLUG", "slugify_to_catalog_id"]


#: Fallback catalog ID returned by :func:`slugify_to_catalog_id` when no
#: usable label exists: the input is not a ``str`` (e.g. ``None``), is
#: empty or whitespace-only, or has nothing left once non-ASCII characters
#: are dropped and leading/trailing ``_`` are stripped.
#:
#: Because the slugifier strips leading/trailing underscores from every
#: non-fallback result, operator-typed input can never slug to this exact
#: value, so anonymous recordings group under one catalog ID rather than
#: colliding with a real participant by accident.
UNKNOWN_SLUG: str = "__unknown__"



[docs]
def slugify_to_catalog_id(label: str | None) -> str:
    """Slug a free-form string into an ursa-catalog-safe identifier.

    Rules:

    1. ``None`` / empty / whitespace-only → :data:`UNKNOWN_SLUG`.
    2. NFKD-normalize so diacritics decompose to base letters
       (``José`` → ``Jose``, ``André Smith`` → ``Andre_Smith``). Two
       recordings of the same person spelled with vs. without a
       diacritic land under the same catalog ID.
    3. Lower-case is **not** applied (the catalog is case-sensitive;
       ``Alice`` and ``alice`` remain distinct). Replace runs of
       non-``[A-Za-z0-9_-]`` characters with a single ``_``; strip
       leading/trailing ``_``.

    Returns
    -------
    str
        A catalog-safe slug (matches ``ID_PATTERN``).

    Examples
    --------
    >>> slugify_to_catalog_id("Alice")
    'Alice'
    >>> slugify_to_catalog_id("José")
    'Jose'
    >>> slugify_to_catalog_id("André Smith")
    'Andre_Smith'
    >>> slugify_to_catalog_id("山田")  # non-ASCII-only → fallback
    '__unknown__'
    >>> slugify_to_catalog_id("")
    '__unknown__'
    >>> slugify_to_catalog_id(None)
    '__unknown__'
    """
    if not isinstance(label, str):
        return UNKNOWN_SLUG
    stripped = label.strip()
    if not stripped:
        return UNKNOWN_SLUG
    # NFKD decomposes ``é`` into ``e`` + combining-acute; the ASCII
    # encode-with-ignore drops the combining mark. ``José`` round-trips
    # to ``Jose`` (not lossy-collapsed to ``Jos``). Entirely non-ASCII
    # input (CJK, Cyrillic w/o Latin transliteration) slugs to empty
    # and falls through to UNKNOWN_SLUG.
    decomposed = unicodedata.normalize("NFKD", stripped)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    slug = re.sub(r"[^A-Za-z0-9_-]+", "_", ascii_only).strip("_")
    return slug or UNKNOWN_SLUG