# Source code for ursa.participant

"""Participant name → catalog-safe slug.

The Ursa catalog requires every identifier to match
``[A-Za-z0-9_-]+`` (:data:`ursa.catalog.schemas.ID_PATTERN`). Operator
input from data-engine's dashboard is free-form (``"Alice"``,
``"José Smith"``, etc.); this module is the single place that converts
it into the canonical ``participant_id`` written to the
``participants`` table and into ``RecordingRow.participant_ids``.

Ownership rationale (v0.1.7): pre-v0.1.7, the data-engine uploader
slugified before calling ``ingest()`` and ursa trusted the input.
That left "where does slugification happen" ambiguous — two
implementations of the same rule, two opportunities for them to drift.
v0.1.7 moves the rule into ursa so every catalog write goes through one
codepath, and ``DataInterface.register_participant`` and
``DataInterface.ingest(participant=...)`` both accept raw display names.
"""

from __future__ import annotations

import re
import unicodedata

# Public API: the only names exported by a star-import of this module.
__all__ = ["UNKNOWN_SLUG", "slugify_to_catalog_id"]


#: Sentinel slug returned by :func:`slugify_to_catalog_id` when no usable
#: label exists: ``None`` / non-``str`` / empty / whitespace-only input, or
#: input with no catalog-safe characters left after normalization (e.g.
#: non-ASCII-only or punctuation-only text).
#:
#: The slugifier strips leading/trailing ``_`` from every slug it builds, so
#: operator-typed text can never slug to this value. Anonymous recordings
#: therefore group under one catalog ID rather than colliding with a real
#: participant by accident.
UNKNOWN_SLUG: str = "__unknown__"


def slugify_to_catalog_id(label: str | None) -> str:
    """Slug a free-form string into an ursa-catalog-safe identifier.

    Rules:

    1. ``None`` / non-``str`` / empty / whitespace-only →
       :data:`UNKNOWN_SLUG`.
    2. NFKD-normalize so diacritics decompose to base letters
       (``José`` → ``Jose``, ``André Smith`` → ``Andre_Smith``). Two
       recordings of the same person spelled with vs. without a
       diacritic land under the same catalog ID.
    3. Lower-case is **not** applied (the catalog is case-sensitive;
       ``Alice`` and ``alice`` remain distinct).
    4. Replace runs of non-``[A-Za-z0-9_-]`` characters with a single
       ``_``; strip leading/trailing ``_``. If nothing is left, the
       result is :data:`UNKNOWN_SLUG`.

    Non-ASCII letters that have no NFKD decomposition onto an ASCII base
    letter (e.g. ``ø``, ``ß``) are dropped rather than transliterated, so
    ``Søren`` slugs to ``Sren``.

    Parameters
    ----------
    label
        Raw display name as typed by the operator, or ``None`` when no
        name was supplied.

    Returns
    -------
    str
        A catalog-safe slug (matches ``ID_PATTERN``).

    Examples
    --------
    >>> slugify_to_catalog_id("Alice")
    'Alice'
    >>> slugify_to_catalog_id("José")
    'Jose'
    >>> slugify_to_catalog_id("André Smith")
    'Andre_Smith'
    >>> slugify_to_catalog_id("山田")  # non-ASCII-only → fallback
    '__unknown__'
    >>> slugify_to_catalog_id("")
    '__unknown__'
    >>> slugify_to_catalog_id(None)
    '__unknown__'
    """
    # Anything that is not a string (including ``None``) has no usable label.
    if not isinstance(label, str):
        return UNKNOWN_SLUG

    stripped = label.strip()
    if not stripped:
        return UNKNOWN_SLUG

    # NFKD decomposes ``é`` into ``e`` + combining-acute; the ASCII
    # encode-with-ignore drops the combining mark. ``José`` round-trips
    # to ``Jose`` (not lossy-collapsed to ``Jos``). Entirely non-ASCII
    # input (CJK, Cyrillic w/o Latin transliteration) slugs to empty
    # and falls through to UNKNOWN_SLUG.
    decomposed = unicodedata.normalize("NFKD", stripped)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    # Collapse each run of disallowed characters into one ``_``, then trim
    # the outer ``_`` so separators never lead or trail the identifier.
    slug = re.sub(r"[^A-Za-z0-9_-]+", "_", ascii_only).strip("_")
    return slug or UNKNOWN_SLUG