Source code for ursa.participant
"""Participant name → catalog-safe slug.

The Ursa catalog requires every identifier to match
``[A-Za-z0-9_-]+`` (:data:`ursa.catalog.schemas.ID_PATTERN`). Operator
input from data-engine's dashboard is free-form (``"Alice"``,
``"José Smith"``, etc.); this module is the single place that converts
it into the canonical ``participant_id`` written to the
``participants`` table and into ``RecordingRow.participant_ids``.

Ownership rationale (v0.1.7): pre-v0.1.7, the data-engine uploader
slugified before calling ``ingest()`` and ursa trusted the input.
That left "where does slugification happen" ambiguous — two
implementations of the same rule, and two opportunities for them to
drift. v0.1.7 moves the rule into ursa so every catalog write goes
through one codepath, and ``DataInterface.register_participant`` and
``DataInterface.ingest(participant=...)`` both accept raw display names.
"""
from __future__ import annotations
import re
import unicodedata
# Names exported by ``from ursa.participant import *``.
__all__ = ["UNKNOWN_SLUG", "slugify_to_catalog_id"]

#: Sentinel returned when no usable label exists (None / empty /
#: whitespace-only / non-ASCII-only, or anything else that leaves nothing
#: after slugging, e.g. punctuation-only input). Distinct from any
#: operator-typeable literal so anonymous recordings group under one
#: catalog ID rather than colliding by accident:
#: ``slugify_to_catalog_id`` strips leading/trailing ``_`` from real
#: input, so a typed ``"__unknown__"`` slugs to ``"unknown"`` instead.
UNKNOWN_SLUG: str = "__unknown__"
[docs]
def slugify_to_catalog_id(label: str | None) -> str:
"""Slug a free-form string into an ursa-catalog-safe identifier.
Rules:
1. ``None`` / empty / whitespace-only → :data:`UNKNOWN_SLUG`.
2. NFKD-normalize so diacritics decompose to base letters
(``José`` → ``Jose``, ``André Smith`` → ``Andre_Smith``). Two
recordings of the same person spelled with vs. without a
diacritic land under the same catalog ID.
3. Lower-case is **not** applied (the catalog is case-sensitive;
``Alice`` and ``alice`` remain distinct). Replace runs of
non-``[A-Za-z0-9_-]`` characters with a single ``_``; strip
leading/trailing ``_``.
Returns
-------
str
A catalog-safe slug (matches ``ID_PATTERN``).
Examples
--------
>>> slugify_to_catalog_id("Alice")
'Alice'
>>> slugify_to_catalog_id("José")
'Jose'
>>> slugify_to_catalog_id("André Smith")
'Andre_Smith'
>>> slugify_to_catalog_id("山田") # non-ASCII-only → fallback
'__unknown__'
>>> slugify_to_catalog_id("")
'__unknown__'
>>> slugify_to_catalog_id(None)
'__unknown__'
"""
if not isinstance(label, str):
return UNKNOWN_SLUG
stripped = label.strip()
if not stripped:
return UNKNOWN_SLUG
# NFKD decomposes ``é`` into ``e`` + combining-acute; the ASCII
# encode-with-ignore drops the combining mark. ``José`` round-trips
# to ``Jose`` (not lossy-collapsed to ``Jos``). Entirely non-ASCII
# input (CJK, Cyrillic w/o Latin transliteration) slugs to empty
# and falls through to UNKNOWN_SLUG.
decomposed = unicodedata.normalize("NFKD", stripped)
ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
slug = re.sub(r"[^A-Za-z0-9_-]+", "_", ascii_only).strip("_")
return slug or UNKNOWN_SLUG