Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ All notable changes to Sheaf are documented here. The format is based on [Keep a

## [Unreleased]

### Added

- **Import deduplication.** Every importer (PluralKit, SimplyPlural, Tupperbox, PluralSpace, Prism, and Sheaf native re-import) now matches each incoming member against the system's existing roster before writing, so re-importing the same export no longer doubles your members. Matching is by PluralKit ID where present (exact, so PK round-trips cleanly) and otherwise by name, scoped so a member and a custom front sharing a name never collide. A new `conflict_strategy` option chooses what happens on a match: `skip` (the default: leave the existing member untouched and add nothing), `update` (overwrite the existing member's importable fields from the export), or `create` (the old append-everything behaviour, kept as an escape hatch). The tier member cap now counts only the members an import would actually create, so re-importing into a near-full system no longer trips the cap on members that already exist. Deduplication is member-scoped: fronts, groups, journals, messages, polls, and reminders are still appended on re-import, so re-importing those sections over existing data can still duplicate them. The PluralKit member HID is now also confirmed to be stored in each member's `pluralkit_id` field, which doubles as the dedup key.

### Fixed

- **Build provenance for local compose builds.** `GET /v1/version` reports the commit/tag/build-time the backend was built from; CI-built ghcr images already set these, but a local `docker compose build` left them null because the compose `args` didn't forward them. The app service now accepts `GIT_COMMIT` / `GIT_TAG` / `BUILD_TIME` from the host environment (documented in SELFHOSTING.md), so a compose build can identify itself too. Unset values stay null, same as before.
Expand Down
3 changes: 3 additions & 0 deletions sheaf/schemas/pk_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class PKImportOptions(BaseModel):
"""What to import from a PluralKit export."""
Expand All @@ -18,6 +20,7 @@ class PKImportOptions(BaseModel):
# the importer just dropped unknown keys; this is a tightening.
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
system_profile: bool = True
member_ids: list[str] | None = Field(
default=None,
Expand Down
7 changes: 7 additions & 0 deletions sheaf/schemas/pluralspace_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class PluralspaceImportOptions(BaseModel):
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
system_profile: bool = True
member_ids: list[str] | None = None

Expand Down Expand Up @@ -78,6 +81,10 @@ class PluralspaceImportResult(BaseModel):

members_imported: int = 0
custom_fronts_imported: int = 0
# Dedup dispositions, covering all roster rows (members + custom
# fronts) that matched an existing row instead of being created.
members_skipped: int = 0
members_updated: int = 0
avatars_imported: int = 0
tags_imported: int = 0
groups_imported: int = 0
Expand Down
5 changes: 5 additions & 0 deletions sheaf/schemas/prism_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,13 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class PrismImportOptions(BaseModel):
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
system_profile: bool = True
member_ids: list[str] | None = None

Expand Down Expand Up @@ -82,6 +85,8 @@ class PrismImportResult(BaseModel):
model_config = ConfigDict(extra="forbid")

members_imported: int = 0
members_skipped: int = 0
members_updated: int = 0
avatars_imported: int = 0
groups_imported: int = 0
custom_fields_imported: int = 0
Expand Down
3 changes: 3 additions & 0 deletions sheaf/schemas/sheaf_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class SheafImportOptions(BaseModel):
"""What to import from a Sheaf export. Each flag gates one section;
Expand All @@ -21,6 +23,7 @@ class SheafImportOptions(BaseModel):
# than being silently ignored.
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
system_profile: bool = True
member_ids: list[str] | None = Field(default=None, max_length=10_000)
fronts: bool = True
Expand Down
7 changes: 7 additions & 0 deletions sheaf/schemas/sp_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class SPImportOptions(BaseModel):
"""What to import from the SP export."""
Expand All @@ -10,6 +12,7 @@ class SPImportOptions(BaseModel):
# than being silently ignored.
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
system_profile: bool = True
member_ids: list[str] | None = Field(
None, max_length=10_000, description="SP member IDs to import. None = all."
Expand Down Expand Up @@ -46,6 +49,10 @@ class SPPreviewSummary(BaseModel):
class SPImportResult(BaseModel):
members_imported: int = 0
custom_fronts_imported: int = 0
# Dedup dispositions, covering all roster rows (members + custom
# fronts) that matched an existing row instead of being created.
members_skipped: int = 0
members_updated: int = 0
fronts_imported: int = 0
groups_imported: int = 0
custom_fields_imported: int = 0
Expand Down
5 changes: 5 additions & 0 deletions sheaf/schemas/tb_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from pydantic import BaseModel, ConfigDict, Field

from sheaf.services.import_dedup import ImportConflictStrategy


class TBImportOptions(BaseModel):
"""What to import from a Tupperbox export."""
Expand All @@ -17,6 +19,7 @@ class TBImportOptions(BaseModel):
# than being silently ignored.
model_config = ConfigDict(extra="forbid")

conflict_strategy: ImportConflictStrategy = ImportConflictStrategy.SKIP
member_ids: list[str] | None = Field(
default=None,
max_length=10_000,
Expand All @@ -41,5 +44,7 @@ class TBPreviewSummary(BaseModel):

class TBImportResult(BaseModel):
members_imported: int = 0
members_skipped: int = 0
members_updated: int = 0
groups_imported: int = 0
warnings: list[str] = []
206 changes: 206 additions & 0 deletions sheaf/services/import_dedup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
"""Member deduplication for re-imports.

Every importer used to append members blindly, so importing the same
export twice doubled the roster. This module adds a match-and-resolve
layer: an importer builds its candidate Member exactly as before, then
asks `resolve_member()` what to do with it given the chosen strategy and
the members already in the system.

Match key: the candidate's stable source id (`pluralkit_id`) when it
carries one and an existing member has the same id; otherwise the name
blind-index (`name_hash`) scoped by `is_custom_front`. A candidate whose
`pluralkit_id` is unknown to the system therefore still falls back to
the name match. Names are not guaranteed unique within a system, so the
name-hash path is best-effort: a system that genuinely has two members
sharing a name will match the first. `pluralkit_id` is exact, so PK
re-imports round-trip cleanly.

The name-hash scope matters because some formats (SimplyPlural,
PluralSpace, Prism) store custom fronts as Member rows with
`is_custom_front=True`. Without the scope, a member and a custom front
that happen to share a name would match, and UPDATE would flip
`is_custom_front` and corrupt the member. `pluralkit_id` is member-only
(custom fronts never carry one), so that path needs no scoping.

Strategies:
- CREATE: always insert (the pre-dedup behaviour).
- SKIP (default): an existing match is left untouched; the candidate is
not added.
- UPDATE: an existing match's importable fields are overwritten from the
candidate.

The caller is responsible for two things based on the disposition:
* db.add() the candidate ONLY when disposition == "created";
* use the returned member in its source-id -> member map either way,
so downstream sections (fronts, groups, custom fields) link to the
right row whether it was created, skipped, or updated.
"""

from __future__ import annotations

import enum
import uuid
from dataclasses import dataclass, field

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from sheaf.models.member import Member


class ImportConflictStrategy(enum.StrEnum):
CREATE = "create"
SKIP = "skip"
UPDATE = "update"


# Columns every importer populates on a freshly-built Member, so UPDATE
# copies them across unconditionally. is_custom_front is intentionally
# absent: matching is already scoped by it (members only match members,
# custom fronts only match custom fronts), so a matched pair always
# agrees on it. Some importers also leave it None on the candidate and
# rely on the column server-default; copying that None would null out
# the existing row's NOT NULL column.
_ALWAYS_OVERWRITE = (
    "name",
    "name_hash",
    "privacy",
)
# Optional columns: UPDATE copies these only when the candidate actually
# carries a value, so a re-import never nulls a field the source format
# doesn't model (e.g. PluralKit has no emoji, so a PK update must not
# wipe an emoji the user set after the first import).
_OVERWRITE_IF_SET = (
    "display_name",
    "description",
    "pronouns",
    "avatar_url",
    "color",
    "birthday",
    "pluralkit_id",
    "emoji",
    "note",
)


@dataclass
class MemberMatchIndex:
    """Lookup tables over a system's members, one per match key.

    `by_pk_id` maps a PluralKit id to its member. `by_name_hash` maps
    `(is_custom_front, name_hash)` to its member; the boolean in the key
    keeps a member and a custom front that share a name from matching
    each other.
    """

    by_pk_id: dict[str, Member] = field(default_factory=dict)
    by_name_hash: dict[tuple[bool, str], Member] = field(default_factory=dict)

    def find(
        self,
        *,
        name_hash: str,
        is_custom_front: bool,
        pluralkit_id: str | None = None,
    ) -> Member | None:
        """Return the indexed member matching the given keys, or None.

        A known `pluralkit_id` wins outright. When the id is missing or
        not indexed, the lookup falls back to the name hash within the
        `is_custom_front` scope.
        """
        if pluralkit_id:
            try:
                return self.by_pk_id[pluralkit_id]
            except KeyError:
                pass  # unknown id: fall through to the name-hash lookup
        scoped_key = (bool(is_custom_front), name_hash)
        return self.by_name_hash.get(scoped_key)

    def register(self, member: Member) -> None:
        """Add a member to the index so later candidates match against it.

        An already-indexed key is never replaced, so the first member
        registered under a key (earliest existing, or earliest created
        in this run) stays the canonical match target.
        """
        pk_id = member.pluralkit_id
        if pk_id and pk_id not in self.by_pk_id:
            self.by_pk_id[pk_id] = member
        hashed_name = member.name_hash
        if hashed_name:
            scoped_key = (bool(member.is_custom_front), hashed_name)
            if scoped_key not in self.by_name_hash:
                self.by_name_hash[scoped_key] = member


async def load_member_match_index(
    db: AsyncSession, system_id: uuid.UUID
) -> MemberMatchIndex:
    """Load the system's current members and index them for matching.

    Selects every Member whose `system_id` equals the given id and
    registers each one, in result order, in a fresh MemberMatchIndex.
    """
    query = select(Member).where(Member.system_id == system_id)
    result = await db.execute(query)
    index = MemberMatchIndex()
    for existing in result.scalars().all():
        index.register(existing)
    return index


@dataclass
class Resolution:
    """Outcome of resolving one import candidate."""

    # The row downstream sections should link to: the candidate itself
    # when it was created, otherwise the matched existing member.
    member: Member
    # One of "created", "skipped" or "updated".
    disposition: str


def _apply_update(existing: Member, candidate: Member) -> None:
    """Copy the importable fields of `candidate` onto `existing` in place.

    Fields listed in `_ALWAYS_OVERWRITE` are copied unconditionally.
    Fields listed in `_OVERWRITE_IF_SET` are copied only when the
    candidate's value is not None, so an absent value never clears what
    the existing member already has.
    """
    for attr in _ALWAYS_OVERWRITE:
        setattr(existing, attr, getattr(candidate, attr))
    for attr in _OVERWRITE_IF_SET:
        incoming = getattr(candidate, attr, None)
        if incoming is None:
            continue
        setattr(existing, attr, incoming)


def resolve_member(
    candidate: Member,
    *,
    index: MemberMatchIndex,
    strategy: ImportConflictStrategy,
) -> Resolution:
    """Work out whether a freshly-built candidate is new or a duplicate.

    Returns a Resolution whose disposition is:
      * "created" - no match (or strategy is CREATE); the member is the
        candidate itself.
      * "skipped" - a match exists and strategy is SKIP; the member is
        the untouched existing row.
      * "updated" - a match exists and strategy is UPDATE; the member is
        the existing row after its importable fields were overwritten.

    A candidate created under SKIP/UPDATE is registered in the index so
    a later row of the same import with the same key dedups against it.
    Under CREATE the index is neither consulted nor modified.
    """
    if strategy == ImportConflictStrategy.CREATE:
        return Resolution(candidate, "created")

    match = index.find(
        name_hash=candidate.name_hash,
        is_custom_front=bool(candidate.is_custom_front),
        pluralkit_id=candidate.pluralkit_id,
    )
    if match is not None:
        if strategy == ImportConflictStrategy.SKIP:
            return Resolution(match, "skipped")
        _apply_update(match, candidate)
        return Resolution(match, "updated")

    # Nothing matched: the candidate becomes a new row and a match target.
    index.register(candidate)
    return Resolution(candidate, "created")


def candidate_key(member: Member) -> tuple[str, str | None, bool]:
    """Build the match key `count_new_members` expects for a candidate.

    The key is `(name_hash, pluralkit_id, is_custom_front)`, with
    `is_custom_front` coerced to a bool so an unset (None) flag reads as
    False.
    """
    hashed_name = member.name_hash
    pk_id = member.pluralkit_id
    return (hashed_name, pk_id, bool(member.is_custom_front))


def count_new_members(
    keys: list[tuple[str, str | None, bool]],
    *,
    index: MemberMatchIndex,
    strategy: ImportConflictStrategy,
) -> int:
    """Count how many candidate keys would be created rather than matched.

    Used to size the tier member-cap check: under SKIP/UPDATE a pure
    re-import of members already in the system adds nothing, so it must
    not trip the cap.

    The walk reproduces `resolve_member` step for step without building
    Member rows:
      * lookup follows `MemberMatchIndex.find` - an exact `pluralkit_id`
        hit first, then the `(is_custom_front, name_hash)` key, which is
        consulted even when the key carries a PluralKit id the index
        does not know;
      * a key counted as new is remembered the way
        `MemberMatchIndex.register` would record it - under its
        PluralKit id when it has one AND under its name key when the
        name hash is non-empty - so later keys in the same batch dedup
        against it.

    Args:
        keys: `(name_hash, pluralkit_id, is_custom_front)` tuples, in the
            order the importer will resolve them (see `candidate_key`).
        index: Match index of the members already in the system. It is
            only read, never modified.
        strategy: The import's conflict strategy. CREATE never matches,
            so every key counts as new.

    Returns:
        The number of keys that `resolve_member` would report as
        "created".
    """
    if strategy == ImportConflictStrategy.CREATE:
        return len(keys)

    seen_new_pk: set[str] = set()
    seen_new_name: set[tuple[bool, str]] = set()
    new_count = 0
    for name_hash, pk_id, is_cf in keys:
        # Exact PluralKit-id match against existing or earlier-new rows.
        if pk_id and (pk_id in index.by_pk_id or pk_id in seen_new_pk):
            continue
        # Name fallback applies to every key, not just those without a
        # PluralKit id: find() falls through to it on an unknown id.
        name_key = (bool(is_cf), name_hash)
        if name_key in index.by_name_hash or name_key in seen_new_name:
            continue
        new_count += 1
        # Record both keys, as register() does for a created candidate.
        if pk_id:
            seen_new_pk.add(pk_id)
        if name_hash:
            seen_new_name.add(name_key)
    return new_count
Loading
Loading