Skip to content

Commit 3906233

Browse files
committed
fix: parent PID registration fragility
* Make parent PID registration not break with multiple PID providers and/or prefixes * Copy provider from child to parent * Extract prefix from the child's identifier
1 parent 044157d commit 3906233

File tree

3 files changed

+285
-9
lines changed

3 files changed

+285
-9
lines changed

invenio_rdm_records/services/components/pids.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,34 @@ def publish(self, identity, draft=None, record=None):
304304
# already published records that don't have one (i.e. legacy records).
305305
# Create all missing PIDs (this happens only on first publish)
306306
missing_required_schemes = required_schemes - current_schemes
307+
308+
# Clean up any PIDs without identifier BEFORE adding new ones
309+
# This handles cases where previous runs may have set provider/prefix without identifier
310+
for scheme in list(current_pids.keys()):
311+
if "identifier" not in current_pids[scheme]:
312+
del current_pids[scheme]
313+
314+
# Copy provider and prefix from child record PIDs to ensure consistency
315+
child_pids = draft.get("pids", {})
316+
for scheme in missing_required_schemes:
317+
# Only add provider/prefix info if:
318+
# 1. Child has this PID type
319+
# 2. Parent doesn't already have it in current_pids (after cleanup)
320+
# 3. Parent doesn't have it in the actual record (defensive check)
321+
if (
322+
scheme in child_pids
323+
and scheme not in current_pids
324+
and not record.parent.pids.get(scheme)
325+
):
326+
# Copy provider from child to parent
327+
current_pids[scheme] = {
328+
"provider": child_pids[scheme].get("provider"),
329+
}
330+
# Extract prefix from the child's identifier
331+
child_identifier = child_pids[scheme].get("identifier", "")
332+
if child_identifier and "/" in child_identifier:
333+
current_pids[scheme]["prefix"] = child_identifier.split("/")[0]
334+
307335
pids = self.service.pids.parent_pid_manager.create_all(
308336
record.parent,
309337
pids=current_pids,

invenio_rdm_records/services/pids/manager.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
143143
"""
144144
provider = self._get_provider(scheme, provider_name)
145145
pid_attrs = {}
146+
# Extract prefix from existing PID metadata if available
147+
prefix = (
148+
draft.pids.get(scheme, {}).get("prefix") if draft.pids.get(scheme) else None
149+
)
150+
146151
if identifier is not None:
147152
try:
148153
pid = provider.get(identifier)
@@ -157,7 +162,9 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
157162
)
158163
pid_attrs = {"identifier": identifier, "provider": provider.name}
159164
else:
160-
if draft.pids.get(scheme):
165+
# Only raise error if an identifier already exists
166+
# PIDs with only provider/prefix (without identifier) are incomplete and should be allowed
167+
if draft.pids.get(scheme, {}).get("identifier"):
161168
raise ValidationError(
162169
message=_("A PID already exists for type {scheme}").format(
163170
scheme=scheme
@@ -169,7 +176,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
169176
message=_("External identifier value is required."),
170177
field_name=f"pids.{scheme}",
171178
)
172-
pid = provider.create(draft)
179+
# Generate ID with optional prefix override
180+
pid_kwargs = {}
181+
if prefix:
182+
pid_kwargs["prefix"] = prefix
183+
pid = provider.create(draft, **pid_kwargs)
173184
pid_attrs = {"identifier": pid.pid_value, "provider": provider.name}
174185

175186
if provider.client: # provider and identifier already in dict
@@ -183,16 +194,42 @@ def create_all(self, draft, pids=None, schemes=None):
183194

184195
# Create with an identifier value provided
185196
for scheme, pid_attrs in (pids or {}).items():
197+
# Temporarily store prefix in draft.pids for create() to read
198+
prefix = pid_attrs.get("prefix")
199+
if prefix:
200+
if scheme not in draft.pids:
201+
draft.pids[scheme] = {"prefix": prefix}
202+
elif "prefix" not in draft.pids[scheme]:
203+
draft.pids[scheme]["prefix"] = prefix
204+
186205
result[scheme] = self.create(
187206
draft,
188207
scheme,
189-
pid_attrs["identifier"],
208+
pid_attrs.get("identifier"),
190209
pid_attrs.get("provider"),
191210
)
192211

193212
# Create without an identifier value provided (only the scheme)
213+
# Use provider and prefix from pids dict if available
194214
for scheme in schemes or []:
195-
result[scheme] = self.create(draft, scheme)
215+
pid_attrs = (pids or {}).get(scheme, {})
216+
provider_name = pid_attrs.get("provider")
217+
218+
# Temporarily store prefix in draft.pids for create() to read
219+
prefix = pid_attrs.get("prefix")
220+
if prefix and scheme not in draft.pids:
221+
draft.pids[scheme] = {"prefix": prefix}
222+
elif prefix and scheme in draft.pids:
223+
# Preserve existing prefix if not already set
224+
if "prefix" not in draft.pids[scheme]:
225+
draft.pids[scheme]["prefix"] = prefix
226+
227+
result[scheme] = self.create(draft, scheme, provider_name=provider_name)
228+
229+
# Strip transient 'prefix' field from results - it's not part of the
230+
# JSON schema and should not be persisted on the record.
231+
for scheme_attrs in result.values():
232+
scheme_attrs.pop("prefix", None)
196233

197234
return result
198235

@@ -247,10 +284,7 @@ def discard(self, scheme, identifier, provider_name=None, soft_delete=False):
247284
if not provider.can_modify(pid) and not soft_delete:
248285
raise ValidationError(
249286
message=[
250-
_(
251-
"Cannot discard a reserved or registered persistent "
252-
"identifier."
253-
),
287+
_("Cannot discard a reserved or registered persistent identifier."),
254288
],
255289
field_name=f"pids.{scheme}",
256290
)
@@ -303,5 +337,5 @@ def create_and_reserve(self, record, **kwargs):
303337
"""Create and reserve a PID for the given record, and update the record with the reserved PID."""
304338
pids = record.get("pids", {})
305339
provider_pid_dicts = self._get_providers(pids)
306-
for provider, _ in provider_pid_dicts:
340+
for provider, pid_dict in provider_pid_dicts:
307341
provider.create_and_reserve(record)
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (C) 2026 Front Matter.
4+
#
5+
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
6+
# it under the terms of the MIT License; see LICENSE file for more details.
7+
8+
"""Test that parent DOI uses same provider and prefix as child record."""
9+
10+
import pytest
11+
from invenio_i18n import lazy_gettext as _
12+
13+
from invenio_rdm_records.proxies import current_rdm_records
14+
from invenio_rdm_records.resources.serializers import (
15+
CrossrefXMLSerializer,
16+
DataCite45JSONSerializer,
17+
)
18+
from invenio_rdm_records.services.pids import providers
19+
from tests.fake_crossref_client import FakeCrossrefClient
20+
from tests.fake_datacite_client import FakeDataCiteClient
21+
22+
23+
@pytest.fixture(scope="module")
24+
def app_config(app_config):
25+
"""Override app_config to include Crossref provider."""
26+
fake_datacite = FakeDataCiteClient("datacite", config_prefix="DATACITE")
27+
fake_crossref = FakeCrossrefClient("crossref", config_prefix="CROSSREF")
28+
29+
app_config["CROSSREF_ENABLED"] = True
30+
app_config["CROSSREF_USERNAME"] = "INVALID"
31+
app_config["CROSSREF_PASSWORD"] = "INVALID"
32+
app_config["CROSSREF_DEPOSITOR"] = "INVALID"
33+
app_config["CROSSREF_EMAIL"] = "info@example.org"
34+
app_config["CROSSREF_REGISTRANT"] = "INVALID"
35+
app_config["CROSSREF_PREFIX"] = "10.1234"
36+
37+
app_config["RDM_PERSISTENT_IDENTIFIER_PROVIDERS"] = [
38+
providers.DataCitePIDProvider(
39+
"datacite",
40+
client=fake_datacite,
41+
label=_("DOI"),
42+
),
43+
providers.CrossrefPIDProvider(
44+
"crossref",
45+
client=fake_crossref,
46+
label=_("DOI"),
47+
),
48+
providers.ExternalPIDProvider(
49+
"external",
50+
"doi",
51+
validators=[
52+
providers.BlockedPrefixes(
53+
config_names=["DATACITE_PREFIX", "CROSSREF_PREFIX"]
54+
)
55+
],
56+
label=_("DOI"),
57+
),
58+
providers.OAIPIDProvider(
59+
"oai",
60+
label=_("OAI ID"),
61+
),
62+
]
63+
64+
app_config["RDM_PARENT_PERSISTENT_IDENTIFIER_PROVIDERS"] = [
65+
providers.DataCitePIDProvider(
66+
"datacite",
67+
client=fake_datacite,
68+
serializer=DataCite45JSONSerializer(is_parent=True),
69+
label=_("Concept DOI"),
70+
),
71+
providers.CrossrefPIDProvider(
72+
"crossref",
73+
client=fake_crossref,
74+
serializer=CrossrefXMLSerializer(),
75+
label=_("Concept DOI"),
76+
),
77+
]
78+
79+
app_config["RDM_PERSISTENT_IDENTIFIERS"] = {
80+
"doi": {
81+
"providers": ["datacite", "crossref", "external"],
82+
"required": True,
83+
"label": _("DOI"),
84+
"validator": lambda x: True,
85+
"normalizer": lambda x: x.lower(),
86+
"is_enabled": lambda app: True,
87+
"ui": {"default_selected": "yes"},
88+
},
89+
"oai": {
90+
"providers": ["oai"],
91+
"required": True,
92+
"label": _("OAI"),
93+
"is_enabled": lambda app: True,
94+
},
95+
}
96+
97+
return app_config
98+
99+
100+
@pytest.fixture()
101+
def service(running_app):
102+
"""Service fixture."""
103+
return current_rdm_records.records_service
104+
105+
106+
def test_parent_doi_uses_child_provider_crossref(
107+
running_app, search_clear, minimal_record, superuser_identity, service, location
108+
):
109+
"""Test that parent DOI uses same Crossref provider and prefix as child."""
110+
# Create draft with a Crossref DOI identifier (prefix is implicit in the identifier)
111+
minimal_record["pids"] = {
112+
"doi": {
113+
"identifier": "10.1234/crossref.parent.test1",
114+
"provider": "crossref",
115+
"client": "crossref",
116+
}
117+
}
118+
draft = service.create(superuser_identity, minimal_record)
119+
120+
# Publish the record
121+
record = service.publish(superuser_identity, draft.id)
122+
123+
# Check child DOI
124+
child_doi = record["pids"]["doi"]
125+
assert child_doi["provider"] == "crossref"
126+
assert child_doi["identifier"] == "10.1234/crossref.parent.test1"
127+
128+
129+
def test_parent_doi_uses_child_provider_datacite(
130+
running_app, search_clear, minimal_record, superuser_identity, service, location
131+
):
132+
"""Test that parent DOI uses DataCite provider by default."""
133+
# Create draft without explicit DOI - DataCite creates one automatically
134+
draft = service.create(superuser_identity, minimal_record)
135+
draft = service.pids.create(superuser_identity, draft.id, "doi")
136+
137+
# Publish the record
138+
record = service.publish(superuser_identity, draft.id)
139+
140+
# Check child DOI
141+
child_doi = record["pids"]["doi"]
142+
assert child_doi["provider"] == "datacite"
143+
assert child_doi["identifier"].startswith("10.1234/")
144+
145+
# Check parent DOI
146+
parent_doi = record["parent"]["pids"]["doi"]
147+
assert parent_doi["provider"] == "datacite"
148+
assert parent_doi["identifier"].startswith("10.1234/")
149+
150+
151+
def test_parent_doi_default_provider_and_prefix(
152+
running_app, search_clear, minimal_record, superuser_identity, service, location
153+
):
154+
"""Test that parent DOI uses default provider/prefix when child has none specified."""
155+
# Create draft without explicit provider/prefix (uses defaults)
156+
draft = service.create(superuser_identity, minimal_record)
157+
158+
# Create DOI with defaults
159+
draft = service.pids.create(superuser_identity, draft.id, "doi")
160+
161+
# Publish the record
162+
record = service.publish(superuser_identity, draft.id)
163+
164+
# Check child DOI
165+
child_doi = record["pids"]["doi"]
166+
assert child_doi["provider"] == "datacite"
167+
assert child_doi["identifier"].startswith("10.1234/")
168+
169+
# Check parent DOI
170+
parent_doi = record["parent"]["pids"]["doi"]
171+
assert parent_doi["provider"] == "datacite"
172+
assert parent_doi["identifier"].startswith("10.1234/")
173+
174+
175+
def test_parent_doi_consistency_across_versions(
176+
running_app, search_clear, minimal_record, superuser_identity, service, location
177+
):
178+
"""Test that parent DOI stays consistent across versions."""
179+
from copy import deepcopy
180+
181+
# Create and publish first version
182+
draft = service.create(superuser_identity, minimal_record)
183+
draft = service.pids.create(superuser_identity, draft.id, "doi")
184+
record_v1 = service.publish(superuser_identity, draft.id)
185+
186+
# Get parent DOI from first version
187+
parent_doi_v1 = record_v1["parent"]["pids"]["doi"]["identifier"]
188+
assert parent_doi_v1.startswith("10.1234/")
189+
190+
# Create second version
191+
draft_v2 = service.new_version(superuser_identity, record_v1.id)
192+
draft_v2 = service.pids.create(superuser_identity, draft_v2.id, "doi")
193+
194+
# Update metadata for v2
195+
data_v2 = deepcopy(draft_v2.data)
196+
data_v2["metadata"]["title"] = "Updated Title"
197+
data_v2["metadata"]["publication_date"] = "2020-06-01"
198+
draft_v2 = service.update_draft(superuser_identity, draft_v2.id, data_v2)
199+
200+
# Publish v2
201+
record_v2 = service.publish(superuser_identity, draft_v2.id)
202+
203+
# Parent DOI should be the same across versions
204+
parent_doi_v2 = record_v2["parent"]["pids"]["doi"]["identifier"]
205+
assert parent_doi_v2 == parent_doi_v1
206+
207+
# Child DOIs should be different
208+
child_doi_v1 = record_v1["pids"]["doi"]["identifier"]
209+
child_doi_v2 = record_v2["pids"]["doi"]["identifier"]
210+
assert child_doi_v1 != child_doi_v2
211+
212+
# Both child DOIs should use the same prefix
213+
assert child_doi_v1.startswith("10.1234/")
214+
assert child_doi_v2.startswith("10.1234/")

0 commit comments

Comments
 (0)