Skip to content

Commit d897466

Browse files
committed
feat: add Crossref DOI provider
* add more flexible DataCite DOI prefix handling, similar to Crossref
1 parent 06022bc commit d897466

20 files changed

+1911
-367
lines changed

invenio_rdm_records/config.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from . import tokens
3131
from .requests.community_inclusion import CommunityInclusion
3232
from .requests.community_submission import CommunitySubmission
33-
from .resources.serializers import CrossrefXMLSerializer, DataCite45JSONSerializer
33+
from .resources.serializers import DataCite45JSONSerializer
3434
from .services import facets
3535
from .services.config import lock_edit_published_files
3636
from .services.permissions import RDMRecordPermissionPolicy
@@ -536,14 +536,15 @@ def always_valid(identifier):
536536
"""
537537

538538
RDM_PERSISTENT_IDENTIFIERS = {
539-
# DOI automatically removed if DATACITE_ENABLED is False.
539+
# DOI automatically removed if DATACITE_ENABLED and CROSSREF_ENABLED are False.
540540
"doi": {
541-
"providers": ["datacite", "external"],
541+
"providers": ["datacite", "crossref", "external"],
542542
"required": True,
543543
"label": _("DOI"),
544544
"validator": idutils.is_doi,
545545
"normalizer": idutils.normalize_doi,
546-
"is_enabled": providers.DataCitePIDProvider.is_enabled,
546+
"is_enabled": providers.DataCitePIDProvider.is_enabled
547+
or providers.CrossrefPIDProvider.is_enabled,
547548
"ui": {"default_selected": "yes"}, # "yes", "no" or "not_needed"
548549
},
549550
"oai": {
@@ -612,6 +613,9 @@ def always_valid(identifier):
612613
DATACITE_PREFIX = ""
613614
"""DataCite DOI prefix."""
614615

616+
DATACITE_ADDITIONAL_PREFIXES = []
617+
"""List of additional DataCite DOI prefixes supported for registration."""
618+
615619
DATACITE_TEST_MODE = True
616620
"""DataCite test mode enabled."""
617621

@@ -651,6 +655,9 @@ def make_doi(prefix, record):
651655
CROSSREF_PREFIX = ""
652656
"""Crossref DOI prefix."""
653657

658+
CROSSREF_ADDITIONAL_PREFIXES = []
659+
"""List of additional Crossref DOI prefixes supported for registration."""
660+
654661
CROSSREF_DEPOSITOR = ""
655662
"""Crossref depositor name."""
656663

invenio_rdm_records/resources/serializers/crossref/__init__.py

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
CrossrefXMLSchema,
1414
Metadata,
1515
write_crossref_xml,
16+
tostring,
1617
)
1718
from flask import current_app
1819
from flask_resources import BaseListSchema, MarshmallowSerializer
@@ -24,39 +25,78 @@ class CrossrefXMLSerializer(MarshmallowSerializer):
2425

2526
def __init__(self, **options):
    """Constructor.

    :param options: Keyword arguments forwarded to the parent serializer.
        An ``encoder`` entry, if present, replaces the default ``tostring``
        encoder.
    """
    # Pop (rather than get) the encoder: leaving it inside ``options`` would
    # pass ``encoder`` twice to ``super().__init__`` and raise ``TypeError``
    # whenever a caller supplies a custom encoder.
    # NOTE(review): ``dump_obj`` already returns the result of ``tostring``;
    # confirm the default encoder is meant to be applied to that value.
    encoder = options.pop("encoder", tostring)
    super().__init__(
        format_serializer_cls=SimpleSerializer,
        object_schema_cls=CrossrefXMLSchema,
        list_schema_cls=BaseListSchema,
        encoder=encoder,
        **options,
    )
3436

35-
def dump_obj(self, record):
37+
def dump_obj(self, record, url=None):
    """Dump a single record as Crossref XML.

    The Crossref XML head element is filled from the ``CROSSREF_DEPOSITOR``,
    ``CROSSREF_EMAIL`` and ``CROSSREF_REGISTRANT`` config variables.

    :param record: Record instance (dict, Record model, or ChainObject).
    :param url: the landing page URL for the DOI.
        Falls back to ``SITE_UI_URL``/records/<id> if not provided.
    :returns: the output of ``tostring`` for the generated Crossref XML, or
        an empty string if a ``CrossrefError`` is raised.
    """
    is_mapping = isinstance(record, dict)
    record_id = record.get("id") if is_mapping else getattr(record, "id", None)

    # Determine the URL that the DOI resolves to, in the following order:
    #
    # 1. identifier of type url in ``metadata.identifiers``
    #    (e.g. archived original content)
    # 2. The landing page URL passed by the PID service
    # 3. Default constructed from ``SITE_UI_URL`` and record ID
    #    (e.g. for Celery tasks or tests without UI endpoints)
    if is_mapping:
        metadata_section = record.get("metadata", {})
    else:
        metadata_section = getattr(record, "metadata", None)
    # A missing or ``None`` metadata section simply yields no identifiers.
    lookup = getattr(metadata_section, "get", None)
    identifiers = lookup("identifiers", []) if callable(lookup) else []

    archived_url = next(
        (
            item.get("identifier")
            for item in (identifiers or [])
            if item.get("scheme") == "url" and item.get("identifier") is not None
        ),
        None,
    )
    registered_url = archived_url or url

    # ``not`` instead of ``is None``: an empty-string URL is no more usable
    # than a missing one, so it should also trigger the fallback.
    if not registered_url:
        site_url = current_app.config.get("SITE_UI_URL", "")
        if site_url and record_id:
            registered_url = f"{site_url}/records/{record_id}"

    # Convert the metadata to crossref_xml format via the commonmeta
    # intermediary format. XML Schema validation errors raise CrossrefError.
    # ``metadata`` is pre-bound so the error handler can run even when the
    # ``Metadata`` constructor itself is what raised.
    metadata = None
    try:
        metadata = Metadata(
            record,
            via="inveniordm",
            url=registered_url,
        )
        crossref_xml = write_crossref_xml(metadata)
        head = {
            "depositor": current_app.config.get("CROSSREF_DEPOSITOR"),
            "email": current_app.config.get("CROSSREF_EMAIL"),
            "registrant": current_app.config.get("CROSSREF_REGISTRANT"),
        }
        return tostring(crossref_xml, head=head)
    except CrossrefError as e:
        # Prefer the converted metadata's id; fall back to the record id.
        subject = getattr(metadata, "id", None) or record_id
        current_app.logger.error(
            f"CrossrefError while converting {subject} to Crossref XML: {e}"
        )
        return ""
58-
59-
@classmethod
60-
def crossref_xml_tostring(cls, record):
61-
"""Stringify a Crossref XML record."""
62-
return record

invenio_rdm_records/services/components/pids.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,34 @@ def publish(self, identity, draft=None, record=None):
304304
# already published records that don't have one (i.e. legacy records).
305305
# Create all missing PIDs (this happens only on first publish)
306306
missing_required_schemes = required_schemes - current_schemes
307+
308+
# Clean up any PIDs without identifier BEFORE adding new ones
309+
# This handles cases where previous runs may have set provider/prefix without identifier
310+
for scheme in list(current_pids.keys()):
311+
if "identifier" not in current_pids[scheme]:
312+
del current_pids[scheme]
313+
314+
# Copy provider and prefix from child record PIDs to ensure consistency
315+
child_pids = draft.get("pids", {})
316+
for scheme in missing_required_schemes:
317+
# Only add provider/prefix info if:
318+
# 1. Child has this PID type
319+
# 2. Parent doesn't already have it in current_pids (after cleanup)
320+
# 3. Parent doesn't have it in the actual record (defensive check)
321+
if (
322+
scheme in child_pids
323+
and scheme not in current_pids
324+
and not record.parent.pids.get(scheme)
325+
):
326+
# Copy provider from child to parent
327+
current_pids[scheme] = {
328+
"provider": child_pids[scheme].get("provider"),
329+
}
330+
# Extract prefix from the child's identifier
331+
child_identifier = child_pids[scheme].get("identifier", "")
332+
if child_identifier and "/" in child_identifier:
333+
current_pids[scheme]["prefix"] = child_identifier.split("/")[0]
334+
307335
pids = self.service.pids.parent_pid_manager.create_all(
308336
record.parent,
309337
pids=current_pids,

invenio_rdm_records/services/pids/manager.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
143143
"""
144144
provider = self._get_provider(scheme, provider_name)
145145
pid_attrs = {}
146+
# Extract prefix from existing PID metadata if available
147+
prefix = (
148+
draft.pids.get(scheme, {}).get("prefix") if draft.pids.get(scheme) else None
149+
)
150+
146151
if identifier is not None:
147152
try:
148153
pid = provider.get(identifier)
@@ -157,7 +162,9 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
157162
)
158163
pid_attrs = {"identifier": identifier, "provider": provider.name}
159164
else:
160-
if draft.pids.get(scheme):
165+
# Only raise error if an identifier already exists
166+
# PIDs with only provider/prefix (without identifier) are incomplete and should be allowed
167+
if draft.pids.get(scheme, {}).get("identifier"):
161168
raise ValidationError(
162169
message=_("A PID already exists for type {scheme}").format(
163170
scheme=scheme
@@ -169,7 +176,11 @@ def create(self, draft, scheme, identifier=None, provider_name=None):
169176
message=_("External identifier value is required."),
170177
field_name=f"pids.{scheme}",
171178
)
172-
pid = provider.create(draft)
179+
# Generate ID with optional prefix override
180+
pid_kwargs = {}
181+
if prefix:
182+
pid_kwargs["prefix"] = prefix
183+
pid = provider.create(draft, **pid_kwargs)
173184
pid_attrs = {"identifier": pid.pid_value, "provider": provider.name}
174185

175186
if provider.client: # provider and identifier already in dict
@@ -183,16 +194,42 @@ def create_all(self, draft, pids=None, schemes=None):
183194

184195
# Create with an identifier value provided
185196
for scheme, pid_attrs in (pids or {}).items():
197+
# Temporarily store prefix in draft.pids for create() to read
198+
prefix = pid_attrs.get("prefix")
199+
if prefix:
200+
if scheme not in draft.pids:
201+
draft.pids[scheme] = {"prefix": prefix}
202+
elif "prefix" not in draft.pids[scheme]:
203+
draft.pids[scheme]["prefix"] = prefix
204+
186205
result[scheme] = self.create(
187206
draft,
188207
scheme,
189-
pid_attrs["identifier"],
208+
pid_attrs.get("identifier"),
190209
pid_attrs.get("provider"),
191210
)
192211

193212
# Create without an identifier value provided (only the scheme)
213+
# Use provider and prefix from pids dict if available
194214
for scheme in schemes or []:
195-
result[scheme] = self.create(draft, scheme)
215+
pid_attrs = (pids or {}).get(scheme, {})
216+
provider_name = pid_attrs.get("provider")
217+
218+
# Temporarily store prefix in draft.pids for create() to read
219+
prefix = pid_attrs.get("prefix")
220+
if prefix and scheme not in draft.pids:
221+
draft.pids[scheme] = {"prefix": prefix}
222+
elif prefix and scheme in draft.pids:
223+
# Preserve existing prefix if not already set
224+
if "prefix" not in draft.pids[scheme]:
225+
draft.pids[scheme]["prefix"] = prefix
226+
227+
result[scheme] = self.create(draft, scheme, provider_name=provider_name)
228+
229+
# Strip transient 'prefix' field from results - it's not part of the
230+
# JSON schema and should not be persisted on the record.
231+
for scheme_attrs in result.values():
232+
scheme_attrs.pop("prefix", None)
196233

197234
return result
198235

@@ -247,10 +284,7 @@ def discard(self, scheme, identifier, provider_name=None, soft_delete=False):
247284
if not provider.can_modify(pid) and not soft_delete:
248285
raise ValidationError(
249286
message=[
250-
_(
251-
"Cannot discard a reserved or registered persistent "
252-
"identifier."
253-
),
287+
_("Cannot discard a reserved or registered persistent identifier."),
254288
],
255289
field_name=f"pids.{scheme}",
256290
)
@@ -303,5 +337,5 @@ def create_and_reserve(self, record, **kwargs):
303337
"""Create and reserve a PID for the given record, and update the record with the reserved PID."""
304338
pids = record.get("pids", {})
305339
provider_pid_dicts = self._get_providers(pids)
306-
for provider, _ in provider_pid_dicts:
340+
for provider, pid_dict in provider_pid_dicts:
307341
provider.create_and_reserve(record)

invenio_rdm_records/services/pids/providers/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def create(self, record, pid_value=None, status=None, **kwargs):
8383
if pid_value is None:
8484
if not self.is_managed():
8585
raise ValueError(_("You must provide a pid value."))
86-
pid_value = self.generate_id(record)
86+
pid_value = self.generate_id(record, **kwargs)
8787

8888
try:
8989
pid = self.get(pid_value)
@@ -98,10 +98,14 @@ def create(self, record, pid_value=None, status=None, **kwargs):
9898
status=status or self.default_status,
9999
)
100100

101-
# re-activate if previously deleted
101+
# PID exists - check if it can be reused
102+
# Re-activate if previously deleted
102103
if pid.is_deleted():
103104
pid.sync_status(PIDStatus.NEW)
104105
return pid
106+
# If PID exists and belongs to the same record, return it (idempotent)
107+
elif pid.object_uuid == record.id:
108+
return pid
105109
else:
106110
raise PIDAlreadyExists(self.pid_type, pid_value)
107111

0 commit comments

Comments
 (0)