Skip to content

Commit feb3f14

Browse files
feat(dataset): export dataset keywords (#3454)
1 parent a355ac8 commit feb3f14

7 files changed

Lines changed: 84 additions & 14 deletions

File tree

renku/command/schema/agent.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"""Agents JSON-LD schemes."""
1717

1818
from calamus.schema import JsonLDSchema
19-
from marshmallow import EXCLUDE
19+
from marshmallow import EXCLUDE, pre_load
2020

2121
from renku.command.schema.calamus import StringList, fields, prov, schema
2222
from renku.domain_model.provenance.agent import Person, SoftwareAgent
@@ -32,6 +32,27 @@ class Meta:
3232
model = Person
3333
unknown = EXCLUDE
3434

35+
@pre_load
def fix_affiliation(self, data, **kwargs):
    """Normalize the JSON-LD affiliation attribute to a list of plain strings.

    Incoming JSON-LD may encode ``http://schema.org/affiliation`` as a single
    value, a list, or nested objects carrying a ``http://schema.org/name``
    key (whose value may itself be a list). Flatten every variant to a flat
    list of strings so the ``StringList`` field can load it.

    Args:
        data: Raw JSON-LD dictionary being deserialized.
        **kwargs: Additional marshmallow hook arguments (unused).

    Returns:
        The ``data`` dictionary, with the affiliation entry normalized when
        it was present and truthy.
    """
    affiliation = data.get("http://schema.org/affiliation")
    if affiliation:
        if not isinstance(affiliation, list):
            affiliation = [affiliation]

        affiliations = []
        for entry in affiliation:
            if isinstance(entry, dict):
                name = entry.get("http://schema.org/name", "")
                if isinstance(name, list):
                    # Guard the empty-list case: bare ``name[0]`` would
                    # raise IndexError on ``{"http://schema.org/name": []}``.
                    name = name[0] if name else ""
            else:
                name = str(entry)
            affiliations.append(name)

        data["http://schema.org/affiliation"] = affiliations

    return data
55+
3556
affiliation = StringList(schema.affiliation, load_default=None)
3657
alternate_name = StringList(schema.alternateName, load_default=None)
3758
email = fields.String(schema.email, load_default=None)

renku/core/dataset/providers/dataverse.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
AUTHOR_METADATA_TEMPLATE,
3838
CONTACT_METADATA_TEMPLATE,
3939
DATASET_METADATA_TEMPLATE,
40+
KEYWORDS_METADATA_TEMPLATE,
4041
)
4142
from renku.core.dataset.providers.doi import DOIProvider
4243
from renku.core.dataset.providers.repository import RepositoryImporter, make_request
@@ -311,6 +312,7 @@ def __init__(
311312
name=None,
312313
parent_url=None,
313314
type=None,
315+
encoding_format=None,
314316
):
315317
self.content_size = content_size
316318
self.content_url = content_url
@@ -321,6 +323,7 @@ def __init__(
321323
self.name = name
322324
self.parent_url = parent_url
323325
self.type = type
326+
self.encoding_format = encoding_format
324327

325328
@property
326329
def remote_url(self):
@@ -384,13 +387,15 @@ def export(self, **kwargs):
384387
def _get_dataset_metadata(self):
    """Assemble the Dataverse dataset-level metadata payload.

    Renders ``DATASET_METADATA_TEMPLATE`` with the dataset's title,
    creators, description, subject and keywords, then parses the result.

    Returns:
        dict: Parsed JSON metadata ready for submission to Dataverse.
    """
    authors, contacts = self._get_creators()
    rendered = Template(DATASET_METADATA_TEMPLATE).substitute(
        name=_escape_json_string(self.dataset.title),
        authors=json.dumps(authors),
        contacts=json.dumps(contacts),
        description=_escape_json_string(self.dataset.description),
        subject=self._get_subject(),
        keywords=json.dumps(self._get_keywords()),
    )
    return json.loads(rendered)
396401

@@ -425,6 +430,16 @@ def _get_creators(self):
425430

426431
return authors, contacts
427432

433+
def _get_keywords(self):
    """Render each dataset keyword as a Dataverse compound metadata entry.

    Returns:
        list: One parsed ``KEYWORDS_METADATA_TEMPLATE`` dict per keyword.
    """
    template = Template(KEYWORDS_METADATA_TEMPLATE)
    return [
        json.loads(template.substitute(keyword=_escape_json_string(keyword)))
        for keyword in self.dataset.keywords
    ]
442+
428443

429444
class _DataverseDeposition:
430445
"""Dataverse record for deposit."""

renku/core/dataset/providers/dataverse_metadata_templates.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@
3838
"multiple": true,
3939
"typeName": "datasetContact"
4040
},
41+
{
42+
"value": ${keywords},
43+
"typeClass": "compound",
44+
"multiple": true,
45+
"typeName": "keyword"
46+
},
4147
{
4248
"value": [
4349
{
@@ -99,3 +105,14 @@
99105
}
100106
}
101107
"""
108+
109+
KEYWORDS_METADATA_TEMPLATE = """
110+
{
111+
"keywordValue": {
112+
"typeName": "keywordValue",
113+
"multiple": false,
114+
"typeClass": "primitive",
115+
"value": "${keyword}"
116+
}
117+
}
118+
"""

renku/core/dataset/providers/zenodo.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def __init__(self, uri: str, is_doi: bool = False):
7070

7171
@staticmethod
7272
def supports(uri):
73-
"""Whether or not this provider supports a given URI."""
73+
"""Whether this provider supports a given URI."""
7474
if "zenodo" in uri.lower():
7575
return True
7676

@@ -335,10 +335,7 @@ def __init__(self, dataset, publish, tag):
335335
@property
def zenodo_url(self):
    """Return the Zenodo API base URL, honoring the sandbox toggle.

    The presence (not value) of ``ZENODO_USE_SANDBOX`` in the environment
    selects the sandbox host.
    """
    if "ZENODO_USE_SANDBOX" in os.environ:
        return ZENODO_SANDBOX_URL
    return ZENODO_BASE_URL
342339

343340
def set_access_token(self, access_token):
344341
"""Set access token."""
@@ -482,6 +479,7 @@ def attach_metadata(self, dataset, tag):
482479
{"name": creator.name, "affiliation": creator.affiliation if creator.affiliation else None}
483480
for creator in dataset.creators
484481
],
482+
"keywords": dataset.keywords,
485483
}
486484
}
487485

@@ -532,12 +530,12 @@ def _check_response(response):
532530
def _make_request(uri, accept: str = "application/json"):
    """Execute a network request for the Zenodo record referenced by ``uri``."""
    return make_request(
        url=make_records_url(ZenodoProvider.get_record_id(uri), uri=uri),
        accept=accept,
    )
538536

539537

540-
def make_records_url(record_id, uri: str):
    """Create URL to access record by ID.

    Args:
        record_id: The ID of the record.
        uri: Original record URI; when it points at ``sandbox.zenodo.org``
            the sandbox API host is used instead of the production host.

    Returns:
        str: Full URL for the record.
    """
    # Keep the API host consistent with the host the caller referenced.
    base_url = ZENODO_SANDBOX_URL if "sandbox.zenodo.org" in uri.lower() else ZENODO_BASE_URL

    return urllib.parse.urljoin(base_url, posixpath.join(ZENODO_API_PATH, "records", record_id))

renku/core/migration/models/v9.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1544,7 +1544,7 @@ def creators_csv(self):
15441544
@property
def keywords_csv(self):
    """Comma-separated list of keywords associated with dataset."""
    keywords = self.keywords or []
    return ", ".join(keywords)
15481548

15491549
@property
15501550
def tags_csv(self):

renku/domain_model/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ def creators_full_csv(self):
500500
@property
def keywords_csv(self):
    """Comma-separated list of keywords associated with dataset."""
    return ", ".join(keyword for keyword in (self.keywords or []))
504504

505505
def get_datadir(self) -> Path:
506506
"""Return dataset's data directory relative to project's root."""

tests/cli/test_integration_datasets.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,20 @@
6161
"name": "pyndl_naive_discr_v0.6.4",
6262
"creator": "Konstantin Sering, Marc Weitz, David-Elias Künstle, Lennart Schneider",
6363
"version": "v0.6.4",
64+
"keywords": {
65+
"naive discriminative learning",
66+
"linguistics",
67+
"python",
68+
"cognitive science",
69+
"machine learning",
70+
},
6471
},
6572
{
6673
"doi": "10.7910/DVN/F4NUMR",
6774
"name": "replication_data_for_2.2",
6875
"creator": "James Druckman, Martin Kifer, Michael Parkin",
6976
"version": "2",
77+
"keywords": {"Social Sciences"},
7078
},
7179
],
7280
)
@@ -104,6 +112,7 @@ def test_dataset_import_real_doi(runner, project, doi, prefix, sleep_after):
104112
assert doi["doi"] in dataset.same_as.url
105113
assert dataset.date_created is None
106114
assert dataset.date_published is not None
115+
assert doi["keywords"] == set(dataset.keywords)
107116

108117
result = runner.invoke(cli, ["graph", "export", "--format", "json-ld", "--strict"])
109118
assert 0 == result.exit_code, format_result_exception(result)
@@ -825,10 +834,10 @@ def test_dataset_export_upload_failure(runner, tmpdir, project, zenodo_sandbox):
825834
[("zenodo", [], "zenodo.org/record"), ("dataverse", ["--dataverse-name", "sdsc-published-test-dataverse"], "doi:")],
826835
)
827836
def test_dataset_export_published_url(
828-
runner, tmpdir, project, zenodo_sandbox, dataverse_demo, provider, params, output
837+
runner, tmpdir, project, zenodo_sandbox, dataverse_demo, with_injection, provider, params, output
829838
):
830839
"""Test publishing of dataset."""
831-
result = runner.invoke(cli, ["dataset", "create", "my-dataset"])
840+
result = runner.invoke(cli, ["dataset", "create", "my-dataset", "-k", "keyword", "-k", "data"])
832841

833842
assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)
834843
assert "OK" in result.output
@@ -841,7 +850,7 @@ def test_dataset_export_published_url(
841850
result = runner.invoke(cli, ["dataset", "add", "--copy", "my-dataset", str(new_file)])
842851
assert 0 == result.exit_code, format_result_exception(result) + str(result.stderr_bytes)
843852

844-
with with_dataset(name="my-dataset", commit_database=True) as dataset:
853+
with with_injection(), with_dataset(name="my-dataset", commit_database=True) as dataset:
845854
dataset.description = "awesome dataset"
846855
dataset.creators[0].affiliation = "eth"
847856

@@ -854,6 +863,14 @@ def test_dataset_export_published_url(
854863
assert "Exported to:" in result.output
855864
assert output in result.output
856865

866+
m = re.search(r"Exported to:\s*(\S*)$", result.output, flags=re.MULTILINE)
867+
doi = m.group(1)
868+
result = runner.invoke(cli, ["dataset", "import", doi, "--name", "imported"], input="y")
869+
assert 0 == result.exit_code, format_result_exception(result)
870+
871+
dataset = get_dataset_with_injection("imported")
872+
assert {"data", "keyword"} == set(dataset.keywords)
873+
857874

858875
@pytest.mark.integration
859876
@retry_failed

0 commit comments

Comments
 (0)