Skip to content

Commit c672c03

Browse files
authored
Merge pull request #360 from GenomicDataInfrastructure/improved-multilangual
Improved multilangual
2 parents c375c59 + 1671bfe commit c672c03

File tree

11 files changed

+1475
-44
lines changed

11 files changed

+1475
-44
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ build/*
1616
tmp/*
1717
package/DEBIAN/control
1818
*.swp
19+
.idea

ckanext/dcat/profiles/base.py

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -533,10 +533,36 @@ def _agents_details(self, subject, predicate):
533533
"""
534534

535535
agents = []
536+
default_locale = config.get("ckan.locale_default", "") or ""
537+
default_lang = default_locale.split("_")[0] if default_locale else None
538+
536539
for agent in self.g.objects(subject, predicate):
537540
agent_details = {}
538541
agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else ""
539-
agent_details["name"] = self._object_value(agent, FOAF.name)
542+
543+
names = list(self.g.objects(agent, FOAF.name))
544+
translations = {}
545+
fallback_name = ""
546+
for name_literal in names:
547+
if isinstance(name_literal, Literal):
548+
value = str(name_literal)
549+
lang = name_literal.language
550+
if lang:
551+
translations[lang] = value
552+
elif not fallback_name:
553+
fallback_name = value
554+
elif not fallback_name:
555+
fallback_name = str(name_literal)
556+
557+
if translations:
558+
agent_details["name_translated"] = translations
559+
if default_lang and translations.get(default_lang):
560+
agent_details["name"] = translations[default_lang]
561+
else:
562+
agent_details["name"] = fallback_name or next(iter(translations.values()))
563+
else:
564+
agent_details["name"] = fallback_name
565+
540566
agent_details["email"] = self._without_mailto(
541567
self._object_value(agent, FOAF.mbox)
542568
)
@@ -839,8 +865,25 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict):
839865
self.g.add((agent_ref, RDF.type, FOAF.Organization))
840866
self.g.add((agent_ref, RDF.type, FOAF.Agent))
841867

868+
name_translated = agent_dict.get("name_translated")
869+
translated_values = set()
870+
if isinstance(name_translated, dict):
871+
for lang, values in name_translated.items():
872+
if not values:
873+
continue
874+
if isinstance(values, (list, tuple)):
875+
iterable = values
876+
else:
877+
iterable = [values]
878+
for value in iterable:
879+
if value:
880+
self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang)))
881+
translated_values.add((lang, value))
882+
842883
if agent_dict.get("name"):
843-
self.g.add((agent_ref, FOAF.name, Literal(agent_dict["name"])))
884+
name_value = agent_dict["name"]
885+
if not translated_values or all(val != name_value for _, val in translated_values):
886+
self.g.add((agent_ref, FOAF.name, Literal(name_value)))
844887
if agent_dict.get("email"):
845888
email = agent_dict["email"]
846889
if not email.startswith("mailto:"):
@@ -856,11 +899,26 @@ def _add_agent_to_graph(self, subject_ref, predicate, agent_dict):
856899
self.g.add((agent_ref, DCT.identifier, Literal(agent_dict["identifier"])))
857900

858901
for sub_org in agent_dict.get("actedOnBehalfOf", []):
859-
if sub_org.get("name"):
902+
if sub_org.get("name") or sub_org.get("name_translated"):
860903
org_ref = BNode()
861904
self.g.add((agent_ref, PROV.actedOnBehalfOf, org_ref))
862905
self.g.add((org_ref, RDF.type, PROV.Organization))
863-
self.g.add((org_ref, FOAF.name, Literal(sub_org["name"])))
906+
907+
sub_translations = sub_org.get("name_translated", {}) or {}
908+
if isinstance(sub_translations, dict):
909+
for lang, values in sub_translations.items():
910+
if not values:
911+
continue
912+
if isinstance(values, (list, tuple)):
913+
iterable = values
914+
else:
915+
iterable = [values]
916+
for value in iterable:
917+
if value:
918+
self.g.add((org_ref, FOAF.name, Literal(value, lang=lang)))
919+
920+
if sub_org.get("name"):
921+
self.g.add((org_ref, FOAF.name, Literal(sub_org["name"])))
864922

865923
return agent_ref
866924

ckanext/dcat/profiles/euro_dcat_ap_scheming.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,29 @@ def _parse_list_value(data_dict, field_name):
7070
except ValueError:
7171
pass
7272

73+
def _supports_agent_translations(field_name):
    """Return True if the schema field declares a ``name_translated`` subfield.

    Looks up ``field_name`` through ``self._schema_field`` (``self`` is
    taken from the enclosing scope) and inspects the field's
    ``repeating_subfields`` list.

    :param field_name: name of the dataset field holding a list of agents
    :returns: True when one of the repeating subfields has
        ``field_name == "name_translated"``; False when the field is
        unknown, has no repeating subfields, or none of them match
    """
    schema_field = self._schema_field(field_name)
    if not schema_field or "repeating_subfields" not in schema_field:
        return False

    # The key may be present with an empty/null value; treat that the same
    # as "no subfields" instead of failing while iterating over None.
    subfields = schema_field["repeating_subfields"] or ()
    return any(
        subfield.get("field_name") == "name_translated"
        for subfield in subfields
    )
81+
82+
def _prune_agent_translations(agent_list):
83+
pruned = []
84+
for agent_entry in agent_list:
85+
if isinstance(agent_entry, dict):
86+
agent_entry = dict(agent_entry)
87+
agent_entry.pop("name_translated", None)
88+
acted_lists = agent_entry.get("actedOnBehalfOf")
89+
if isinstance(acted_lists, list):
90+
agent_entry["actedOnBehalfOf"] = _prune_agent_translations(acted_lists)
91+
pruned.append(agent_entry)
92+
else:
93+
pruned.append(agent_entry)
94+
return pruned
95+
7396
for field_name in dataset_dict.keys():
7497
_parse_list_value(dataset_dict, field_name)
7598

@@ -117,6 +140,8 @@ def _parse_list_value(data_dict, field_name):
117140
key, predicate = item
118141
agents = self._agents_details(dataset_ref, predicate)
119142
if agents:
143+
if not _supports_agent_translations(key):
144+
agents = _prune_agent_translations(agents)
120145
dataset_dict[key] = agents
121146

122147
# Add any qualifiedRelations
@@ -239,7 +264,25 @@ def _add_agents(
239264
self.g.add((agent_ref, RDF.type, FOAF.Agent))
240265
self.g.add((dataset_ref, rdf_predicate, agent_ref))
241266

242-
self._add_triple_from_dict(agent, agent_ref, FOAF.name, "name")
267+
name_translated = agent.get("name_translated")
268+
translated_values = set()
269+
if isinstance(name_translated, dict):
270+
for lang, values in name_translated.items():
271+
if not values:
272+
continue
273+
if isinstance(values, (list, tuple)):
274+
iterable = values
275+
else:
276+
iterable = [values]
277+
for value in iterable:
278+
if value:
279+
self.g.add((agent_ref, FOAF.name, Literal(value, lang=lang)))
280+
translated_values.add((lang, value))
281+
282+
if agent.get("name"):
283+
name_value = agent["name"]
284+
if not translated_values or all(val != name_value for _, val in translated_values):
285+
self.g.add((agent_ref, FOAF.name, Literal(name_value)))
243286
self._add_triple_from_dict(
244287
agent, agent_ref, FOAF.homepage, "url", _type=URIRef
245288
)

ckanext/dcat/profiles/euro_health_dcat_ap.py

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@
2323
"dpv": DPV,
2424
}
2525

26+
# HealthDCAT-AP fields that can contain language-tagged literals
27+
MULTILINGUAL_LITERAL_FIELDS = {
28+
"population_coverage": HEALTHDCATAP.populationCoverage,
29+
"publisher_note": HEALTHDCATAP.publisherNote,
30+
}
31+
2632

2733
class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
2834
"""
@@ -42,7 +48,11 @@ def parse_dataset(self, dataset_dict, dataset_ref):
4248
return dataset_dict
4349

4450
def _parse_health_fields(self, dataset_dict, dataset_ref):
45-
self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref)
51+
multilingual_fields = set(self._multilingual_dataset_fields())
52+
53+
self.__parse_healthdcat_stringvalues(
54+
dataset_dict, dataset_ref, multilingual_fields
55+
)
4656
self.__parse_healthdcat_booleanvalues(dataset_dict, dataset_ref)
4757
self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref)
4858

@@ -78,7 +88,9 @@ def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref):
7888
if value is not None:
7989
dataset_dict[key] = value
8090

81-
def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
91+
def __parse_healthdcat_stringvalues(
92+
self, dataset_dict, dataset_ref, multilingual_fields
93+
):
8294
for (key, predicate,) in (
8395
("analytics", HEALTHDCATAP.analytics),
8496
("code_values", HEALTHDCATAP.hasCodeValues),
@@ -92,9 +104,18 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
92104
("publisher_type", HEALTHDCATAP.publisherType),
93105
("purpose", DPV.hasPurpose),
94106
):
95-
values = self._object_value_list(dataset_ref, predicate)
96-
if values:
97-
dataset_dict[key] = values
107+
if (
108+
key in MULTILINGUAL_LITERAL_FIELDS
109+
and key in multilingual_fields
110+
):
111+
value = self._object_value(
112+
dataset_ref, predicate, multilingual=True
113+
)
114+
else:
115+
value = self._object_value_list(dataset_ref, predicate)
116+
117+
if value:
118+
dataset_dict[key] = value
98119

99120
def __parse_healthdcat_booleanvalues(self, dataset_dict, dataset_ref):
100121
for key, predicate in (
@@ -169,25 +190,45 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
169190
self.g.bind(prefix, namespace)
170191

171192
# key, predicate, fallbacks, _type, _class
172-
items = [
193+
list_items = [
173194
("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral),
174195
("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral),
175196
("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral),
176197
("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
177198
("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
178199
("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral),
179-
(
180-
"population_coverage",
181-
HEALTHDCATAP.populationCoverage,
182-
None,
183-
URIRefOrLiteral,
184-
),
185200
("personal_data", DPV.hasPersonalData, None, URIRef),
186-
("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral),
187201
("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral),
188202
("purpose", DPV.hasPurpose, None, URIRefOrLiteral),
189203
]
190-
self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
204+
self._add_list_triples_from_dict(dataset_dict, dataset_ref, list_items)
205+
206+
multilingual_fields = set(self._multilingual_dataset_fields())
207+
for key, predicate in MULTILINGUAL_LITERAL_FIELDS.items():
208+
value = self._get_dataset_value(dataset_dict, key)
209+
if not value:
210+
continue
211+
212+
if key in multilingual_fields and isinstance(value, dict):
213+
for lang, translated_value in value.items():
214+
if translated_value:
215+
self.g.add(
216+
(
217+
dataset_ref,
218+
predicate,
219+
Literal(translated_value, lang=lang),
220+
)
221+
)
222+
continue
223+
224+
self._add_triple_from_dict(
225+
dataset_dict,
226+
dataset_ref,
227+
predicate,
228+
key,
229+
list_value=True,
230+
_type=URIRefOrLiteral,
231+
)
191232

192233
if "trusted_data_holder" in dataset_dict:
193234
self.g.add(

0 commit comments

Comments
 (0)