Skip to content

Commit ef59b04

Browse files
committed
Fix caching of "source-less" SPARQL constructs
and improve logging to state explicitly what is about to be cached and what is being read from the cache.
1 parent 34a7855 commit ef59b04

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

lxltools/datacompiler.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,9 @@ def is_cachable(self, ref):
274274

275275
def get_cached_path(self, url):
276276
fpath = self.cachedir / quote(url.replace(CACHE_SPAQRL_BASE, 'sparql/'), safe="")
277-
print(url, fpath)
278277
return fpath
279278

279+
# TODO: now unused; either remove or use inside of cached_rdf (for more control)
280280
def cache_url(self, url, maxcache=MAX_CACHE):
281281
path = self.get_cached_path(url)
282282
mtime = path.stat().st_mtime if path.exists() else None
@@ -317,7 +317,9 @@ def cached_rdf(self, fpath, construct=None, graph=None):
317317
print("No cache directory configured", file=sys.stderr)
318318
elif construct:
319319
fpath = self.get_cached_path(fpath + '.ttl')
320-
if not fpath.is_file():
320+
321+
if not (fpath.is_file() and fpath.stat().st_size > 0):
322+
print(f'Caching result of {construct} as {fpath}', file=sys.stderr)
321323
with self.path(construct).open() as fp:
322324
try:
323325
res = (graph or Graph()).query(fp.read())
@@ -327,14 +329,17 @@ def cached_rdf(self, fpath, construct=None, graph=None):
327329
except Exception as e:
328330
print(f'Failed to cache {fpath}: {e}', file=sys.stderr)
329331
else:
332+
print(f'Using cached {fpath} as result of {construct}', file=sys.stderr)
330333
source.parse(str(fpath), format='turtle')
334+
331335
return source
332336

333337
elif self.is_cachable(fpath):
334338
remotepath = fpath
335339
fpath = self.get_cached_path(fpath + '.ttl')
336-
print(f'Using cached {fpath} for {remotepath}', file=sys.stderr)
337-
if not fpath.is_file():
340+
341+
if not (fpath.is_file() and fpath.stat().st_size > 0):
342+
print(f'Caching {remotepath} as {fpath}', file=sys.stderr)
338343
fpath.parent.mkdir(parents=True, exist_ok=True)
339344
try:
340345
# At least rdaregistry is *very* picky about what is asked for,
@@ -347,6 +352,7 @@ def cached_rdf(self, fpath, construct=None, graph=None):
347352
source.serialize(str(fpath), format='turtle')
348353
return source
349354
else:
355+
print(f'Using cached {fpath} for {remotepath}', file=sys.stderr)
350356
return source.parse(str(fpath), format='turtle')
351357

352358
fmt = 'nt' if fpath.endswith('.nt') else None
@@ -414,7 +420,7 @@ def _digest_source_data(src):
414420
source['query'] = src['dataQuery']['uri']
415421
elif '@id' in src:
416422
assert 'source' not in source
417-
source['source'] = str(src['@id']) # TODO: bug in rdflib; URIRef in the json-ld
423+
source['source'] = str(src['@id']) # TODO: bug in rdflib; URIRef in the JSON-LD
418424
elif 'uri' in src:
419425
instruct = 'result' if 'sourceData' in src else 'source'
420426
assert instruct not in source
@@ -425,7 +431,7 @@ def _digest_source_data(src):
425431
if 'representationOf' in src:
426432
instruct = 'dataset'
427433
assert instruct not in source
428-
source[instruct] = src['representationOf']['@id']
434+
source[instruct] = str(src['representationOf']['@id']) # TODO: (same as above)
429435
unhandled = False
430436

431437
for part in _aslist(src.get('sourceData')):
@@ -497,7 +503,7 @@ def _construct(compiler, sources, query=None):
497503
if isinstance(sourcedfn, str):
498504
sourcedfn = {'source': sourcedfn}
499505

500-
source = sourcedfn.get('source', [])
506+
source = sourcedfn.get('source') or sourcedfn.get('dataset')
501507
graph = dataset.get_context(URIRef(sourcedfn.get('dataset') or source))
502508
if isinstance(source, (dict, list)):
503509
# TODO: was currently unused, and not yet supported in the data-driven form.

0 commit comments

Comments
 (0)