Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions scispacy/file_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,15 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""

Only the file extension from the original URL is preserved (not the
full trailing path component) to keep filenames short enough for
filesystems with a 143-byte NAME_MAX (e.g. eCryptfs).
See: https://github.com/allenai/scispacy/issues/539
"""
last_part = url.split("/")[-1]
_, ext = os.path.splitext(last_part)

url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
Expand All @@ -67,7 +73,7 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
etag_hash = sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()

filename += "." + last_part
filename += ext
return filename


Expand Down Expand Up @@ -106,6 +112,33 @@ def http_get(url: str, temp_file: IO) -> None:
pbar.close()


def _find_existing_cache_file(url: str, cache_dir: str) -> Optional[str]:
"""
Check if a cached file already exists for the given URL.
Since the filename includes the etag (which we may not have without a
network call), we look for any file matching the URL hash prefix.

Supports both old-format filenames (<url_hash>.<etag_hash>.<last_part>)
and new-format filenames (<url_hash>.<etag_hash>.<ext>).
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes).hexdigest()
last_part = url.split("/")[-1]
_, ext = os.path.splitext(last_part)

for filename in os.listdir(cache_dir):
if filename.endswith(".json") or not os.path.isfile(
os.path.join(cache_dir, filename)
):
continue
if not filename.startswith(url_hash):
continue
if filename.endswith("." + last_part) or filename.endswith(ext):
return os.path.join(cache_dir, filename)
return None



def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
"""
Given a URL, look for the corresponding dataset in the local cache.
Expand Down
20 changes: 20 additions & 0 deletions tests/test_file_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,23 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
assert back_to_url == url
assert etag == "mytag"

def test_url_to_filename_length_under_ecryptfs_limit(self):
    """Filenames (including .json sidecar) must stay under 143 bytes for eCryptfs.
    See: https://github.com/allenai/scispacy/issues/539
    """
    # A realistic 64-character hex ETag, quoted the way servers return it.
    long_etag = '"d41d8cd98f00b204e9800998ecf8427ed41d8cd98f00b204e9800998ecf8427e"'
    # These are the actual URLs used by scispacy linkers.
    linker_base = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/"
    linker_files = (
        "nmslib_index.bin",
        "tfidf_vectorizer.joblib",
        "tfidf_vectors_sparse.npz",
        "concept_aliases.json",
    )
    for linker_file in linker_files:
        cached_name = url_to_filename(linker_base + linker_file, etag=long_etag)
        meta_filename = cached_name + ".json"
        # The metadata sidecar is always the longest on-disk name, so checking
        # it covers the data file too.
        assert len(meta_filename) < 143, (
            f"Metadata filename too long for eCryptfs ({len(meta_filename)} >= 143): {meta_filename}"
        )