Skip to content

Commit 7ac51fb

Browse files
committed
Refactor ReadTarFS to support internal symlinks
1 parent a617523 commit 7ac51fb

File tree

2 files changed

+251
-54
lines changed

2 files changed

+251
-54
lines changed

fs/tarfs.py

Lines changed: 128 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
from __future__ import print_function
55
from __future__ import unicode_literals
66

7+
import operator
78
import os
89
import tarfile
910
import typing
1011
from collections import OrderedDict
1112
from typing import cast, IO
1213

1314
import six
15+
from six.moves import map
1416

1517
from . import errors
1618
from .base import FS
@@ -22,7 +24,18 @@
2224
from .opener import open_fs
2325
from .permissions import Permissions
2426
from ._url_tools import url_quote
25-
from .path import relpath, basename, isbase, normpath, parts, frombase
27+
from .path import (
28+
dirname,
29+
join,
30+
relpath,
31+
basename,
32+
isbase,
33+
normpath,
34+
parts,
35+
frombase,
36+
recursepath,
37+
relativefrom,
38+
)
2639
from .wrapfs import WrapFS
2740

2841
if typing.TYPE_CHECKING:
@@ -255,6 +268,8 @@ class ReadTarFS(FS):
255268
tarfile.SYMTYPE: ResourceType.symlink,
256269
tarfile.CONTTYPE: ResourceType.file,
257270
tarfile.LNKTYPE: ResourceType.symlink,
271+
# this is how we mark implicit directories
272+
tarfile.DIRTYPE + b"i": ResourceType.directory,
258273
}
259274

260275
@errors.CreateFailed.catch_all
@@ -275,24 +290,66 @@ def _directory_entries(self):
275290
"""Lazy directory cache."""
276291
if self._directory_cache is None:
277292
_decode = self._decode
293+
_encode = self._encode
294+
295+
# collect all directory entries and remove slashes
278296
_directory_entries = (
279297
(_decode(info.name).strip("/"), info) for info in self._tar
280298
)
281299

282-
def _list_tar():
283-
for name, info in _directory_entries:
284-
try:
285-
_name = normpath(name)
286-
except IllegalBackReference:
287-
# Back references outside root, must be up to no good.
288-
pass
289-
else:
290-
if _name:
291-
yield _name, info
292-
293-
self._directory_cache = OrderedDict(_list_tar())
300+
# build the cache first before updating it to reduce chances
301+
# of data races
302+
_cache = OrderedDict()
303+
for name, info in _directory_entries:
304+
# check for any invalid back references
305+
try:
306+
_name = normpath(name)
307+
except IllegalBackReference:
308+
continue
309+
310+
# add all implicit dirnames if not in the cache already
311+
for partial_name in map(relpath, recursepath(_name)):
312+
dirinfo = tarfile.TarInfo(self._encode(partial_name))
313+
dirinfo.type = tarfile.DIRTYPE
314+
_cache.setdefault(partial_name, dirinfo)
315+
316+
# add the entry itself, potentially overwriting implicit entries
317+
_cache[_name] = info
318+
319+
self._directory_cache = _cache
294320
return self._directory_cache
295321

322+
def _follow_symlink(self, entry):
    """Follow a symlink `TarInfo` to find a concrete entry.

    Arguments:
        entry (tarfile.TarInfo): the archive member to start from. It is
            returned unchanged when it is not a symlink.

    Returns:
        tarfile.TarInfo: the first member reached that is not a symlink.

    Raises:
        fs.errors.ResourceNotFound: when a link target cannot be resolved
            to an entry of the directory cache, or when the chain of
            symlinks loops back onto a target that was already visited.

    """
    _entry = entry
    # Cache keys already visited while following this chain: coming back
    # to one of them means the links are cyclic (e.g. ``a -> b`` together
    # with ``b -> a``), which would otherwise keep this loop spinning.
    visited = set()
    while _entry.issym():
        # the link target is interpreted relative to the directory that
        # contains the link itself
        linkname = normpath(
            join(dirname(self._decode(_entry.name)), self._decode(_entry.linkname))
        )
        resolved = self._resolve(linkname)
        if resolved is None or resolved in visited:
            raise errors.ResourceNotFound(linkname)
        visited.add(resolved)
        _entry = self._directory_entries[resolved]

    return _entry
335+
336+
def _resolve(self, path):
    """Replace path components that are symlinks with concrete components.

    Arguments:
        path (str): the path to resolve; the callers visible here pass a
            path already processed by `relpath` / `normpath`.

    Returns:
        str or None: `path` itself when it is empty or is a key of the
        directory cache; otherwise the result of resolving the path
        obtained by following the first symlink found among its
        prefixes; `None` when no prefix is a symlink or when that
        symlink cannot be followed to an existing entry.

    """
    entries = self._directory_entries

    # the root ("") and paths stored verbatim in the cache are already
    # concrete
    if path in entries or not path:
        return path

    # inspect the prefixes of the path in the reverse of the order given
    # by `recursepath`, looking for one that is a symlink
    for prefix in map(relpath, reversed(recursepath(path))):
        suffix = relativefrom(prefix, path)
        entry = entries.get(prefix)
        if entry is not None and entry.issym():
            try:
                entry = self._follow_symlink(entry)
            except errors.ResourceNotFound:
                # a dangling link in the middle of the path means the
                # path does not exist: report it the same way as any
                # other unresolvable path instead of raising
                return None
            # substitute the link with its concrete target and retry
            return self._resolve(join(self._decode(entry.name), suffix))

    return None
352+
296353
def __repr__(self):
297354
# type: () -> Text
298355
return "ReadTarFS({!r})".format(self._file)
@@ -327,31 +384,35 @@ def getinfo(self, path, namespaces=None):
327384
namespaces = namespaces or ()
328385
raw_info = {} # type: Dict[Text, Dict[Text, object]]
329386

387+
# special case for root
330388
if not _path:
331389
raw_info["basic"] = {"name": "", "is_dir": True}
332390
if "details" in namespaces:
333391
raw_info["details"] = {"type": int(ResourceType.directory)}
334392

335393
else:
336-
try:
337-
implicit = False
338-
member = self._directory_entries[_path]
339-
except KeyError:
340-
if not self.isdir(_path):
341-
raise errors.ResourceNotFound(path)
342-
implicit = True
343-
member = tarfile.TarInfo(_path)
344-
member.type = tarfile.DIRTYPE
394+
395+
_realpath = self._resolve(_path)
396+
if _realpath is None:
397+
raise errors.ResourceNotFound(path)
398+
399+
implicit = False
400+
member = self._directory_entries[_realpath]
345401

346402
raw_info["basic"] = {
347403
"name": basename(self._decode(member.name)),
348-
"is_dir": member.isdir(),
404+
"is_dir": self.isdir(_path), # is_dir should follow symlinks
349405
}
350406

351407
if "link" in namespaces:
352-
raw_info["link"] = {
353-
"target": self._decode(member.linkname) if member.issym() else None
354-
}
408+
if member.issym():
409+
target = join(
410+
dirname(self._decode(member.name)),
411+
self._decode(member.linkname),
412+
)
413+
else:
414+
target = None
415+
raw_info["link"] = {"target": target}
355416
if "details" in namespaces:
356417
raw_info["details"] = {
357418
"size": member.size,
@@ -381,23 +442,29 @@ def getinfo(self, path, namespaces=None):
381442

382443
def isdir(self, path):
    """Check if a path maps to a directory, following symlinks.

    Arguments:
        path (str): a path on the filesystem.

    Returns:
        bool: `True` if the path resolves to an entry that, once any
        symlink has been followed, is a directory. `False` if the path
        does not resolve, resolves to something else, or goes through
        a symlink that cannot be followed.

    """
    _path = relpath(self.validatepath(path))
    try:
        realpath = self._resolve(_path)
        if realpath is None:
            return False
        target = self._follow_symlink(self._directory_entries[realpath])
    except errors.ResourceNotFound:
        # `_follow_symlink` raises on a link whose target is missing;
        # a predicate should answer "no" rather than propagate that
        return False
    return target.isdir()
388451

389452
def isfile(self, path):
    """Check if a path maps to a file, following symlinks.

    Arguments:
        path (str): a path on the filesystem.

    Returns:
        bool: `True` if the path resolves to an entry that, once any
        symlink has been followed, is a file. `False` if the path does
        not resolve, resolves to something else, or goes through a
        symlink that cannot be followed.

    """
    _path = relpath(self.validatepath(path))
    try:
        realpath = self._resolve(_path)
        if realpath is None:
            return False
        target = self._follow_symlink(self._directory_entries[realpath])
    except errors.ResourceNotFound:
        # `_follow_symlink` raises on a link whose target is missing;
        # a predicate should answer "no" rather than propagate that
        return False
    return target.isfile()
395460

396461
def islink(self, path):
    """Check if a path maps to a symlink.

    Symlinks found among the parent components are followed, but the
    final entry itself is not, so that a link is reported as a link.

    Arguments:
        path (str): a path on the filesystem.

    Returns:
        bool: `True` if the path resolves to an entry that is a
        symlink, `False` if it does not resolve or is not a symlink.

    """
    _path = relpath(self.validatepath(path))
    try:
        realpath = self._resolve(_path)
    except errors.ResourceNotFound:
        # resolving can go through `_follow_symlink`, which raises when
        # a parent component is a link to a missing target
        return False
    if realpath is None:
        return False
    return self._directory_entries[realpath].issym()
402469

403470
def setinfo(self, path, info):
@@ -409,13 +476,28 @@ def listdir(self, path):
409476
# type: (Text) -> List[Text]
410477
_path = relpath(self.validatepath(path))
411478

412-
if not self.gettype(path) is ResourceType.directory:
413-
raise errors.DirectoryExpected(path)
479+
# check the given path exists
480+
realpath = self._resolve(_path)
481+
if realpath is None:
482+
raise errors.ResourceNotFound(path)
483+
elif realpath:
484+
target = self._follow_symlink(self._directory_entries[realpath])
485+
# check the path is either a symlink mapping to a directory or a directory
486+
if target.isdir():
487+
base = target.name
488+
elif target.issym():
489+
base = target.linkname
490+
else:
491+
raise errors.DirectoryExpected(path)
492+
else:
493+
base = ""
414494

495+
# find all entries in the actual directory
415496
children = (
416-
frombase(_path, n) for n in self._directory_entries if isbase(_path, n)
497+
frombase(base, n) for n in self._directory_entries if isbase(base, n)
417498
)
418499
content = (parts(child)[1] for child in children if relpath(child))
500+
419501
return list(OrderedDict.fromkeys(content))
420502

421503
def makedir(
@@ -432,17 +514,18 @@ def openbin(self, path, mode="r", buffering=-1, **options):
432514
# type: (Text, Text, int, **Any) -> BinaryIO
433515
_path = relpath(self.validatepath(path))
434516

517+
# check the requested mode is only a reading mode
435518
if "w" in mode or "+" in mode or "a" in mode:
436519
raise errors.ResourceReadOnly(path)
437520

438-
try:
439-
member = self._directory_entries[_path]
440-
except KeyError:
441-
six.raise_from(errors.ResourceNotFound(path), None)
521+
# check the path actually resolves after following symlinks
522+
_realpath = self._resolve(_path)
523+
if _realpath is None:
524+
raise errors.ResourceNotFound(path)
442525

443-
# TarFile.extractfile returns None if the entry is
526+
# TarFile.extractfile returns None if the entry is
444527
# neither a file nor a symlink
445-
reader = self._tar.extractfile(member)
528+
reader = self._tar.extractfile(self._directory_entries[_realpath])
446529
if reader is None:
447530
raise errors.FileExpected(path)
448531

0 commit comments

Comments
 (0)