diff --git a/pyproject.toml b/pyproject.toml index f45a25c..0ca2857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ requires-python = ">=3.9" dependencies = [ "attrs >=24.1", "bidsschematools @ git+https://github.com/bids-standard/bids-specification.git@refs/pull/2133/head#subdirectory=tools/schemacode", + "orjson>=3.11.3", "universal_pathlib >=0.2", ] diff --git a/src/bids_validator/__main__.py b/src/bids_validator/__main__.py index bbea18e..f2bd949 100644 --- a/src/bids_validator/__main__.py +++ b/src/bids_validator/__main__.py @@ -8,15 +8,25 @@ raise SystemExit(1) from None import sys -from typing import Annotated +from collections.abc import Iterator +from typing import Annotated, Optional + +from bidsschematools.schema import load_schema +from bidsschematools.types import Namespace +from bidsschematools.types.context import Subject from bids_validator import BIDSValidator +from bids_validator.context import Context, Dataset, Sessions from bids_validator.types.files import FileTree app = typer.Typer() -def walk(tree: FileTree): +def is_subject_dir(tree: FileTree) -> bool: + return tree.name.startswith('sub-') + + +def walk(tree: FileTree, dataset: Dataset, subject: Optional[Subject] = None) -> Iterator[Context]: """Iterate over children of a FileTree and check if they are a directory or file.
If it's a directory then run again recursively, if it's a file file check the file name is @@ -26,34 +36,39 @@ def walk(tree: FileTree): ---------- tree : FileTree FileTree object to iterate over + dataset: Dataset + Object containing properties for entire dataset + subject: Subject + object containing subject and session info """ + if subject is None and is_subject_dir(tree): + subject = Subject(Sessions(tree)) + for child in tree.children.values(): if child.is_dir: - yield from walk(child) + yield from walk(child, dataset, subject) else: - yield child + yield Context(child, dataset, subject) -def validate(tree: FileTree): +def validate(tree: FileTree, schema: Namespace): """Check if the file path is BIDS compliant. Parameters ---------- tree : FileTree Full FileTree object to iterate over and check + schema : Namespace + Schema object to validate dataset against """ validator = BIDSValidator() + dataset = Dataset(tree, schema) - for file in walk(tree): - # The output of the FileTree.relative_path method always drops the initial for the path - # which makes it fail the validator.is_bids check. THis may be a Windows specific thing. - # This line adds it back. 
- path = f'/{file.relative_path}' - - if not validator.is_bids(path): - print(f'{path} is not a valid bids filename') + for file in walk(tree, dataset): + if not validator.is_bids(file.path): + print(f'{file.path} is not a valid bids filename') def show_version(): @@ -85,6 +100,7 @@ def version_callback(value: bool): @app.command() def main( bids_path: str, + schema_path: Optional[str] = None, verbose: Annotated[bool, typer.Option('--verbose', '-v', help='Show verbose output')] = False, version: Annotated[ bool, @@ -101,7 +117,9 @@ def main( root_path = FileTree.read_from_filesystem(bids_path) - validate(root_path) + schema = load_schema(schema_path) + + validate(root_path, schema) if __name__ == '__main__': diff --git a/src/bids_validator/context.py b/src/bids_validator/context.py index 2427a59..9baa58f 100644 --- a/src/bids_validator/context.py +++ b/src/bids_validator/context.py @@ -3,10 +3,10 @@ from __future__ import annotations import itertools -import json from functools import cache import attrs +import orjson from bidsschematools.types import Namespace from bidsschematools.types import context as ctx from upath import UPath @@ -46,10 +46,22 @@ class ValidationError(Exception): """TODO: Add issue structure.""" +_DATATYPE_MAP = {} + + +def datatype_to_modality(datatype: str, schema: Namespace) -> str: + """Return the modality for a datatype, building the global lookup map on first use.""" + global _DATATYPE_MAP + if not _DATATYPE_MAP: + for mod_name, mod_dtypes in schema.rules.modalities.items(): + _DATATYPE_MAP |= dict.fromkeys(mod_dtypes['datatypes'], mod_name) + return _DATATYPE_MAP[datatype] + + @cache def load_tsv(file: FileTree, *, max_rows=0) -> Namespace: """Load TSV contents into a Namespace.""" - with open(file) as fobj: + with file.path_obj.open() as fobj: if max_rows > 0: fobj = itertools.islice(fobj, max_rows) contents = (line.rstrip('\r\n').split('\t') for line in fobj) @@ -60,8 +72,7 @@ def load_tsv(file: FileTree, *, max_rows=0) -> Namespace: @cache def load_json(file:
FileTree) -> dict[str]: """Load JSON file contents.""" - with open(file) as fobj: - return json.load(fobj) + return orjson.loads(file.path_obj.read_bytes()) class Subjects: @@ -130,14 +141,7 @@ def dataset_description(self) -> Namespace: @cached_property def modalities(self) -> list[str]: """List of modalities found in the dataset.""" - result = set() - - modalities = self.schema.rules.modalities - for datatype in self.datatypes: - for mod_name, mod_dtypes in modalities.items(): - if datatype in mod_dtypes.datatypes: - result.add(mod_name) - + result = {datatype_to_modality(datatype, self.schema) for datatype in self.datatypes} return list(result) @cached_property @@ -202,6 +206,12 @@ def load_sidecar(file: FileTree) -> dict[str, t.Any]: # Uses walk back algorithm # https://bids-validator.readthedocs.io/en/latest/validation-model/inheritance-principle.html # Accumulates all sidecars + metadata = {} + + for sidecar_file in walk_back(file, inherit=True): + metadata = load_json(sidecar_file) | metadata + + return metadata def walk_back( @@ -298,3 +308,133 @@ def from_file(cls, file: FileTree, schema: Namespace | None = None) -> t.Self: suffix=suffix, extension=extension, ) + + +@attrs.define +class Context: + """A context object that creates context for file on access.""" + + file: FileTree + dataset: Dataset + subject: ctx.Subject | None + file_parts: FileParts = attrs.field(init=False) + + def __attrs_post_init__(self): + self.file_parts = FileParts.from_file(self.file, self.schema) + + @property + def schema(self) -> Namespace: + """The BIDS specification schema.""" + return self.dataset.schema + + @property + def path(self) -> str: + """Path of the current file.""" + return self.file_parts.path + + @property + def entities(self) -> dict[str, str] | None: + """Entities parsed from the current filename.""" + return self.file_parts.entities + + @property + def datatype(self) -> str | None: + """Datatype of current file, for example, anat.""" + return self.file_parts.datatype +
@property + def suffix(self) -> str | None: + """Suffix of current file.""" + return self.file_parts.suffix + + @property + def extension(self) -> str | None: + """Extension of current file including initial dot.""" + return self.file_parts.extension + + @property + def modality(self) -> str | None: + """Modality of current file, for example, MRI.""" + return datatype_to_modality(self.datatype, self.schema) if self.datatype else None + + @property + def size(self) -> int: + """Length of the current file in bytes.""" + return self.file.path_obj.stat().st_size + + @property + def associations(self) -> ctx.Associations: + """Associated files, indexed by suffix, selected according to the inheritance principle.""" + return ctx.Associations() + + @property + def columns(self) -> None: + """TSV columns, indexed by column header, values are arrays with column contents.""" + pass + + @property + def json(self) -> Namespace | None: + """Contents of the current JSON file.""" + if self.file_parts.extension == '.json': + return Namespace.build(load_json(self.file)) + + return None + + @property + def gzip(self) -> None: + """Parsed contents of gzip header.""" + pass + + @property + def nifti_header(self) -> None: + """Parsed contents of NIfTI header referenced elsewhere in schema.""" + pass + + @property + def ome(self) -> None: + """Parsed contents of OME-XML header, which may be found in OME-TIFF or OME-ZARR files.""" + pass + + @property + def tiff(self) -> None: + """TIFF file format metadata.""" + pass + + @property + def sidecar(self) -> Namespace | None: + """Sidecar metadata constructed via the inheritance principle.""" + sidecar = load_sidecar(self.file) or {} + + return Namespace.build(sidecar) + + +class Sessions: + """Collections of sessions in subject.""" + + def __init__(self, tree: FileTree): + self._tree = tree + + @cached_property + def ses_dirs(self) -> list[str]: + """Sessions as determined by ses-* directories.""" + return [ + child.name + for child in
self._tree.children.values() + if child.is_dir and child.name.startswith('ses-') + ] + + @property + def session_id(self) -> list[str] | None: + """The session_id column of *_sessions.tsv.""" + for name, value in self._tree.children.items(): + if name.endswith('_sessions.tsv'): + return self._get_session_id(value) + else: + return None + + @staticmethod + def _get_session_id(phenotype_file: FileTree) -> list[str] | None: + columns = load_tsv(phenotype_file) + if 'session_id' not in columns: + return None + return list(columns['session_id']) diff --git a/src/bids_validator/types/files.py b/src/bids_validator/types/files.py index a3a6f62..40db627 100644 --- a/src/bids_validator/types/files.py +++ b/src/bids_validator/types/files.py @@ -4,103 +4,50 @@ import os import posixpath -import stat from functools import cached_property from pathlib import Path import attrs +from upath import UPath from . import _typings as t __all__ = ('FileTree',) -@attrs.define -class UserDirEntry: - """Partial reimplementation of :class:`os.DirEntry`. - - :class:`os.DirEntry` can't be instantiated from Python, but this can. 
- """ - - path: str = attrs.field(repr=False, converter=os.fspath) - name: str = attrs.field(init=False) - _stat: os.stat_result = attrs.field(init=False, repr=False, default=None) - _lstat: os.stat_result = attrs.field(init=False, repr=False, default=None) - - def __attrs_post_init__(self) -> None: - self.name = os.path.basename(self.path) - - def __fspath__(self) -> str: - return self.path - - def stat(self, *, follow_symlinks: bool = True) -> os.stat_result: - """Return stat_result object for the entry; cached per entry.""" - if follow_symlinks: - if self._stat is None: - self._stat = os.stat(self.path, follow_symlinks=True) - return self._stat - else: - if self._lstat is None: - self._lstat = os.stat(self.path, follow_symlinks=False) - return self._lstat - - def is_dir(self, *, follow_symlinks: bool = True) -> bool: - """Return True if the entry is a directory; cached per entry.""" - _stat = self.stat(follow_symlinks=follow_symlinks) - return stat.S_ISDIR(_stat.st_mode) - - def is_file(self, *, follow_symlinks: bool = True) -> bool: - """Return True if the entry is a file; cached per entry.""" - _stat = self.stat(follow_symlinks=follow_symlinks) - return stat.S_ISREG(_stat.st_mode) - - def is_symlink(self) -> bool: - """Return True if the entry is a symlink; cached per entry.""" - _stat = self.stat(follow_symlinks=False) - return stat.S_ISLNK(_stat.st_mode) - - -def as_direntry(obj: os.PathLike) -> os.DirEntry | UserDirEntry: - """Convert PathLike into DirEntry-like object.""" - if isinstance(obj, os.DirEntry): - return obj - return UserDirEntry(obj) - - -@attrs.define +@attrs.define(frozen=True) class FileTree: """Represent a FileTree with cached metadata.""" - direntry: os.DirEntry | UserDirEntry = attrs.field(repr=False, converter=as_direntry) - parent: FileTree | None = attrs.field(repr=False, default=None) - is_dir: bool = attrs.field(default=False) - children: dict[str, FileTree] = attrs.field(repr=False, factory=dict) - name: str = 
attrs.field(init=False) + path_obj: UPath = attrs.field(repr=False, converter=UPath) + is_dir: bool = attrs.field(repr=False, default=None) + parent: FileTree | None = attrs.field(repr=False, default=None, eq=False) + children: dict[str, FileTree] = attrs.field(repr=False, factory=dict, eq=False) def __attrs_post_init__(self): - self.name = self.direntry.name - self.children = { - name: attrs.evolve(child, parent=self) for name, child in self.children.items() - } + if self.is_dir is None: + object.__setattr__(self, 'is_dir', self.path_obj.is_dir()) + object.__setattr__( + self, + 'children', + {name: attrs.evolve(child, parent=self) for name, child in self.children.items()}, + ) @classmethod - def read_from_filesystem( - cls, - direntry: os.PathLike, - parent: FileTree | None = None, - ) -> t.Self: - """Read a FileTree from the filesystem. - - Uses :func:`os.scandir` to walk the directory tree. - """ - self = cls(direntry, parent=parent) - if self.direntry.is_dir(): - self.is_dir = True - self.children = { - entry.name: FileTree.read_from_filesystem(entry, parent=self) - for entry in os.scandir(self.direntry) + def read_from_filesystem(cls, path_obj: str | os.PathLike) -> t.Self: + """Read a FileTree from the filesystem.""" + path_obj = UPath(path_obj) + children = {} + if is_dir := path_obj.is_dir(): + children = { + entry.name: FileTree.read_from_filesystem(entry) for entry in path_obj.iterdir() } - return self + return cls(path_obj, is_dir=is_dir, children=children) + + @property + def name(self) -> str: + """The name of the current FileTree node.""" + return self.path_obj.name def __contains__(self, relpath: os.PathLike) -> bool: parts = Path(relpath).parts @@ -110,10 +57,7 @@ def __contains__(self, relpath: os.PathLike) -> bool: return child and (len(parts) == 1 or posixpath.join(*parts[1:]) in child) def __fspath__(self): - return self.direntry.path - - def __hash__(self): - return hash(self.direntry.path) + return self.path_obj.__fspath__() def __truediv__(self,
relpath: str | os.PathLike) -> t.Self: parts = Path(relpath).parts diff --git a/tests/test_context.py b/tests/test_context.py index 234f393..6dcb040 100644 --- a/tests/test_context.py +++ b/tests/test_context.py @@ -1,4 +1,8 @@ +import json + +import fsspec import pytest +from bidsschematools.types.context import Subject from bids_validator import context from bids_validator.types.files import FileTree @@ -9,6 +13,14 @@ def synthetic_dataset(examples): return FileTree.read_from_filesystem(examples / 'synthetic') +@pytest.fixture +def memfs(): + mem = fsspec.filesystem('memory') + mem.store.clear() + yield mem + mem.store.clear() + + def test_load(synthetic_dataset, schema): ds = context.Dataset(synthetic_dataset, schema) @@ -52,3 +64,99 @@ def test_walkback(synthetic_dataset, schema): sidecars = list(context.walk_back(bold, inherit=True)) assert len(sidecars) == 1 assert sidecars[0] is synthetic_dataset / 'task-nback_bold.json' + + +def test_context(synthetic_dataset, schema): + sub01 = synthetic_dataset / 'sub-01' + T1w = sub01 / 'ses-01' / 'anat' / 'sub-01_ses-01_T1w.nii' + bold = sub01 / 'ses-01' / 'func' / 'sub-01_ses-01_task-nback_run-01_bold.nii' + + subject = Subject(context.Sessions(sub01)) + ds = context.Dataset(synthetic_dataset, schema) + T1w_context = context.Context(T1w, ds, subject) + + assert T1w_context.schema is schema + assert T1w_context.dataset is ds + assert T1w_context.entities == {'sub': '01', 'ses': '01'} + assert T1w_context.path == '/sub-01/ses-01/anat/sub-01_ses-01_T1w.nii' + assert T1w_context.datatype == 'anat' + assert T1w_context.suffix == 'T1w' + assert T1w_context.extension == '.nii' + assert T1w_context.modality == 'mri' + assert T1w_context.size == 352 + assert isinstance(T1w_context.subject.sessions, context.Sessions) + assert sorted(T1w_context.subject.sessions.ses_dirs) == ['ses-01', 'ses-02'] + assert sorted(T1w_context.subject.sessions.session_id) == ['ses-01', 'ses-02'] + assert T1w_context.sidecar == {} + assert 
T1w_context.json is None + + bold_context = context.Context(bold, ds, subject) + + assert bold_context.sidecar.to_dict() == {'TaskName': 'N-Back', 'RepetitionTime': 2.5} + assert bold_context.json is None + + ## Tests for: + # associations + # columns + # gzip + # nifti_header + # ome + # tiff + + +def test_context_json(examples, schema): + dataset = FileTree.read_from_filesystem(examples / 'qmri_vfa') + file = dataset / 'sub-01' / 'anat' / 'sub-01_flip-1_VFA.json' + + ds = context.Dataset(dataset, schema) + file_context = context.Context(file, ds, subject=None) + + assert file_context.json.to_dict() == {'FlipAngle': 3, 'RepetitionTimeExcitation': 0.0150} + + +def test_sidecar_inheritance(examples): + """Test to ensure inheritance principle is executed correctly""" + dataset = FileTree.read_from_filesystem(examples / 'qmri_mp2rage') + file = dataset / 'sub-1' / 'anat' / 'sub-1_inv-2_part-mag_MP2RAGE.nii' + + sidecar = context.load_sidecar(file) + + assert sidecar['FlipAngle'] == 7 + assert sidecar['InversionTime'] == 2.7 + assert sidecar['RepetitionTimePreparation'] == 5.5 + + +def test_sidecar_order(memfs): + """Test that sidecars nearer the file override keys from sidecars higher in the tree""" + root_json = {'rootOverwriteA': 'root', 'rootOverwriteB': 'root', 'rootValue': 'root'} + subject_json = {'rootOverwriteA': 'subject', 'subOverwrite': 'subject', 'subValue': 'subject'} + anat_json = {'rootOverwriteB': 'anat', 'subOverwrite': 'anat', 'anatValue': 'anat'} + memfs.pipe( + { + '/T1w.json': json.dumps(root_json).encode(), + '/sub-01/sub-01_T1w.json': json.dumps(subject_json).encode(), + '/sub-01/anat/sub-01_T1w.json': json.dumps(anat_json).encode(), + '/sub-01/anat/sub-01_T1w.nii': b'', + } + ) + + dataset = FileTree.read_from_filesystem('memory://') + file = dataset / 'sub-01' / 'anat' / 'sub-01_T1w.nii' + sidecar = context.load_sidecar(file) + assert sidecar == { + 'rootValue': 'root', + 'subValue': 'subject', + 'rootOverwriteA': 'subject', + 'anatValue': 'anat', +
'rootOverwriteB': 'anat', + 'subOverwrite': 'anat', + } + + +def test_sessions(synthetic_dataset): + sub01 = synthetic_dataset / 'sub-01' + + sessions = context.Sessions(sub01) + + assert sorted(sessions.ses_dirs) == ['ses-01', 'ses-02'] + assert sorted(sessions.session_id) == ['ses-01', 'ses-02']