SpikeInterface
diff --git a/‎pyproject.toml
Lines changed: 0 additions & 1 deletion b/‎pyproject.toml
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/spikeinterface/extractors/cellexplorersortingextractor.py
Lines changed: 177 additions & 88 deletions b/‎src/spikeinterface/extractors/cellexplorersortingextractor.py
Lines changed: 177 additions & 88 deletions
@@ -62,7 +62,6 @@ extractors = [
     "pyedflib>=0.1.30",
     "sonpy;python_version<'3.10'",
     "lxml", # lxml for neuroscope
-    "hdf5storage", # hdf5storage and scipy for cellexplorer
     "scipy",
     # ONE-api and ibllib for streaming IBL
     "ONE-api>=1.19.1",
 
@@ -1,121 +1,209 @@
+from __future__ import annotations
+
 import numpy as np
 from pathlib import Path
-from typing import Union, Optional
+import warnings
+import datetime
 
 from ..core import BaseSorting, BaseSortingSegment
 from ..core.core_tools import define_function_from_class
 
 
-try:
-    import scipy.io
-    import hdf5storage
-
-    HAVE_SCIPY_AND_HDF5STORAGE = True
-except ImportError:
-    HAVE_SCIPY_AND_HDF5STORAGE = False
-
-
-PathType = Union[str, Path]
-OptionalPathType = Optional[PathType]
-
-
 class CellExplorerSortingExtractor(BaseSorting):
     """
-    Extracts spiking information from .mat files stored in the CellExplorer format.
-    Spike times are stored in units of seconds.
+    Extracts spiking information from a `.mat` file stored in the CellExplorer format.
+    Spike times are stored in units of seconds so we transform them to units of samples.
+
+    The file is first read with `scipy.io.loadmat`; when scipy raises `NotImplementedError`
+    the file is opened with `h5py` instead.
+
+    The newer version of the format is described here:
+    https://cellexplorer.org/data-structure/
+
+    Whereas the old format is described here:
+    https://github.com/buzsakilab/buzcode/wiki/Data-Formatting-Standards
 
     Parameters
     ----------
-    spikes_matfile_path : PathType
-        Path to the sorting_id.spikes.cellinfo.mat file.
+    file_path: str | Path
+        Path to `.mat` file containing spikes. Usually named `session_id.spikes.cellinfo.mat`
+    sampling_frequency: float | None, optional
+        The sampling frequency of the data. If None, it will be extracted from the files.
+    session_info_file_path: str | Path | None, optional
+        Path to the `sessionInfo.mat` file. If None, it will be inferred from the file_path.
+    spikes_matfile_path: str | Path | None, optional
+        Deprecated alias of `file_path`; it is only used when `file_path` is None.
+        Passing it emits a `DeprecationWarning`, or raises `ValueError` once the
+        deprecation period has expired.
+    session_info_matfile_path: str | Path | None, optional
+        Deprecated alias of `session_info_file_path`; it is only used when
+        `session_info_file_path` is None. Same warning / error policy as `spikes_matfile_path`.
     """
 
     extractor_name = "CellExplorerSortingExtractor"
-    installed = HAVE_SCIPY_AND_HDF5STORAGE
     is_writable = True
     mode = "file"
-    installation_mesg = (
-        "To use the CellExplorerSortingExtractor install scipy and hdf5storage: \n\n pip install scipy  hdf5storage"
-    )
+    installation_mesg = "To use the CellExplorerSortingExtractor install scipy and h5py"
 
     def __init__(
         self,
-        spikes_matfile_path: PathType,
-        session_info_matfile_path: OptionalPathType = None,
-        sampling_frequency: Optional[float] = None,
+        file_path: str | Path | None = None,
+        sampling_frequency: float | None = None,
+        session_info_file_path: str | Path | None = None,
+        spikes_matfile_path: str | Path | None = None,
+        session_info_matfile_path: str | Path | None = None,
     ):
-        assert self.installed, self.installation_mesg
+        """
+        Load unit ids and spike times from a CellExplorer `spikes.cellinfo.mat` file.
+
+        The file is read with scipy first and with h5py when scipy raises
+        `NotImplementedError`. Spike times (seconds) are converted to integer sample
+        frames using `sampling_frequency`, which is resolved in this order: the
+        explicit argument, the `sr` field of the spikes struct, the `sessionInfo.mat` file.
+
+        `spikes_matfile_path` and `session_info_matfile_path` are deprecated aliases of
+        `file_path` and `session_info_file_path`.
+
+        Raises
+        ------
+        ImportError
+            If scipy or h5py cannot be imported.
+        ValueError
+            If a deprecated argument is used after its deprecation deadline.
+        AssertionError
+            If no path is given, the file is missing, or the `spikes` struct is malformed.
+        """
+        try:
+            import h5py
+            import scipy.io
+        except ImportError as err:
+            raise ImportError(self.installation_mesg) from err
+
+        assert (
+            file_path is not None or spikes_matfile_path is not None
+        ), "Either file_path or spikes_matfile_path must be provided!"
+
+        if spikes_matfile_path is not None:
+            # Raise an error if the warning period has expired
+            deprecation_issued = datetime.datetime(2023, 4, 1)
+            deprecation_deadline = deprecation_issued + datetime.timedelta(days=180)
+            if datetime.datetime.now() > deprecation_deadline:
+                raise ValueError("The spikes_matfile_path argument is no longer supported in. Use file_path instead.")
+
+            # Otherwise, issue a DeprecationWarning
+            else:
+                warnings.warn(
+                    "The spikes_matfile_path argument is deprecated and will be removed in six months. "
+                    "Use file_path instead.",
+                    DeprecationWarning,
+                )
+            # The new argument wins when both are given
+            file_path = spikes_matfile_path if file_path is None else file_path
+
+        if session_info_matfile_path is not None:
+            # Raise an error if the warning period has expired
+            deprecation_issued = datetime.datetime(2023, 4, 1)
+            deprecation_deadline = deprecation_issued + datetime.timedelta(days=180)
+            if datetime.datetime.now() > deprecation_deadline:
+                raise ValueError(
+                    "The session_info_matfile_path argument is no longer supported in. Use session_info_file_path instead."
+                )
+
+            # Otherwise, issue a DeprecationWarning
+            else:
+                warnings.warn(
+                    "The session_info_matfile_path argument is deprecated and will be removed in six months. "
+                    "Use session_info_file_path instead.",
+                    DeprecationWarning,
+                )
+            # The new argument wins when both are given
+            session_info_file_path = (
+                session_info_matfile_path if session_info_file_path is None else session_info_file_path
+            )
 
-        spikes_matfile_path = Path(spikes_matfile_path)
-        assert spikes_matfile_path.is_file(), f"The spikes_matfile_path ({spikes_matfile_path}) must exist!"
+        self.spikes_cellinfo_path = Path(file_path).absolute()
+        # folder_path must exist before the assertion below: its message interpolates it, and
+        # evaluating the message with the attribute unset would raise AttributeError instead.
+        self.folder_path = self.spikes_cellinfo_path.parent
+        assert self.spikes_cellinfo_path.is_file(), f"The spikes.cellinfo.mat file must exist in {self.folder_path}!"
 
-        if sampling_frequency is None:
-            folder_path = spikes_matfile_path.parent
-            sorting_id = spikes_matfile_path.name.split(".")[0]
-            if session_info_matfile_path is None:
-                session_info_matfile_path = folder_path / f"{sorting_id}.sessionInfo.mat"
-            session_info_matfile_path = Path(session_info_matfile_path)
-            assert (session_info_matfile_path).is_file(), f"No {sorting_id}.sessionInfo.mat file found in the folder!"
-
-            try:
-                session_info_mat = scipy.io.loadmat(file_name=str(session_info_matfile_path))
-                self.read_session_info_with_scipy = True
-            except NotImplementedError:
-                session_info_mat = hdf5storage.loadmat(file_name=str(session_info_matfile_path))
-                self.read_session_info_with_scipy = False
-
-            assert session_info_mat["sessionInfo"]["rates"][0][0]["wideband"], (
-                "The sesssionInfo.mat file must contain "
-                "a 'sessionInfo' struct with field 'rates' containing field 'wideband' to extract the sampling frequency!"
-            )
-            if self.read_session_info_with_scipy:
-                sampling_frequency = float(
-                    session_info_mat["sessionInfo"]["rates"][0][0]["wideband"][0][0][0][0]
-                )  # careful not to confuse it with the lfpsamplingrate; reported in units Hz
-            else:
-                sampling_frequency = float(
-                    session_info_mat["sessionInfo"]["rates"][0][0]["wideband"][0][0]
-                )  # careful not to confuse it with the lfpsamplingrate; reported in units Hz
+        self.session_info_file_path = session_info_file_path
 
+        # The session id is the file name up to the first dot
+        self.session_id = self.spikes_cellinfo_path.stem.split(".")[0]
+
+        read_as_hdf5 = False
         try:
-            spikes_mat = scipy.io.loadmat(file_name=str(spikes_matfile_path))
-            self.read_spikes_info_with_scipy = True
+            matlab_file = scipy.io.loadmat(file_name=str(self.spikes_cellinfo_path), simplify_cells=True)
+            spikes_mat = matlab_file["spikes"]
+            assert isinstance(spikes_mat, dict), f"field `spikes` must be a dict, not {type(spikes_mat)}!"
+
         except NotImplementedError:
-            spikes_mat = hdf5storage.loadmat(file_name=str(spikes_matfile_path))
-            self.read_spikes_info_with_scipy = False
+            # scipy cannot read this file; fall back to reading it as HDF5
+            matlab_file = h5py.File(name=self.spikes_cellinfo_path, mode="r")
+            spikes_mat = matlab_file["spikes"]
+            assert isinstance(spikes_mat, h5py.Group), f"field `spikes` must be a Group, not {type(spikes_mat)}!"
+            read_as_hdf5 = True
 
-        assert np.all(
-            np.isin(["UID", "times"], spikes_mat["spikes"].dtype.names)
-        ), "The spikes.cellinfo.mat file must contain a 'spikes' struct with fields 'UID' and 'times'!"
+        if sampling_frequency is None:
+            # First try the new format of spikes.cellinfo.mat files where sampling rate is included in the file
+            sr_data = spikes_mat.get("sr", None)
+            if isinstance(sr_data, h5py.Dataset):
+                sampling_frequency = sr_data[()]
+            elif not read_as_hdf5 and sr_data is not None and np.size(sr_data) == 1:
+                # scipy path: with simplify_cells=True `sr` is already a plain numeric value
+                sampling_frequency = sr_data
+
+        if sampling_frequency is None:
+            sampling_frequency = self._retrieve_sampling_frequency_from_session_info()
+
+        # HDF5 scalars come back as arrays with singleton dimensions; squeeze before casting
+        sampling_frequency = float(np.asarray(sampling_frequency).squeeze())
+
+        unit_ids_available = "UID" in spikes_mat.keys()
+        assert unit_ids_available, f"The `spikes struct` must contain field 'UID'! fields: {spikes_mat.keys()}"
+
+        spike_times_available = "times" in spikes_mat.keys()
+        assert spike_times_available, f"The `spike struct` must contain field 'times'! fields: {spikes_mat.keys()}"
+
+        unit_ids = spikes_mat["UID"]
+        spike_times = spikes_mat["times"]
+
+        if read_as_hdf5:
+            assert isinstance(unit_ids, h5py.Dataset), f"`unit_ids` must be a Dataset, not {type(unit_ids)}!"
+            assert isinstance(spike_times, h5py.Dataset), f"`spike_times` must be a Dataset, not {type(spike_times)}!"
+
+            unit_ids = unit_ids[:].squeeze().astype("int")
+            references = (ref[0] for ref in spike_times[:])  # These are HDF5 references
+            spike_times_data = (matlab_file[ref] for ref in references if isinstance(matlab_file[ref], h5py.Dataset))
+            # Format as a list of numpy arrays
+            spike_times = [data[()].squeeze() for data in spike_times_data]
+            # Everything needed is in memory now, so release the file handle
+            matlab_file.close()
 
         # CellExplorer reports spike times in units seconds; SpikeExtractors uses time units of sampling frames
-        # Rounding is necessary to prevent data loss from int-casting floating point errors
-        if self.read_spikes_info_with_scipy:
-            unit_ids = np.asarray(spikes_mat["spikes"]["UID"][0][0][0])
-            spiketrains = [
-                (np.array([y[0] for y in x]) * sampling_frequency).round().astype(np.int64)
-                for x in spikes_mat["spikes"]["times"][0][0][0]
-            ]
-        else:
-            unit_ids = np.asarray(spikes_mat["spikes"]["UID"][0][0])
-            spiketrains = [
-                (np.array([y[0] for y in x]) * sampling_frequency).round().astype(np.int64)
-                for x in spikes_mat["spikes"]["times"][0][0]
-            ]
+        # atleast_1d: squeezing a single unit id yields a 0-d array that cannot be sliced
+        unit_ids = np.atleast_1d(unit_ids).tolist()
+        # A mismatch means entries were dropped or the struct is malformed; pairing by position
+        # would then silently attach spike trains to the wrong units.
+        assert len(spike_times) == len(
+            unit_ids
+        ), f"Found {len(spike_times)} spike trains for {len(unit_ids)} unit ids in the `spikes` struct!"
+        # Rounding is necessary to prevent data loss from int-casting floating point errors.
+        # atleast_1d keeps a unit with a single spike as a 1-d array so it can be masked later.
+        spiketrains_dict = {
+            unit_id: np.atleast_1d(sampling_frequency * np.asarray(unit_spike_times)).round().astype(np.int64)
+            for unit_id, unit_spike_times in zip(unit_ids, spike_times)
+        }
 
         BaseSorting.__init__(self, unit_ids=unit_ids, sampling_frequency=sampling_frequency)
-        sorting_segment = CellExplorerSortingSegment(spiketrains, unit_ids)
+        sorting_segment = CellExplorerSortingSegment(spiketrains_dict, unit_ids)
         self.add_sorting_segment(sorting_segment)
 
         self.extra_requirements.append("scipy")
-        self.extra_requirements.append("hdf5storage")
 
-        self._kwargs = dict(spikes_matfile_path=str(spikes_matfile_path.absolute()))
+        self._kwargs = dict(
+            file_path=str(self.spikes_cellinfo_path),
+            sampling_frequency=sampling_frequency,
+            # Keep None as None: str(None) would store the literal path "None"
+            session_info_file_path=str(session_info_file_path) if session_info_file_path is not None else None,
+        )
+
+    def _retrieve_sampling_frequency_from_session_info(self) -> float:
+        """
+        Retrieve the sampling frequency from the `sessionInfo.mat` file when available.
+
+        This function tries to locate a .sessionInfo.mat file corresponding to the current session. It then loads this
+        file (either as a standard .mat file or as an HDF5 file if the former is not possible) and extracts the wideband
+        sampling frequency from the 'rates' field of the 'sessionInfo' struct.
+
+        When `self.session_info_file_path` is None it is set to
+        `<folder_path>/<session_id>.sessionInfo.mat`; in all cases it is replaced by its absolute `Path`.
+
+        Returns
+        -------
+        float
+            The wideband sampling frequency for the current session.
+
+        Raises
+        ------
+        AssertionError
+            If the session info file does not exist or its 'rates' struct has no 'wideband' field.
+        """
+        import h5py
+        import scipy.io
+
+        if self.session_info_file_path is None:
+            self.session_info_file_path = self.folder_path / f"{self.session_id}.sessionInfo.mat"
+
+        self.session_info_file_path = Path(self.session_info_file_path).absolute()
+        assert (
+            self.session_info_file_path.is_file()
+        ), f"No {self.session_id}.sessionInfo.mat file found in the {self.folder_path}!, can't inferr sampling rate"
+
+        read_as_hdf5 = False
+        try:
+            session_info_mat = scipy.io.loadmat(file_name=str(self.session_info_file_path), simplify_cells=True)
+        except NotImplementedError:
+            # scipy cannot read this file; fall back to reading it as HDF5
+            session_info_mat = h5py.File(name=str(self.session_info_file_path), mode="r")
+            read_as_hdf5 = True
+
+        try:
+            rates = session_info_mat["sessionInfo"]["rates"]
+            wideband_in_rates = "wideband" in rates.keys()
+            assert wideband_in_rates, "a 'sessionInfo' should contain a  'wideband' to extract the sampling frequency!"
+
+            # Not to be confused with the lfpsamplingrate, which is also present in rates; reported in units Hz
+            sampling_frequency = rates["wideband"]
+
+            if read_as_hdf5:
+                # Copy the value out of the Dataset while the file is still open
+                sampling_frequency = sampling_frequency[()]
+        finally:
+            # Only the h5py branch holds an open file handle
+            if read_as_hdf5:
+                session_info_mat.close()
+
+        # The HDF5 value is an array with singleton dimensions; squeeze it so a real float is returned
+        return float(np.asarray(sampling_frequency).squeeze())
 
 
 class CellExplorerSortingSegment(BaseSortingSegment):
-    def __init__(self, spiketrains, unit_ids):
-        self._spiketrains = spiketrains
+    def __init__(self, spiketrains_dict, unit_ids):
+        """
+        Store the spike trains of one segment.
+
+        Parameters
+        ----------
+        spiketrains_dict
+            Mapping indexed by unit id; `get_unit_spike_train` looks a unit up with
+            `spiketrains_dict[unit_id]`. The extractor passes integer sample-frame arrays.
+        unit_ids
+            Iterable of unit ids; a list copy is kept in `_unit_ids`.
+        """
+        self.spiketrains_dict = spiketrains_dict
         self._unit_ids = list(unit_ids)
         BaseSortingSegment.__init__(self)
 
@@ -125,14 +213,15 @@ def get_unit_spike_train(
         start_frame,
         end_frame,
     ) -> np.ndarray:
-        # must be implemented in subclass
-        if start_frame is None:
-            start_frame = 0
-        if end_frame is None:
-            end_frame = np.inf
-        spike_frames = self._spiketrains[self._unit_ids.index(unit_id)]
-        inds = np.where((start_frame <= spike_frames) & (spike_frames < end_frame))
-        return spike_frames[inds]
+        spike_frames = self.spiketrains_dict[unit_id]
+        # Clip to the half-open window [start_frame, end_frame); a bound that is None is left open
+        if start_frame is not None:
+            spike_frames = spike_frames[spike_frames >= start_frame]
+
+        if end_frame is not None:
+            # end_frame is exclusive, exactly as in the implementation this change replaces
+            spike_frames = spike_frames[spike_frames < end_frame]
+
+        return spike_frames
 
 
+# Module-level reader built from the extractor class by `define_function_from_class`.
+# NOTE(review): presumably it forwards its arguments to CellExplorerSortingExtractor - confirm in core_tools.
 read_cellexplorer = define_function_from_class(source_class=CellExplorerSortingExtractor, name="read_cellexplorer")