diff --git a/.github/pull-request-links.yaml b/.github/pull-request-links.yaml new file mode 100644 index 00000000..7a552123 --- /dev/null +++ b/.github/pull-request-links.yaml @@ -0,0 +1,16 @@ +name: readthedocs/actions +on: + pull_request_target: + types: + - opened + +permissions: + pull-requests: write + +jobs: + pull-request-links: + runs-on: ubuntu-latest + steps: + - uses: readthedocs/actions/preview@v1 + with: + project-slug: "cleanvision" diff --git a/docs/source/cleanvision/issue_managers/index.rst b/docs/source/cleanvision/issue_managers/index.rst index 88657fcc..5a374b76 100644 --- a/docs/source/cleanvision/issue_managers/index.rst +++ b/docs/source/cleanvision/issue_managers/index.rst @@ -1,5 +1,6 @@ Issue Managers ============== +Contains modules for managing data issues of a particular type in Imagelab. .. automodule:: cleanvision.issue_managers :autosummary: diff --git a/examples/custom_issue_manager.py b/examples/custom_issue_manager.py index df737733..1afa0837 100644 --- a/examples/custom_issue_manager.py +++ b/examples/custom_issue_manager.py @@ -25,13 +25,13 @@ class CustomIssueManager(IssueManager): def __init__(self) -> None: super().__init__() - self.params = self.get_default_params() + self.params = self._get_default_params() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return {"threshold": 0.4} - def update_params(self, params: Dict[str, Any]) -> None: - self.params = self.get_default_params() + def _update_params(self, params: Dict[str, Any]) -> None: + self.params = self._get_default_params() non_none_params = {k: v for k, v in params.items() if v is not None} self.params = {**self.params, **non_none_params} @@ -65,7 +65,7 @@ def find_issues( assert imagelab_info is not None assert dataset is not None - self.update_params(params) + self._update_params(params) raw_scores = [] for idx in tqdm(dataset.index): diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py 
b/src/cleanvision/issue_managers/duplicate_issue_manager.py index 81a1fe76..0dc326c0 100644 --- a/src/cleanvision/issue_managers/duplicate_issue_manager.py +++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py @@ -47,21 +47,23 @@ def compute_hash_wrapper(args: Dict[str, Any]) -> Dict[str, Union[str, int]]: @register_issue_manager(DUPLICATE) class DuplicateIssueManager(IssueManager): + """Checks for exact and near duplicates in images.""" + issue_name: str = DUPLICATE visualization: str = "image_sets" def __init__(self) -> None: super().__init__() self.issue_types: List[str] = [] - self.params = self.get_default_params() + self.params = self._get_default_params() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return { IssueType.EXACT_DUPLICATES.value: {"hash_type": "md5"}, IssueType.NEAR_DUPLICATES.value: {"hash_type": "phash", "hash_size": 8}, } - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: for issue_type in self.params: non_none_params = { k: v for k, v in params.get(issue_type, {}).items() if v is not None @@ -102,13 +104,34 @@ def find_issues( n_jobs: Optional[int] = None, **kwargs: Any, ) -> None: + """Finds exact and near duplicates in the images + + Parameters + ---------- + params: Dict[str, Any], optional + Dict of custom hyperparameters for checking duplicate issues. Default value is empty. + dataset: Dataset + Dataset object on which to run the duplicate checks + imagelab_info: Dict[str, Any] + imagelab.info dict containing computations for reuse + n_jobs: int + Number of processing threads used by multiprocessing. + Default None sets to the number of cores on your CPU (physical cores if you have psutil package installed, otherwise logical cores). + Set this to 1 to disable parallel processing (if it's causing issues). Windows users may see a speed-up with n_jobs=1. + For :py:class:`TorchDataset` this is set to 1. 
+ kwargs: Any + + Returns + ------- + + """ super().find_issues(**kwargs) assert params is not None assert imagelab_info is not None assert dataset is not None self.issue_types = list(params.keys()) - self.update_params(params) + self._update_params(params) to_compute = self._get_issue_types_to_compute(self.issue_types, imagelab_info) issue_type_hash_mapping: Dict[str, Any] = { diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py index 3c98d5e8..1af74205 100644 --- a/src/cleanvision/issue_managers/image_property_issue_manager.py +++ b/src/cleanvision/issue_managers/image_property_issue_manager.py @@ -54,10 +54,10 @@ class ImagePropertyIssueManager(IssueManager): def __init__(self) -> None: super().__init__() self.issue_types: List[str] = [] - self.params = self.get_default_params() + self.params = self._get_default_params() self.image_properties = self._get_image_properties() - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: return { IssueType.DARK.value: {"threshold": 0.37}, IssueType.LIGHT.value: {"threshold": 0.05}, @@ -70,7 +70,7 @@ def get_default_params(self) -> Dict[str, Any]: IssueType.GRAYSCALE.value: {}, } - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: for issue_type in self.params: non_none_params = { k: v for k, v in params.get(issue_type, {}).items() if v is not None @@ -132,7 +132,7 @@ def find_issues( additional_set = self._get_additional_to_compute_set(self.issue_types) self.issue_types = self.issue_types + additional_set - self.update_params(params) + self._update_params(params) agg_computations = pd.DataFrame(index=dataset.index) agg_computations = self._add_prev_computations(agg_computations, imagelab_info) diff --git a/src/cleanvision/utils/base_issue_manager.py b/src/cleanvision/utils/base_issue_manager.py index 67a01c1e..1289a7bb 100644 
--- a/src/cleanvision/utils/base_issue_manager.py +++ b/src/cleanvision/utils/base_issue_manager.py @@ -49,12 +49,12 @@ def find_issues(self, **kwargs: Any) -> None: return @abstractmethod - def get_default_params(self) -> Dict[str, Any]: + def _get_default_params(self) -> Dict[str, Any]: """Returns default params to be used by the issue_manager""" raise NotImplementedError @abstractmethod - def update_params(self, params: Dict[str, Any]) -> None: + def _update_params(self, params: Dict[str, Any]) -> None: """Sets params for an issue manager. Default params will be overridden by user provided params""" raise NotImplementedError diff --git a/tests/test_duplicate_issue_manager.py b/tests/test_duplicate_issue_manager.py index a6c5bbf7..9c184aca 100644 --- a/tests/test_duplicate_issue_manager.py +++ b/tests/test_duplicate_issue_manager.py @@ -47,7 +47,7 @@ def test_set_params(self, params, expected_params, issue_manager): 1. If no params are specified for an issue_type, default params are used 2. If params are specified, those specific params are updated, for the remaining ones default values are used """ - issue_manager.update_params(params) + issue_manager._update_params(params) assert issue_manager.params == expected_params @pytest.mark.parametrize( diff --git a/tests/test_image_property_issue_manager.py b/tests/test_image_property_issue_manager.py index 1f1a5faf..b7928281 100644 --- a/tests/test_image_property_issue_manager.py +++ b/tests/test_image_property_issue_manager.py @@ -62,7 +62,7 @@ def test_set_params(self, params, expected_params, issue_manager): issue_manager: instance of ImagePropertyIssueManager """ - issue_manager.update_params(params) + issue_manager._update_params(params) assert issue_manager.params == expected_params @pytest.mark.parametrize(