From cffc14051709d1d05bd1157b5f57ae1e1122d553 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 27 Jul 2025 15:36:22 +0200 Subject: [PATCH 1/4] no dynamic import needed for hugging face - download --- chebifier/ensemble/base_ensemble.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/chebifier/ensemble/base_ensemble.py b/chebifier/ensemble/base_ensemble.py index 7a3aef2..6a3acef 100644 --- a/chebifier/ensemble/base_ensemble.py +++ b/chebifier/ensemble/base_ensemble.py @@ -3,15 +3,15 @@ import torch import tqdm -from chebifier.inconsistency_resolution import PredictionSmoother -from chebifier.utils import load_chebi_graph, get_disjoint_files from chebifier.check_env import check_package_installed +from chebifier.hugging_face import download_model_files +from chebifier.inconsistency_resolution import PredictionSmoother from chebifier.prediction_models.base_predictor import BasePredictor +from chebifier.utils import get_disjoint_files, load_chebi_graph class BaseEnsemble: - def __init__( self, model_configs: dict, @@ -29,8 +29,6 @@ def __init__( for model_name, model_config in model_configs.items(): model_cls = MODEL_TYPES[model_config["type"]] if "hugging_face" in model_config: - from chebifier.hugging_face import download_model_files - hugging_face_kwargs = download_model_files(model_config["hugging_face"]) else: hugging_face_kwargs = {} From e81298f198c51c8ab6ab73af679f2d8f1ff591dc Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 27 Jul 2025 15:37:12 +0200 Subject: [PATCH 2/4] remove redundant config file --- configs/huggingface_config.yml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 configs/huggingface_config.yml diff --git a/configs/huggingface_config.yml b/configs/huggingface_config.yml deleted file mode 100644 index c26950d..0000000 --- a/configs/huggingface_config.yml +++ /dev/null @@ -1,22 +0,0 @@ - -chemlog_peptides: - type: chemlog - model_weight: 100 - -#resgated_huggingface: -# type: 
resgated -# hugging_face: -# repo_id: aditya0by0/python-chebifier -# subfolder: resgated -# files: -# ckpt: resgated.ckpt -# labels: classes.txt - -electra_huggingface: - type: electra - hugging_face: - repo_id: aditya0by0/python-chebifier - subfolder: electra - files: - ckpt: electra.ckpt - labels: classes.txt From cd9919b16afc4765578449ab70af8ac9d9fd7714 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 27 Jul 2025 18:35:55 +0200 Subject: [PATCH 3/4] fix chemlog key error --- configs/example_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/example_config.yml b/configs/example_config.yml index 504daac..bc8efbc 100644 --- a/configs/example_config.yml +++ b/configs/example_config.yml @@ -1,6 +1,6 @@ chemlog_peptides: - type: chemlog + type: chemlog_peptides model_weight: 100 # if chemlog is available, it always gets chosen my_resgated: type: resgated From 0d517f40bd09477ed760a405ca5e437f6d463fc2 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sun, 27 Jul 2025 18:59:09 +0200 Subject: [PATCH 4/4] update readme for hugging face model weights --- README.md | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 651f707..fcbbb37 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ A web application for the ensemble is available at https://chebifier.hastingslab Not all models can be installed automatically at the moment: - `chebai-graph` and its dependencies. To install them, follow -the instructions in the [chebai-graph repository](https://github.com/ChEB-AI/python-chebai-graph). +the instructions in the [chebai-graph repository](https://github.com/ChEB-AI/python-chebai-graph). - `chemlog-extra` can be installed with `pip install git+https://github.com/ChEB-AI/chemlog-extra.git` -- The automatically installed version of `c3p` may not work under Windows. 
If you want to run chebifier on Windows, we +- The automatically installed version of `c3p` may not work under Windows. If you want to run chebifier on Windows, we recommend using this forked version: `pip install git+https://github.com/sfluegel05/c3p.git` @@ -38,11 +38,26 @@ The package provides a command-line interface (CLI) for making predictions using The ensemble configuration is given by a configuration file (by default, this is `chebifier/ensemble.yml`). If you want to change which models are included in the ensemble or how they are weighted, you can create your own configuration file. -Model weights for deep learning models are downloaded automatically from [Hugging Face](https://huggingface.co/chebai). +Model weights for deep learning models are automatically downloaded from [Hugging Face](https://huggingface.co/chebai). +To use specific model weights from Hugging Face, add the `load_model` key in your configuration file. For example: + +```yaml +my_electra: + type: electra + load_model: "electra_chebi50_v241" +``` + +### Available model weights: + +* `electra_chebi50_v241` +* `resgated_chebi50_v241` +* `c3p_with_weights` + + However, you can also supply your own model checkpoints (see `configs/example_config.yml` for an example). ```bash -# Make predictions +# Make predictions python -m chebifier predict --smiles "CC(=O)OC1=CC=CC=C1C(=O)O" --smiles "C1=CC=C(C=C1)C(=O)O" # Make predictions using SMILES from a file @@ -96,7 +111,7 @@ Currently, the following models are supported: | `c3p` | A collection _Chemical Classifier Programs_, generated by LLMs based on the natural language definitions of ChEBI classes. 
| 338 | [Mungall, Christopher J., et al., 2025: Chemical classification program synthesis using generative artificial intelligence, arXiv](https://arxiv.org/abs/2505.18470) | [c3p](https://github.com/chemkg/c3p) | In addition, Chebifier also includes a ChEBI lookup that automatically retrieves the ChEBI superclasses for a class -matched by a SMILES string. This is not activated by default, but can be included by adding +matched by a SMILES string. This is not activated by default, but can be included by adding ```yaml chebi_lookup: type: chebi_lookup @@ -109,7 +124,7 @@ to your configuration file. Given a sample (i.e., a SMILES string) and models $m_1, m_2, \ldots, m_n$, the ensemble works as follows: 1. Get predictions from each model $m_i$ for the sample. -2. For each class $c$, aggregate predictions $p_c^{m_i}$ from all models that made a prediction for that class. +2. For each class $c$, aggregate predictions $p_c^{m_i}$ from all models that made a prediction for that class. The aggregation happens separately for all positive predictions (i.e., $p_c^{m_i} \geq 0.5$) and all negative predictions ($p_c^{m_i} < 0.5$). If the aggregated value is larger for the positive predictions than for the negative predictions, the ensemble makes a positive prediction for class $c$: @@ -117,7 +132,7 @@ the ensemble makes a positive prediction for class $c$: image