Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
fb13401
Add parameter merging to utils
mo-fu Nov 26, 2021
e249715
Allow atomic save to handle directories.
mo-fu Dec 3, 2021
5cc207b
Add XTransformer backend.
mo-fu Dec 6, 2021
5a18d98
Remove redundant import in fasttext
mo-fu Dec 8, 2021
6129965
Use parsed parameter in suggest batch_size.
mo-fu Dec 8, 2021
02ff772
Use provided parameters in xtransformer training.
mo-fu Dec 17, 2021
3d06ebe
Fix import for Xtransformer
mo-fu Jan 6, 2022
8555bab
Split atomic_save in folder and directory variant.
mo-fu Jan 6, 2022
c11ba38
Disable gpu use for xtransformer suggest.
mo-fu Mar 14, 2022
4a82ea2
Update pecos dependency.
mo-fu Sep 2, 2022
367e493
Adapt xtransformer backend to new vocab model.
mo-fu Sep 2, 2022
aa96ebc
Merge branch 'master' of github.com:mo-fu/Annif into mo-fu-master
juhoinkinen Mar 8, 2023
efbb05c
Working transformer backend
Lakshmi-bashyam Dec 6, 2023
3731f47
Working transformer backend
Lakshmi-bashyam Dec 7, 2023
6187e91
Resolve conflicts
Lakshmi-bashyam Aug 14, 2024
3e02a72
xtrans test fixed, stwfsa import fixed
Lakshmi-bashyam Aug 20, 2024
7379061
Change default to smaller model
Lakshmi-bashyam Aug 28, 2024
2078a65
Fix linting errors
Lakshmi-bashyam Sep 19, 2024
f1b9c78
code formatting changes
Lakshmi-bashyam Sep 25, 2024
5e41dce
security bot fix
Lakshmi-bashyam Sep 25, 2024
4c33a31
typo fix
Lakshmi-bashyam Sep 25, 2024
dcb5b97
Flake8 fix
Lakshmi-bashyam Sep 25, 2024
1fa4f73
Merge remote-tracking branch 'origin/main' into xtransformer
Lakshmi-bashyam Mar 4, 2025
bacbfab
CICD pecos installation
Lakshmi-bashyam Mar 10, 2025
a5528a9
Merge changes
Lakshmi-bashyam Mar 10, 2025
1c5f6ff
Pecos ver-support python 3.11
Lakshmi-bashyam Mar 10, 2025
a25c8cf
Remove duplicate method
Lakshmi-bashyam Mar 10, 2025
8269f59
Merged main branch
Lakshmi-bashyam Jul 10, 2025
6960ee5
Add pecos to cicd
Lakshmi-bashyam Jul 10, 2025
0e9ad2c
Addn xtransformer hyper params
Lakshmi-bashyam Jul 10, 2025
8ed3438
Pin transformers<=4.49.0 to fix AdamW import issue in pecos (see peco…
Lakshmi-bashyam Aug 28, 2025
22397eb
Pecos TFIDF vectorizer
Lakshmi-bashyam Sep 10, 2025
2cf6cf0
Resolve merge conflicts
Lakshmi-bashyam Sep 11, 2025
a194ece
Unit test for Pecostfidf
Lakshmi-bashyam Sep 11, 2025
ce6f529
merge changes on suggest method
Lakshmi-bashyam Sep 15, 2025
1df26aa
Merge remote-tracking branch 'origin/main' into xtransformer
Lakshmi-bashyam Oct 2, 2025
d2e75d3
Downgrade scipy for pecos
Lakshmi-bashyam Oct 2, 2025
f0aa91f
fix xtrans suggest method in unit test
Lakshmi-bashyam Oct 2, 2025
3997512
refactor(deps): Restrict optional dependencies' Python compatibility
Lakshmi-bashyam Oct 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,17 @@ jobs:
# Selectively install the optional dependencies for some Python versions
# For Python 3.10:
if [[ ${{ matrix.python-version }} == '3.10' ]]; then
poetry install -E "nn omikuji yake voikko stwfsa";
poetry install -E "nn omikuji yake voikko stwfsa pecos";
fi
# For Python 3.11:
if [[ ${{ matrix.python-version }} == '3.11' ]]; then
poetry install -E "fasttext spacy estnltk";
poetry install -E "fasttext spacy estnltk pecos";
# download the small English pretrained spaCy model needed by spacy analyzer
poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
fi
# For Python 3.12:
if [[ ${{ matrix.python-version }} == '3.12' ]]; then
poetry install -E "nn fasttext yake stwfsa voikko spacy";
poetry install -E "nn fasttext yake stwfsa voikko spacy pecos";
# download the small English pretrained spaCy model needed by spacy analyzer
poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed
fi
Expand Down
6 changes: 5 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM python:3.12-slim-bookworm
LABEL org.opencontainers.image.authors="grp-natlibfi-annif@helsinki.fi"
SHELL ["/bin/bash", "-c"]

ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa"
ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa pecos"
ARG POETRY_VIRTUALENVS_CREATE=false

# Install system dependencies needed at runtime:
Expand Down Expand Up @@ -36,6 +36,10 @@ RUN if [[ $optional_dependencies =~ "spacy" ]]; then \
python -m spacy download $model; \
done; \
fi
RUN if [[ $optional_dependencies =~ "pecos" ]]; then \
mkdir /.cache -m a=rwx; \
fi


# Second round of installation with the actual code:
COPY annif /Annif/annif
Expand Down
12 changes: 12 additions & 0 deletions annif/backend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,17 @@ def _tfidf() -> Type[AnnifBackend]:
return tfidf.TFIDFBackend


def _xtransformer() -> Type[AnnifBackend]:
    """Return the XTransformer backend class, importing its module lazily.

    The import happens inside the function so that the backend's
    dependencies are only needed when this backend is actually requested.

    Returns:
        The ``XTransformerBackend`` class from the ``xtransformer`` module.

    Raises:
        ValueError: if the ``xtransformer`` module cannot be imported. The
            original ``ImportError`` is attached as the cause.
    """
    try:
        from . import xtransformer
    except ImportError as err:
        # Chain explicitly so the underlying import failure (which package is
        # missing or broken) is shown as the direct cause in the traceback.
        raise ValueError(
            "XTransformer not available, not enabling XTransformer backend"
        ) from err
    return xtransformer.XTransformerBackend


def _yake() -> Type[AnnifBackend]:
try:
from . import yake
Expand All @@ -111,6 +122,7 @@ def _yake() -> Type[AnnifBackend]:
"stwfsa": _stwfsa,
"svc": _svc,
"tfidf": _tfidf,
"xtransformer": _xtransformer,
"yake": _yake,
}

Expand Down
6 changes: 1 addition & 5 deletions annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
self.info("creating fastText model")
trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
modelpath = os.path.join(self.datadir, self.MODEL_FILE)
params = {
param: self.FASTTEXT_PARAMS[param](val)
for param, val in params.items()
if param in self.FASTTEXT_PARAMS
}
params = annif.util.apply_param_parse_config(self.FASTTEXT_PARAMS, params)
if jobs != 0: # jobs set by user to non-default value
params["thread"] = jobs
self.debug("Model parameters: {}".format(params))
Expand Down
57 changes: 56 additions & 1 deletion annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
from pecos.utils.featurization.text.vectorizers import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.util
Expand Down Expand Up @@ -57,7 +59,6 @@ def _suggest(
return []
return self._suggest_chunks(chunktexts, params)


class TfidfVectorizerMixin:
"""Annif backend mixin that implements TfidfVectorizer functionality"""

Expand Down Expand Up @@ -92,3 +93,57 @@ def create_vectorizer(
self.vectorizer, self.datadir, self.VECTORIZER_FILE, method=joblib.dump
)
return veccorpus

class PecosTfidfVectorizerMixin:
"""Annif backend mixin that implements TfidfVectorizer functionality from Pecos"""

VECTORIZER_FILE = "vectorizer"

vectorizer = None

def initialize_vectorizer(self) -> None:
if self.vectorizer is None:
path = os.path.join(self.datadir, self.VECTORIZER_FILE)
if os.path.exists(path):
self.debug("loading vectorizer from {}".format(path))

self.vectorizer = Vectorizer.load(path)
else:
raise NotInitializedException(
"vectorizer file '{}' not found".format(path),
backend_id=self.backend_id,
)

def vectorizer_dict(self, params: dict[str, Any]) -> dict[str, Any]:
"""Create a vectorizer configuration dictionary from the given parameters."""

config = {
"base_vect_configs": [
{
"ngram_range": params.get("ngram_range", [1, 1]),
"max_df_ratio": 0.98,
"analyzer": "word",
"min_df_cnt": params.get("min_df", 1),
}
]
}
return {"type": "tfidf", "kwargs": {**config}}


def create_vectorizer(
self, input: Iterable[str], params: dict[str, Any] = None
) -> csr_matrix:

self.info("creating Pecos vectorizer")
if params is None:
params = {}
data = list(input)
vectorizer_config = self.vectorizer_dict(params)
self.vectorizer = Vectorizer.train(data, vectorizer_config, np.float32)
self.vectorizer.save(os.path.join(self.datadir, self.VECTORIZER_FILE))
veccorpus = self.vectorizer.predict(
data,
threads=params.get("threads", -1)
)

return veccorpus
5 changes: 1 addition & 4 deletions annif/backend/omikuji.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations

import os.path
import shutil
from typing import TYPE_CHECKING, Any

import omikuji
Expand Down Expand Up @@ -103,9 +102,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
hyper_param.collapse_every_n_layers = int(params["collapse_every_n_layers"])

self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
if os.path.exists(model_path):
shutil.rmtree(model_path)
self._model.save(os.path.join(self.datadir, self.MODEL_FILE))
annif.util.atomic_save_folder(self._model, model_path)

def _train(
self,
Expand Down
8 changes: 2 additions & 6 deletions annif/backend/stwfsa.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion
from annif.util import atomic_save, boolean
from annif.util import apply_param_parse_config, atomic_save, boolean

from . import backend

Expand Down Expand Up @@ -106,11 +106,7 @@ def _train(
jobs: int = 0,
) -> None:
X, y = self._load_data(corpus)
new_params = {
key: self.STWFSA_PARAMETERS[key](val)
for key, val in params.items()
if key in self.STWFSA_PARAMETERS
}
new_params = apply_param_parse_config(self.STWFSA_PARAMETERS, params)
p = StwfsapyPredictor(
graph=self.project.vocab.as_graph(),
langs=frozenset([params["language"]]),
Expand Down
Loading
Loading