-
-
Notifications
You must be signed in to change notification settings - Fork 23
Addition of non-contiguous search and parameterization #116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
a9f8fa4
949682f
c3bddd4
cd24bc7
df92af1
f0b0bde
04becb1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
| import os | ||
| from typing import Any, Dict, List, Tuple, Optional, cast | ||
|
|
||
| from .utils import split_into_batches, is_subsequence_in_list | ||
| from .utils import split_into_batches, is_subsequence_in_list, is_subsequence_non_contiguous | ||
|
|
||
| # Optional GPU (CuPy) support | ||
| _gpu_available = False | ||
|
|
@@ -111,7 +111,7 @@ def _support_counts_gpu_singletons( | |
| vocab_size: int, | ||
| ) -> List[Tuple[List[int], int]]: | ||
| """GPU-accelerated support counts for singleton candidates using CuPy. | ||
|
|
||
| This computes the number of transactions containing each candidate item ID. | ||
| It uniquifies items per transaction on CPU to preserve presence semantics, | ||
| then performs a single bincount on GPU. | ||
|
|
@@ -126,8 +126,8 @@ def _support_counts_gpu_singletons( | |
| if not flat: | ||
| return [] | ||
|
|
||
| cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined] | ||
| counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[attr-defined] | ||
| cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined] | ||
| counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[name-defined] | ||
| counts_host: Any = counts.get() # back to host as a NumPy array | ||
|
|
||
| out: List[Tuple[List[int], int]] = [] | ||
|
|
@@ -143,20 +143,24 @@ def support_counts_python( | |
| candidates: List[Tuple[str, ...]], | ||
| min_support_abs: int, | ||
| batch_size: int = 100, | ||
| contiguous: bool=False , | ||
| ) -> Dict[Tuple[str, ...], int]: | ||
| """Pure-Python fallback for support counting (single-process). | ||
|
|
||
| Evaluates each candidate pattern's frequency across all transactions | ||
| using the same contiguous-subsequence semantics as the Rust backend. | ||
|
|
||
| Note: This implementation is single-process and optimized for simplicity. | ||
| Heavy workloads may benefit from the Rust backend. | ||
| """ | ||
| # Simple non-multiprocessing version to avoid import cycles. | ||
|
|
||
| results: Dict[Tuple[str, ...], int] = {} | ||
| subsequence_checker = is_subsequence_in_list if contiguous else is_subsequence_non_contiguous | ||
|
|
||
| for batch in split_into_batches(candidates, batch_size): | ||
| for cand in batch: | ||
| freq = sum(1 for t in transactions if is_subsequence_in_list(cand, t)) | ||
| freq = sum(1 for t in transactions if subsequence_checker(cand, t)) | ||
| if freq >= min_support_abs: | ||
| results[cand] = freq | ||
| return results | ||
|
|
@@ -168,37 +172,49 @@ def support_counts( | |
| min_support_abs: int, | ||
| batch_size: int = 100, | ||
| backend: Optional[str] = None, | ||
| contiguous: bool = False, | ||
| ) -> Dict[Tuple[str, ...], int]: | ||
| """Choose the best available backend for support counting. | ||
|
|
||
| Backend selection is controlled by the `backend` argument when provided, | ||
| otherwise by the env var GSPPY_BACKEND: | ||
| - "rust": require Rust extension (raise if missing) | ||
| - "gpu": try GPU path when available (currently singletons optimized), | ||
| - "gpu": try GPU path when available (currently singletons optimized), | ||
| fall back to CPU for the rest | ||
| - "python": force pure-Python fallback | ||
| - otherwise: try Rust first and fall back to Python | ||
| """ | ||
| # Intentionally fallback to Python for non-contiguous queries. | ||
| # The acceleration path is currently disabled for non-contiguous cases | ||
| # to facilitate testing and validation of the contiguous logic. | ||
| if not contiguous: | ||
| return support_counts_python( | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The function immediately returns the Python fallback when contiguous is False, even if the user has chosen the Rust or GPU backend. This design silently discards the acceleration path. If that is intentional, it should be clearly documented; otherwise, the accelerators need updating to handle non-contiguous subsequences. |
||
| transactions, candidates, min_support_abs, batch_size, contiguous | ||
| ) | ||
|
|
||
| backend_sel = (backend or _env_backend()).lower() | ||
|
|
||
| if backend_sel == "python": | ||
| return support_counts_python( | ||
| transactions, candidates, min_support_abs, batch_size, contiguous | ||
| ) | ||
|
|
||
| if backend_sel == "gpu": | ||
| if not _gpu_available: | ||
| raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available") | ||
| # Encode once | ||
| enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions) | ||
| enc_cands = _encode_candidates(candidates, vocab) | ||
|
|
||
| # Partition candidates into singletons and non-singletons | ||
| singletons: List[Tuple[int, Tuple[str, ...]]] = [] | ||
| others: List[Tuple[List[int], Tuple[str, ...]]] = [] | ||
| # Pair original and encoded candidates; lengths should match | ||
| assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch" | ||
| for orig, enc in zip(candidates, enc_cands): # noqa: B905 - lengths checked above | ||
| for orig, enc in zip(candidates, enc_cands, strict=False): # noqa: B905 - lengths checked above | ||
| if len(enc) == 1: | ||
| singletons.append((enc[0], orig)) | ||
| else: | ||
| others.append((enc, orig)) | ||
|
|
||
| out: Dict[Tuple[str, ...], int] = {} | ||
|
|
||
| # GPU path for singletons | ||
|
|
@@ -210,14 +226,14 @@ def support_counts( | |
| min_support_abs=min_support_abs, | ||
| vocab_size=vocab_size, | ||
| ) | ||
| # Map back to original strings | ||
| # Map back to original strings | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The comment does not seem to be at the right indentation. |
||
| cand_by_id: Dict[int, Tuple[str, ...]] = {cid: orig for cid, orig in singletons} | ||
| for enc_cand, freq in gpu_res: | ||
| cid = enc_cand[0] | ||
| out[cand_by_id[cid]] = int(freq) | ||
|
|
||
| # Fallback for others (prefer rust when available) | ||
| if others: | ||
| other_candidates = [orig for _, orig in others] | ||
| if _rust_available: | ||
| try: | ||
| other_enc = [enc for enc, _ in others] | ||
|
|
@@ -228,19 +244,12 @@ def support_counts( | |
| out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq) | ||
| except Exception: | ||
| # fallback to python | ||
| out.update( | ||
| support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size) | ||
| ) | ||
| else: | ||
| out.update( | ||
| support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size) | ||
| ) | ||
|
|
||
| out.update(support_counts_python(transactions, other_candidates, min_support_abs, batch_size, contiguous )) | ||
| else: | ||
| out.update(support_counts_python(transactions, other_candidates, min_support_abs, batch_size, contiguous )) | ||
| return out | ||
|
|
||
| if backend_sel == "python": | ||
| return support_counts_python(transactions, candidates, min_support_abs, batch_size) | ||
|
|
||
| if backend_sel == "rust": | ||
| if not _rust_available: | ||
| raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available") | ||
|
|
@@ -252,18 +261,19 @@ def support_counts( | |
| for enc_cand, freq in result: | ||
| out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq) | ||
| return out_rust | ||
|
|
||
| # auto: try rust then fallback | ||
| if _rust_available: | ||
| enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions) | ||
| enc_cands = _encode_candidates(candidates, vocab) | ||
| try: | ||
| enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions) | ||
| enc_cands = _encode_candidates(candidates, vocab) | ||
| result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs))) | ||
| out2: Dict[Tuple[str, ...], int] = {} | ||
| out_auto: Dict[Tuple[str, ...], int] = {} | ||
| for enc_cand, freq in result: | ||
| out2[tuple(inv_vocab[i] for i in enc_cand)] = int(freq) | ||
| return out2 | ||
| out_auto[tuple(inv_vocab[i] for i in enc_cand)] = int(freq) | ||
| return out_auto | ||
| except Exception: | ||
| pass | ||
|
|
||
| return support_counts_python(transactions, candidates, min_support_abs, batch_size) | ||
| return support_counts_python( | ||
| transactions, candidates, min_support_abs, batch_size, contiguous | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -68,6 +68,15 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, .. | |
| # Use any to check if any slice matches the sequence | ||
| return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1)) | ||
|
|
||
| @lru_cache(maxsize=32768) | ||
| def is_subsequence_non_contiguous(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool: | ||
| """ | ||
| Check if a subsequence exists within a sequence, allowing for gaps (non-contiguous). | ||
| """ | ||
| if not subsequence: | ||
| return True | ||
| it = iter(sequence) | ||
| return all(item in it for item in subsequence) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It returns False for an empty subsequence. By definition, the empty subsequence exists in every sequence. This edge case doesn’t affect the current algorithm (it never checks empty candidates), but a trivial fix would return True when the subsequence is empty. A bounded lru_cache (maxsize) could also prevent unbounded memory use. |
||
|
|
||
| def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]: | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Why did you remove my comments?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I will add it back.