Skip to content

Commit 33e265c

Browse files
authored
Merge branch 'master' into feature/other_inputs_for_sqlalchemy_bit
2 parents 3d14b22 + 1a72b75 commit 33e265c

File tree

5 files changed

+67
-19
lines changed

5 files changed

+67
-19
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
dev-files: true
2020
- run: |
2121
cd /tmp
22-
git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git
22+
git clone --branch v0.8.1 https://github.com/pgvector/pgvector.git
2323
cd pgvector
2424
make
2525
sudo make install

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.2 (unreleased)
2+
3+
- Added support for `str` objects for `bit` type with SQLAlchemy
4+
15
## 0.4.1 (2025-04-26)
26

37
- Fixed `SparseVector` constructor for SciPy sparse matrices

README.md

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,6 @@ index = Index(
259259
'my_index',
260260
func.cast(Item.embedding, HALFVEC(3)).label('embedding'),
261261
postgresql_using='hnsw',
262-
postgresql_with={'m': 16, 'ef_construction': 64},
263262
postgresql_ops={'embedding': 'halfvec_l2_ops'}
264263
)
265264
```
@@ -271,6 +270,37 @@ order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2])
271270
session.scalars(select(Item).order_by(order).limit(5))
272271
```
273272

273+
#### Binary Quantization
274+
275+
Use expression indexing for binary quantization
276+
277+
```python
278+
from pgvector.sqlalchemy import BIT
279+
from sqlalchemy.sql import func
280+
281+
index = Index(
282+
'my_index',
283+
func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'),
284+
postgresql_using='hnsw',
285+
postgresql_ops={'embedding': 'bit_hamming_ops'}
286+
)
287+
```
288+
289+
Get the nearest neighbors by Hamming distance
290+
291+
```python
292+
order = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3))))
293+
session.scalars(select(Item).order_by(order).limit(5))
294+
```
295+
296+
Re-rank by the original vectors for better recall
297+
298+
```python
299+
order = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3))))
300+
subquery = session.query(Item).order_by(order).limit(20).subquery()
301+
session.scalars(select(subquery).order_by(subquery.c.embedding.cosine_distance([3, -1, 2])).limit(5))
302+
```
303+
274304
#### Arrays
275305

276306
Add an array column

pgvector/sqlalchemy/bit.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,16 @@ def get_col_spec(self, **kw):
1717
return 'BIT(%d)' % self.length
1818

1919
def bind_processor(self, dialect):
20-
def process(value):
21-
value = Bit._to_db(value)
22-
if value and isinstance(dialect, PGDialect_asyncpg):
23-
return asyncpg.BitString(value)
24-
return value
25-
return process
26-
27-
def result_processor(self, dialect, coltype):
28-
def process(value):
29-
if value is None: return None
30-
else:
31-
if isinstance(dialect, PGDialect_asyncpg):
32-
return value.as_string()
33-
return Bit._from_db(value).to_text()
34-
return process
20+
if dialect.__class__.__name__ == 'PGDialect_asyncpg':
21+
import asyncpg
22+
23+
def process(value):
24+
if isinstance(value, str):
25+
return asyncpg.BitString(value)
26+
return value
27+
return process
28+
else:
29+
return super().bind_processor(dialect)
3530

3631
class comparator_factory(UserDefinedType.Comparator):
3732
def hamming_distance(self, other):

tests/test_sqlalchemy.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ class Item(Base):
103103
'sqlalchemy_orm_half_precision_index',
104104
func.cast(Item.embedding, HALFVEC(3)).label('embedding'),
105105
postgresql_using='hnsw',
106-
postgresql_with={'m': 16, 'ef_construction': 64},
107106
postgresql_ops={'embedding': 'halfvec_l2_ops'}
108107
)
109108
half_precision_index.create(setup_engine)
@@ -112,7 +111,6 @@ class Item(Base):
112111
'sqlalchemy_orm_binary_quantize_index',
113112
func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'),
114113
postgresql_using='hnsw',
115-
postgresql_with={'m': 16, 'ef_construction': 64},
116114
postgresql_ops={'embedding': 'bit_hamming_ops'}
117115
)
118116
binary_quantize_index.create(setup_engine)
@@ -528,6 +526,22 @@ def test_binary_quantize(self, engine):
528526
items = session.query(Item).order_by(distance).all()
529527
assert [v.id for v in items] == [2, 3, 1]
530528

529+
def test_binary_quantize_reranking(self, engine):
530+
# recreate index (could also vacuum table)
531+
binary_quantize_index.drop(setup_engine)
532+
binary_quantize_index.create(setup_engine)
533+
534+
with Session(engine) as session:
535+
session.add(Item(id=1, embedding=[-1, -2, -3]))
536+
session.add(Item(id=2, embedding=[1, -2, 3]))
537+
session.add(Item(id=3, embedding=[1, 2, 3]))
538+
session.commit()
539+
540+
distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3))))
541+
subquery = session.query(Item).order_by(distance).limit(20).subquery()
542+
items = session.query(subquery).order_by(subquery.c.embedding.cosine_distance([3, -1, 2])).limit(5).all()
543+
assert [v.id for v in items] == [2, 3, 1]
544+
531545

532546
@pytest.mark.parametrize('engine', array_engines)
533547
class TestSqlalchemyArray:
@@ -596,6 +610,11 @@ async def test_bit(self, engine):
596610
item = await session.get(Item, 1)
597611
assert item.binary_embedding == embedding
598612

613+
if engine == asyncpg_engine:
614+
session.add(Item(id=2, binary_embedding='101'))
615+
item = await session.get(Item, 2)
616+
assert item.binary_embedding == embedding
617+
599618
await engine.dispose()
600619

601620
@pytest.mark.asyncio

0 commit comments

Comments
 (0)