Commit f327933

Incorporated a Doc.content_hash to ensure doc_id uniqueness, with tests

1 parent a4c1633 commit f327933
File tree

7 files changed: +117 -32 lines changed

src/paperqa/clients/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -246,6 +246,8 @@ async def upgrade_doc_to_doc_details(self, doc: Doc, **kwargs) -> DocDetails:
             doc_details.key = doc.docname
         if "citation" in doc.fields_to_overwrite_from_metadata:
             doc_details.citation = doc.citation
+        if "content_hash" in doc.fields_to_overwrite_from_metadata:
+            doc_details.content_hash = doc.content_hash
         return provided_doc_details + doc_details

     # if we can't get metadata, just return the doc, but don't overwrite any fields
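For illustration, a minimal sketch of this opt-in propagation from the caller's side (the Doc values and digest are hypothetical, and we assume the default fields_to_overwrite_from_metadata now includes "content_hash", per the types.py hunk below):

from paperqa.types import Doc

doc = Doc(
    docname="flag",
    citation="WikiMedia Foundation, 2023",
    dockey="flag_day",
    content_hash="41f786fcc56d27ff0c1507153fae3774",  # hypothetical md5 hex digest
)
# Metadata providers never supply content_hash, so the locally computed hash
# survives a metadata upgrade whenever the field is opted in:
assert "content_hash" in doc.fields_to_overwrite_from_metadata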

src/paperqa/docs.py

Lines changed: 6 additions & 3 deletions
@@ -266,18 +266,20 @@ async def aadd(  # noqa: PLR0912
         """Add a document to the collection."""
         all_settings = get_settings(settings)
         parse_config = all_settings.parsing
+        content_hash = md5sum(path)
         dockey_is_content_hash = False
         if dockey is None:
-            # md5 sum of file contents (not path!)
-            dockey = md5sum(path)
+            dockey = content_hash
             dockey_is_content_hash = True
         if llm_model is None:
             llm_model = all_settings.get_llm()
         if citation is None:
             # Peek first chunk
             texts = await read_doc(
                 path,
-                Doc(docname="", citation="", dockey=dockey),  # Fake doc
+                Doc(  # Fake doc
+                    docname="", citation="", dockey=dockey, content_hash=content_hash
+                ),
                 page_size_limit=parse_config.page_size_limit,
                 parse_images=False,  # Peeking is text only
                 # We only use the first chunk, so let's peek just enough pages for that.
@@ -312,6 +314,7 @@ async def aadd(  # noqa: PLR0912
             ),
             citation=citation,
             dockey=dockey,
+            content_hash=content_hash,
         )

         # try to extract DOI / title from the citation
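A quick usage sketch (the path is hypothetical): md5sum digests the file's contents rather than its path, so the hash, and with it the default dockey, is stable across renames and moves:

from paperqa.utils import md5sum

content_hash = md5sum("papers/flag_day.html")  # hashes the bytes, not the path string
dockey = content_hash  # what aadd now defaults to when the caller passes no dockey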

src/paperqa/llms.py

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,7 @@
 )
 from typing_extensions import override

-from paperqa.types import Doc, Text
+from paperqa.types import AUTOPOPULATE_VALUE, Doc, Text

 if TYPE_CHECKING:
     from qdrant_client.http.models import Record
@@ -495,6 +495,7 @@ async def fetch_batch_with_semaphore(offset: int) -> None:
                 docname=doc_data.get("docname", ""),
                 citation=doc_data.get("citation", ""),
                 dockey=doc_data["dockey"],
+                content_hash=doc_data.get("content_hash", AUTOPOPULATE_VALUE),
             )
             docs.docnames.add(doc_data.get("docname", ""))
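The .get default matters for records persisted before this change; a sketch, with doc_data standing in for a stored Qdrant payload:

from paperqa.types import AUTOPOPULATE_VALUE

doc_data: dict = {"dockey": "flag_day"}  # hypothetical legacy payload, no stored hash
# Defaulting to the sentinel (rather than None) lets the model validator try to
# repopulate the hash later, instead of treating hashing as explicitly disabled:
content_hash = doc_data.get("content_hash", AUTOPOPULATE_VALUE)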

src/paperqa/types.py

Lines changed: 44 additions & 12 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import ast
+import contextlib
 import csv
 import hashlib
 import json
@@ -38,12 +39,14 @@
 )

 from paperqa.utils import (
+    compute_unique_doc_id,
     create_bibtex_key,
     encode_id,
     format_bibtex,
     get_citation_ids,
     get_parenthetical_substrings,
     maybe_get_date,
+    md5sum,
 )
 from paperqa.version import __version__ as pqa_version

@@ -64,7 +67,10 @@
     "docname",
     "dockey",
     "citation",
+    "content_hash",  # Metadata providers won't give this
 }
+# Sentinel to autopopulate a field within model_validator
+AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design


 class Doc(Embeddable):
@@ -73,6 +79,13 @@ class Doc(Embeddable):
     docname: str
     dockey: DocKey
     citation: str
+    content_hash: str | None = Field(
+        default=AUTOPOPULATE_VALUE,
+        description=(
+            "Optional hash of the document's contents (to reiterate, not a file path to"
+            " the document, but the document's contents itself)."
+        ),
+    )
     # Sort the serialization to minimize the diff of serialized objects
     fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
         Field(
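The sentinel default distinguishes "not yet populated" from a deliberate opt-out; a minimal sketch, assuming paperqa is importable:

from paperqa.types import AUTOPOPULATE_VALUE, Doc

doc = Doc(docname="bates", citation="WikiMedia Foundation, 2023", dockey="test1")
assert doc.content_hash == AUTOPOPULATE_VALUE  # falsy sentinel: fill in later
opted_out = Doc(
    docname="bates",
    citation="WikiMedia Foundation, 2023",
    dockey="test1",
    content_hash=None,  # explicit None: hashing disabled for this doc
)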
@@ -203,10 +216,6 @@ async def get_embeddable_text(self, with_enrichment: bool = False) -> str:
         return "\n\n".join((self.text, *enriched_media))


-# Sentinel to autopopulate a field within model_validator
-AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
-
-
 class Context(BaseModel):
     """A class to hold the context of a question."""

@@ -742,8 +751,8 @@ class DocDetails(Doc):
     doc_id: str | None = Field(
         default=None,
         description=(
-            "Unique ID for this document. Simple ways to acquire one include"
-            " hashing the DOI or a stringifying a UUID."
+            "Unique ID for this document. A simple and robust way to acquire one is"
+            " hashing the lowercased DOI concatenated with the paper's content hash."
         ),
     )
     file_location: str | os.PathLike | None = Field(
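The practical effect of the new scheme, sketched with a hypothetical DOI and made-up digests: two files sharing a DOI (say, a main text and its supplemental information) no longer collapse into one doc_id:

from paperqa.utils import compute_unique_doc_id

doi = "10.1234/flag"  # hypothetical
main_id = compute_unique_doc_id(doi, "41f786fcc56d27ff0c1507153fae3774")
si_id = compute_unique_doc_id(doi, "5300ef1d5fb960d7aaa91e843f9b70c1")
assert main_id != si_id  # previously both reduced to encode_id(doi.lower())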
@@ -811,9 +820,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
                 doi = doi.replace(url_prefix_to_remove, "")
             data["doi"] = doi.lower()
             if not data.get("doc_id"):  # keep user defined doc_ids
-                data["doc_id"] = encode_id(doi.lower())
+                data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
         elif not data.get("doc_id"):  # keep user defined doc_ids
-            data["doc_id"] = encode_id(uuid4())
+            data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))

         if "dockey" in data.get(
             "fields_to_overwrite_from_metadata",
@@ -1024,6 +1033,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
             data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
         return data

+    @classmethod
+    def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
+        if (  # Check for missing or autopopulate value, but preserve `None`
+            data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
+        ):
+            data["content_hash"] = None  # Assume we don't have it
+            if data.get("file_location"):  # Try to update it
+                with contextlib.suppress(FileNotFoundError):
+                    data["content_hash"] = md5sum(data["file_location"])
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
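A sketch of the helper's three-way behavior (the file path below is hypothetical):

from paperqa.types import DocDetails

# Sentinel (or a missing key): try to hash whatever is at file_location.
data = DocDetails.populate_content_hash(
    {"content_hash": "", "file_location": "papers/flag_day.html"}
)
# data["content_hash"] is now that file's md5 digest, or None if the file is gone.

# Explicit None survives untouched: the caller opted out of hashing.
assert DocDetails.populate_content_hash({"content_hash": None})["content_hash"] is None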
@@ -1049,6 +1069,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
                     data[possibly_str_field], str
                 ):
                     data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
+        data = cls.populate_content_hash(data)
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)
@@ -1209,6 +1230,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     )
                 else:
                     merged_data[field] = max(self_value, other_value)
+            elif field == "content_hash" and (
+                # Hashes are both present but differ
+                (self_value and other_value and self_value != other_value)
+                # One hash is explicitly disabled (not autopopulated)
+                or (self_value is None or other_value is None)
+            ):
+                # We don't know which to pick, so just discard the value
+                merged_data[field] = None

             else:
                 # Prefer non-null values, default preference for 'other' object.
@@ -1223,10 +1252,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     else self_value
                 )

-        # Recalculate doc_id if doi has changed
-        if merged_data["doi"] != self.doi:
-            merged_data["doc_id"] = (
-                encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None
+        if (
+            merged_data["doi"] != self.doi
+            or merged_data["content_hash"] != self.content_hash
+        ):
+            # Recalculate doc_id if doi or content hash has changed
+            merged_data["doc_id"] = compute_unique_doc_id(
+                merged_data["doi"], merged_data.get("content_hash")
             )

         # Create and return new DocDetails instance
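Restated as a standalone sketch (a hypothetical mirror, not the library API): a conflict or an explicit opt-out poisons the merged hash, which then feeds the doc_id recalculation above:

def merge_content_hash(self_value: str | None, other_value: str | None) -> str | None:
    """Hypothetical mirror of the __add__ branch above."""
    if (self_value and other_value and self_value != other_value) or (
        self_value is None or other_value is None
    ):
        return None  # conflicting, or one side explicitly opted out: discard
    return other_value or self_value  # otherwise the usual non-null preference


assert merge_content_hash("abc", "abc") == "abc"
assert merge_content_hash("abc", "") == "abc"  # sentinel loses to a real hash
assert merge_content_hash("abc", "def") is None  # conflict: don't guess
assert merge_content_hash("abc", None) is None  # explicit opt-out propagates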

src/paperqa/utils.py

Lines changed: 10 additions & 2 deletions
@@ -14,7 +14,7 @@
 from http import HTTPStatus
 from pathlib import Path
 from typing import Any, BinaryIO, ClassVar, TypeVar
-from uuid import UUID
+from uuid import UUID, uuid4

 import httpx
 from lmi import configure_llm_logs
@@ -94,7 +94,7 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float:

 def hexdigest(data: str | bytes) -> str:
     if isinstance(data, str):
-        return hashlib.md5(data.encode("utf-8")).hexdigest()  # noqa: S324
+        data = data.encode("utf-8")
     return hashlib.md5(data).hexdigest()  # noqa: S324


@@ -228,6 +228,14 @@ def encode_id(value: str | bytes | UUID, maxsize: int | None = 16) -> str:
     return hashlib.md5(value).hexdigest()[:maxsize]  # noqa: S324


+def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
+    if doi:
+        value_to_encode: str = doi.lower() + (content_hash or "")
+    else:
+        value_to_encode = content_hash or str(uuid4())
+    return encode_id(value_to_encode)
+
+
 def get_year(ts: datetime | None = None) -> str:
     """Get the year from the input datetime, otherwise using the current datetime."""
     if ts is None:
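A usage sketch of the new helper's fallback chain (the file path is hypothetical): DOI plus hash when both exist, the hash alone when there is no DOI, and a random UUID when neither is available:

from paperqa.utils import compute_unique_doc_id, encode_id, md5sum

file_hash = md5sum("papers/gravity_hill.md")  # hypothetical local file
assert compute_unique_doc_id(None, file_hash) == encode_id(file_hash)
assert compute_unique_doc_id("10.1234/Flag", file_hash) == encode_id(
    "10.1234/flag" + file_hash  # the DOI is lowercased before encoding
)
# With neither a DOI nor a hash, each call falls back to a fresh UUID:
assert compute_unique_doc_id(None, None) != compute_unique_doc_id(None, None)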

tests/test_agents.py

Lines changed: 12 additions & 7 deletions
@@ -61,7 +61,7 @@
 from paperqa.prompts import CANNOT_ANSWER_PHRASE, CONTEXT_INNER_PROMPT_NOT_DETAILED
 from paperqa.settings import AgentSettings, IndexSettings, Settings
 from paperqa.types import Context, Doc, PQASession, Text
-from paperqa.utils import encode_id, extract_thought, get_year, md5sum
+from paperqa.utils import compute_unique_doc_id, extract_thought, get_year, md5sum


 @pytest.mark.asyncio
@@ -119,11 +119,16 @@ async def test_get_directory_index(
     results = await index.query(query="what is a gravity hill?", min_score=5)
     assert results
     first_result = results[0]
+    assert len(first_result.docs) == 1, "Expected one result (gravity_hill.md)"
     target_doc_path = (paper_dir / "gravity_hill.md").absolute()
     expected_ids = {
-        md5sum(target_doc_path),  # What we actually expect
-        encode_id(
-            "10.2307/j.ctt5vkfh7.11"  # Crossref may match this Gravity Hill poem, lol
+        compute_unique_doc_id(
+            None,
+            md5sum(target_doc_path),  # What we actually expect
+        ),
+        compute_unique_doc_id(
+            "10.2307/j.ctt5vkfh7.11",  # Crossref may match this Gravity Hill poem, lol
+            next(iter(first_result.docs.values())).content_hash,
         ),
     }
     for expected_id in expected_ids:
@@ -134,9 +139,9 @@ async def test_get_directory_index(
             f"Failed to match an ID in {expected_ids}, got citations"
             f" {[d.formatted_citation for d in first_result.docs.values()]}."
         )
-    assert all(
-        x in first_result.docs[expected_id].formatted_citation
-        for x in ("Wikipedia", "Gravity")
+    assert (
+        "gravity hill"
+        in first_result.docs[expected_id].formatted_citation.lower()
     )

     # Check getting the same index name will not reprocess files

tests/test_paperqa.py

Lines changed: 41 additions & 7 deletions
@@ -17,6 +17,7 @@
 from unittest.mock import MagicMock, call, patch
 from uuid import UUID, uuid4

+import anyio
 import httpx
 import litellm
 import litellm.llms.anthropic.common_utils
@@ -849,9 +850,11 @@ async def test_get_reasoning(docs_fixture: Docs, llm: str, llm_settings: dict) -> None:


 @pytest.mark.asyncio
-async def test_duplicate(stub_data_dir: Path) -> None:
+async def test_duplicate(stub_data_dir: Path, tmp_path) -> None:
     """Check Docs doesn't store duplicates, while checking nonduplicate docs are stored."""
     docs = Docs()
+
+    # First, check adding a straight-up duplicate doc
     assert await docs.aadd(
         stub_data_dir / "bates.txt",
         citation="WikiMedia Foundation, 2023, Accessed now",
@@ -864,16 +867,44 @@ async def test_duplicate(stub_data_dir: Path) -> None:
             dockey="test1",
         )
         is None
-    )
+    ), "Expected duplicate add to indicate no new doc was added"
     assert len(docs.docs) == 1, "Should have added only one document"
+
+    # Next, check adding a different doc works, and also check citation inference
+    common_doi = "10.1234/flag"
     assert await docs.aadd(
-        stub_data_dir / "flag_day.html",
-        citation="WikiMedia Foundation, 2023, Accessed now",
-        dockey="test2",
+        stub_data_dir / "flag_day.html", dockey="flag_day", doi=common_doi
     )
     assert (
-        len(set(docs.docs.values())) == 2
+        len(set(docs.docs.keys())) == 2
     ), "Unique documents should be hashed as unique"
+    flag_day = docs.docs["flag_day"]
+    assert isinstance(flag_day, DocDetails)
+    assert flag_day.doi == common_doi
+    assert all(
+        x in flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert flag_day.content_hash
+
+    # Now, check adding a different file but same metadata
+    # (emulating main text vs supplemental information)
+    # will be seen as a different doc
+    flag_day_content = await anyio.Path(stub_data_dir / "flag_day.html").read_bytes()
+    assert len(flag_day_content) >= 1000, "Expected long file to test truncation"
+    await anyio.Path(tmp_path / "flag_day.html").write_bytes(flag_day_content[:-100])
+    assert await docs.aadd(
+        tmp_path / "flag_day.html", dockey="flag_day_shorter", doi=common_doi
+    )
+    assert len(set(docs.docs.keys())) == 3, "Expected a third document to be added"
+    shorter_flag_day = docs.docs["flag_day_shorter"]
+    assert isinstance(shorter_flag_day, DocDetails)
+    assert shorter_flag_day.doi == common_doi
+    assert all(
+        x in shorter_flag_day.citation.lower() for x in ("wikipedia", "flag")
+    ), "Expected citation to be inferred"
+    assert shorter_flag_day.content_hash
+    assert flag_day.content_hash != shorter_flag_day.content_hash
+    assert flag_day.doc_id != shorter_flag_day.doc_id


 @pytest.mark.asyncio
@@ -1173,6 +1204,7 @@ async def test_pdf_reader_w_no_match_doc_details(stub_data_dir: Path) -> None:
         "Wellawatte et al, XAI Review, 2023",
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # doc will be a DocDetails object, but nothing can be found
     # thus, we retain the prior citation data
@@ -1262,10 +1294,12 @@ async def test_pdf_reader_match_doc_details(stub_data_dir: Path) -> None:
         fields=["author", "journal", "citation_count"],
     )
     (doc_details,) = docs.docs.values()
+    assert doc_details.content_hash == "41f786fcc56d27ff0c1507153fae3774"
     assert doc_details.docname == docname, "Added name should match between details"
     # Crossref is non-deterministic in its ordering for results
+    # (it can give DOI '10.1021/acs.jctc.2c01235' or DOI '10.26434/chemrxiv-2022-qfv02')
     # thus we need to capture both possible dockeys
-    assert doc_details.dockey in {"d7763485f06aabde", "5300ef1d5fb960d7"}
+    assert doc_details.dockey in {"8ce7ddba9c9dcae6", "a353fa2478475c9c"}
     assert isinstance(doc_details, DocDetails)
     # note year is unknown because citation string is only parsed for authors/title/doi
     # AND we do not request it back from the metadata sources
