
Adding Doc.content_hash and integrating for unique DocDetails.doc_id #1029

Open. Wants to merge 7 commits into base: main.
25 changes: 13 additions & 12 deletions src/paperqa/clients/__init__.py
@@ -208,16 +208,16 @@ async def bulk_query(
)

async def upgrade_doc_to_doc_details(self, doc: Doc, **kwargs) -> DocDetails:

# note we have some extra fields which may have come from reading the doc text,
# but aren't in the doc object, we add them here too.
extra_fields = {
Collaborator Author:

The term "extra" here is overloaded:

  • Extra in a Pydantic context (and we have DocDetails.other, to add to the confusion)
  • Extra in a query context

Paired with vague wording like "some extra fields", I decided to rename the variable and reword this whole section.
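
To make the `int | DocDetails` no-op route mentioned in the new comments concrete, here is a minimal, generic sketch of the pattern with a hypothetical `Details` class (for illustration only, not paperqa's actual `DocDetails.__add__`): an int operand acts as an identity element, so `0 + details` returns `details` unchanged and summing a list of details objects works.

```python
from __future__ import annotations

from dataclasses import dataclass


@dataclass
class Details:
    """Hypothetical stand-in for DocDetails, just to show the no-op int route."""

    title: str | None = None
    doi: str | None = None

    def __add__(self, other: Details | int) -> Details:
        if isinstance(other, int):  # No-op route: nothing to merge
            return self
        # Prefer the other object's non-null values, mirroring merge semantics
        return Details(title=other.title or self.title, doi=other.doi or self.doi)

    __radd__ = __add__  # Lets `0 + details` (and summing a list) work too


merged = 0 + Details(title="Gravity Hill")  # Returns the Details unchanged
assert merged.title == "Gravity Hill"
```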

# Collect fields (e.g. title, DOI, or authors) that have been externally
# specified (e.g. by a caller, or inferred from the document's contents)
# but are not on the input `doc` object
provided_fields = {
k: v for k, v in kwargs.items() if k in set(DocDetails.model_fields)
}
# abuse our doc_details object to be an int if it's empty
# our __add__ operation supports int by doing nothing
extra_doc: int | DocDetails = (
0 if not extra_fields else DocDetails(**extra_fields)
# DocDetails.__add__ supports `int` as a no-op route, so if we have no
# provided fields, let's use that no-op route
provided_doc_details: int | DocDetails = (
0 if not provided_fields else DocDetails(**provided_fields)
)

if doc_details := await self.query(**kwargs):
@@ -233,9 +233,10 @@ async def upgrade_doc_to_doc_details(self, doc: Doc, **kwargs) -> DocDetails:
doc_details.key = doc.docname
if "citation" in doc.fields_to_overwrite_from_metadata:
doc_details.citation = doc.citation
return extra_doc + doc_details
if "content_hash" in doc.fields_to_overwrite_from_metadata:
doc_details.content_hash = doc.content_hash
return provided_doc_details + doc_details

# if we can't get metadata, just return the doc, but don't overwrite any fields
prior_doc = doc.model_dump()
prior_doc["fields_to_overwrite_from_metadata"] = set()
return DocDetails(**(prior_doc | extra_fields))
orig_fields = doc.model_dump() | {"fields_to_overwrite_from_metadata": set()}
return DocDetails(**(orig_fields | provided_fields))
10 changes: 7 additions & 3 deletions src/paperqa/docs.py
@@ -265,18 +265,20 @@ async def aadd( # noqa: PLR0912
"""Add a document to the collection."""
all_settings = get_settings(settings)
parse_config = all_settings.parsing
content_hash = md5sum(path)
dockey_is_content_hash = False
if dockey is None:
# md5 sum of file contents (not path!)
Collaborator Author:

I moved this comment to Doc.content_hash's description, fyi
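
For clarity, the content hash is computed over the file's bytes, not over the path string. A minimal sketch of what md5sum presumably does (illustrative only; the real helper lives in paperqa.utils and may read the file differently, e.g. in chunks):

```python
import hashlib
from pathlib import Path


def md5sum_sketch(path: str | Path) -> str:
    """Hash the file's contents (not the path string itself)."""
    return hashlib.md5(Path(path).read_bytes()).hexdigest()  # noqa: S324
```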

dockey = md5sum(path)
dockey = content_hash
dockey_is_content_hash = True
if llm_model is None:
llm_model = all_settings.get_llm()
if citation is None:
# Peek first chunk
texts = await read_doc(
path,
Doc(docname="", citation="", dockey=dockey), # Fake doc
Doc( # Fake doc
docname="", citation="", dockey=dockey, content_hash=content_hash
),
chunk_chars=parse_config.chunk_size,
overlap=parse_config.overlap,
page_size_limit=parse_config.page_size_limit,
@@ -307,6 +309,7 @@ async def aadd( # noqa: PLR0912
),
citation=citation,
dockey=dockey,
content_hash=content_hash,
)

# try to extract DOI / title from the citation
@@ -360,6 +363,7 @@ async def aadd( # noqa: PLR0912
clients=kwargs.pop("clients", DEFAULT_CLIENTS),
)

# Query here means a query to a metadata provider
query_kwargs: dict[str, Any] = {}

if doi:
3 changes: 2 additions & 1 deletion src/paperqa/llms.py
@@ -30,7 +30,7 @@
)
from typing_extensions import override

from paperqa.types import Doc, Text
from paperqa.types import AUTOPOPULATE_VALUE, Doc, Text

if TYPE_CHECKING:
from qdrant_client.http.models import Record
@@ -495,6 +495,7 @@ async def fetch_batch_with_semaphore(offset: int) -> None:
docname=doc_data.get("docname", ""),
citation=doc_data.get("citation", ""),
dockey=doc_data["dockey"],
content_hash=doc_data.get("content_hash", AUTOPOPULATE_VALUE),
)
docs.docnames.add(doc_data.get("docname", ""))

72 changes: 54 additions & 18 deletions src/paperqa/types.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import ast
import contextlib
import csv
import hashlib
import json
@@ -37,11 +38,13 @@

from paperqa.utils import (
bytes_to_string,
compute_unique_doc_id,
create_bibtex_key,
encode_id,
format_bibtex,
get_citation_ids,
maybe_get_date,
md5sum,
string_to_bytes,
)
from paperqa.version import __version__ as pqa_version
@@ -61,7 +64,10 @@
"docname",
"dockey",
"citation",
"content_hash", # Metadata providers won't give this
}
# Sentinel to autopopulate a field within model_validator
AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design


class Doc(Embeddable):
@@ -70,6 +76,13 @@ class Doc(Embeddable):
docname: str
dockey: DocKey
citation: str
content_hash: str | None = Field(
default=AUTOPOPULATE_VALUE,
description=(
"Optional hash of the document's contents (to reiterate, not a file path to"
" the document, but the document's contents itself)."
),
)
# Sort the serialization to minimize the diff of serialized objects
fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
Field(
@@ -171,10 +184,6 @@ def __hash__(self) -> int:
return hash((self.name, self.text))


# Sentinel to autopopulate a field within model_validator
AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design


class Context(BaseModel):
"""A class to hold the context of a question."""

@@ -660,11 +669,17 @@ class DocDetails(Doc):
doc_id: str | None = Field(
default=None,
description=(
"Unique ID for this document. Simple ways to acquire one include"
" hashing the DOI or a stringifying a UUID."
"Unique ID for this document. A simple and robust way to acquire one is"
" hashing the paper content's hash concatenate with the lowercased DOI."
),
)
file_location: str | os.PathLike | None = Field(
default=None,
description=(
"Optional path to the stored paper, if stored locally"
" or in a mountable location such as a cloud bucket."
),
)
file_location: str | os.PathLike | None = None
license: str | None = Field(
default=None,
description=(
@@ -713,10 +728,10 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
if doi.startswith(url_prefix_to_remove):
doi = doi.replace(url_prefix_to_remove, "")
data["doi"] = doi.lower()
if "doc_id" not in data or not data["doc_id"]: # keep user defined doc_ids
data["doc_id"] = encode_id(doi.lower())
elif "doc_id" not in data or not data["doc_id"]: # keep user defined doc_ids
data["doc_id"] = encode_id(uuid4())
if not data.get("doc_id"): # keep user defined doc_ids
data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
Collaborator:

we talked about this in person, but let's move this into an optional feature

Collaborator Author:

I piloted this some more today, btw. Moving compute_unique_doc_id into something like Settings.details_id_factory requires us to pass a throwaway Settings reference to all DocDetails constructions, which is both awkward and error-prone.

Perhaps a less awkward route is having a boolean environment variable, PQA_USE_OLD_ID_FACTORY, that gets used inside compute_unique_doc_id.
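
A rough sketch of that route, assuming the proposed PQA_USE_OLD_ID_FACTORY name and simple truthiness parsing (this is only the idea from this thread, not merged behavior):

```python
import os
from uuid import uuid4

from paperqa.utils import encode_id


def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
    if os.environ.get("PQA_USE_OLD_ID_FACTORY", "").lower() in {"1", "true"}:
        # Old behavior: hash the lowercased DOI, or a random UUID if no DOI
        return encode_id(doi.lower() if doi else uuid4())
    if doi:
        value_to_encode: str = doi.lower() + (content_hash or "")
    else:
        value_to_encode = content_hash or str(uuid4())
    return encode_id(value_to_encode)
```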

elif not data.get("doc_id"): # keep user defined doc_ids
data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))

if "dockey" in data.get(
"fields_to_overwrite_from_metadata",
@@ -927,6 +942,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
return data

@classmethod
def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
if ( # Check for missing or autopopulate value, but preserve `None`
data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
):
data["content_hash"] = None # Assume we don't have it
if data.get("file_location"): # Try to update it
with contextlib.suppress(FileNotFoundError):
data["content_hash"] = md5sum(data["file_location"])
return data

@model_validator(mode="before")
@classmethod
def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
@@ -946,6 +972,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
data[possibly_str_field], str
):
data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
data = cls.populate_content_hash(data)
data = cls.lowercase_doi_and_populate_doc_id(data)
data = cls.remove_invalid_authors(data)
data = cls.misc_string_cleaning(data)
@@ -1046,7 +1073,7 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
if self.publication_date and other.publication_date:
PREFER_OTHER = self.publication_date <= other.publication_date

merged_data = {}
merged_data: dict[str, Any] = {}
# pylint: disable-next=not-an-iterable # pylint bug: https://github.com/pylint-dev/pylint/issues/10144
for field in type(self).model_fields:
self_value = getattr(self, field)
@@ -1086,11 +1113,11 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
)
else other.authors
)
merged_data[field] = best_authors or None # type: ignore[assignment]
merged_data[field] = best_authors or None

elif field == "key" and self_value is not None and other_value is not None:
# if we have multiple keys, we wipe them and allow regeneration
merged_data[field] = None # type: ignore[assignment]
merged_data[field] = None

elif field in {"citation_count", "year", "publication_date"}:
# get the latest data
@@ -1106,6 +1133,12 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
)
else:
merged_data[field] = max(self_value, other_value)
elif field == "content_hash" and (
self_value and other_value and self_value != other_value
):
# If hashes are both present but differ,
# we don't know which to pick, so just discard the value
merged_data[field] = None

else:
# Prefer non-null values, default preference for 'other' object.
@@ -1120,10 +1153,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
else self_value
)

# Recalculate doc_id if doi has changed
if merged_data["doi"] != self.doi:
merged_data["doc_id"] = (
encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None # type: ignore[attr-defined,assignment]
if (
merged_data["doi"] != self.doi
or merged_data["content_hash"] != self.content_hash
):
# Recalculate doc_id if doi or content hash has changed
merged_data["doc_id"] = compute_unique_doc_id(
merged_data["doi"], merged_data.get("content_hash")
)

# Create and return new DocDetails instance
12 changes: 10 additions & 2 deletions src/paperqa/utils.py
@@ -17,7 +17,7 @@
from http import HTTPStatus
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, TypeVar
from uuid import UUID
from uuid import UUID, uuid4

import aiohttp
import httpx
@@ -104,7 +104,7 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float:

def hexdigest(data: str | bytes) -> str:
if isinstance(data, str):
return hashlib.md5(data.encode("utf-8")).hexdigest() # noqa: S324
data = data.encode("utf-8")
return hashlib.md5(data).hexdigest() # noqa: S324


@@ -217,6 +217,14 @@ def encode_id(value: str | bytes | UUID, maxsize: int | None = 16) -> str:
return hashlib.md5(value).hexdigest()[:maxsize] # noqa: S324


def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
if doi:
value_to_encode: str = doi.lower() + (content_hash or "")
else:
value_to_encode = content_hash or str(uuid4())
Copilot AI (Jul 24, 2025):
Using str(uuid4()) as a fallback when both DOI and content_hash are None could lead to non-deterministic document IDs for the same document across different runs. Consider if this is the intended behavior or if a more deterministic fallback would be appropriate.

Suggested change
value_to_encode = content_hash or str(uuid4())
value_to_encode = content_hash or "default"


Collaborator Author:

We recompute doc_id in DocDetails.__add__ if the DOI or content hash changes, so I believe we're safe

return encode_id(value_to_encode)
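
To illustrate the fallback order being debated here (hypothetical inputs; the DOI-less, hash-less branch is intentionally random, relying on the doc_id recomputation in DocDetails.__add__ once real metadata arrives):

```python
from paperqa.utils import compute_unique_doc_id

fake_hash = "d41d8cd98f00b204e9800998ecf8427e"  # Hypothetical md5 content hash

id_with_doi = compute_unique_doc_id("10.2307/j.ctt5vkfh7.11", fake_hash)
id_hash_only = compute_unique_doc_id(None, fake_hash)  # No DOI: encode the content hash
id_fallback = compute_unique_doc_id(None, None)  # Neither: encodes a random UUID

# The first two are deterministic for identical inputs; the last one is not
assert id_with_doi == compute_unique_doc_id("10.2307/j.ctt5vkfh7.11", fake_hash)
```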


def get_year(ts: datetime | None = None) -> str:
"""Get the year from the input datetime, otherwise using the current datetime."""
if ts is None:
19 changes: 12 additions & 7 deletions tests/test_agents.py
@@ -60,7 +60,7 @@
from paperqa.prompts import CANNOT_ANSWER_PHRASE, CONTEXT_INNER_PROMPT_NOT_DETAILED
from paperqa.settings import AgentSettings, IndexSettings, Settings
from paperqa.types import Context, Doc, PQASession, Text
from paperqa.utils import encode_id, extract_thought, get_year, md5sum
from paperqa.utils import compute_unique_doc_id, extract_thought, get_year, md5sum


@pytest.mark.asyncio
@@ -117,11 +117,16 @@ async def test_get_directory_index(
results = await index.query(query="what is a gravity hill?", min_score=5)
assert results
first_result = results[0]
assert len(first_result.docs) == 1, "Expected one result (gravity_hill.md)"
target_doc_path = (paper_dir / "gravity_hill.md").absolute()
expected_ids = {
md5sum(target_doc_path), # What we actually expect
encode_id(
"10.2307/j.ctt5vkfh7.11" # Crossref may match this Gravity Hill poem, lol
compute_unique_doc_id(
None,
md5sum(target_doc_path), # What we actually expect
),
compute_unique_doc_id(
"10.2307/j.ctt5vkfh7.11", # Crossref may match this Gravity Hill poem, lol
next(iter(first_result.docs.values())).content_hash,
),
}
for expected_id in expected_ids:
@@ -132,9 +137,9 @@ async def test_get_directory_index(
f"Failed to match an ID in {expected_ids}, got citations"
f" {[d.formatted_citation for d in first_result.docs.values()]}."
)
assert all(
x in first_result.docs[expected_id].formatted_citation
for x in ("Wikipedia", "Gravity")
assert (
"gravity hill"
in first_result.docs[expected_id].formatted_citation.lower()
)

# Check getting the same index name will not reprocess files
Expand Down
Loading