Adding Doc.content_hash and integrating for unique DocDetails.doc_id #1029
base: main
Changes from all commits: 42dc1e9, 6b3de75, a87fa17, 31cd959, 2776815, e36bbec, 1df5ece
@@ -265,18 +265,20 @@ async def aadd(  # noqa: PLR0912
"""Add a document to the collection."""
all_settings = get_settings(settings)
parse_config = all_settings.parsing
content_hash = md5sum(path)
dockey_is_content_hash = False
if dockey is None:
    # md5 sum of file contents (not path!)
Review comment: I moved this comment to …
    dockey = md5sum(path)
    dockey = content_hash
    dockey_is_content_hash = True
if llm_model is None:
    llm_model = all_settings.get_llm()
if citation is None:
    # Peek first chunk
    texts = await read_doc(
        path,
        Doc(docname="", citation="", dockey=dockey),  # Fake doc
        Doc(  # Fake doc
            docname="", citation="", dockey=dockey, content_hash=content_hash
        ),
        chunk_chars=parse_config.chunk_size,
        overlap=parse_config.overlap,
        page_size_limit=parse_config.page_size_limit,
@@ -307,6 +309,7 @@ async def aadd(  # noqa: PLR0912
    ),
    citation=citation,
    dockey=dockey,
    content_hash=content_hash,
)

# try to extract DOI / title from the citation
@@ -360,6 +363,7 @@ async def aadd(  # noqa: PLR0912
    clients=kwargs.pop("clients", DEFAULT_CLIENTS),
)

# Query here means a query to a metadata provider
query_kwargs: dict[str, Any] = {}

if doi:
@@ -1,6 +1,7 @@
from __future__ import annotations

import ast
import contextlib
import csv
import hashlib
import json
@@ -37,11 +38,13 @@

from paperqa.utils import (
    bytes_to_string,
    compute_unique_doc_id,
    create_bibtex_key,
    encode_id,
    format_bibtex,
    get_citation_ids,
    maybe_get_date,
    md5sum,
    string_to_bytes,
)
from paperqa.version import __version__ as pqa_version
@@ -61,7 +64,10 @@
    "docname",
    "dockey",
    "citation",
    "content_hash",  # Metadata providers won't give this
}
# Sentinel to autopopulate a field within model_validator
AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design


class Doc(Embeddable):
@@ -70,6 +76,13 @@ class Doc(Embeddable):
    docname: str
    dockey: DocKey
    citation: str
    content_hash: str | None = Field(
        default=AUTOPOPULATE_VALUE,
        description=(
            "Optional hash of the document's contents (to reiterate, not a file path to"
            " the document, but the document's contents itself)."
        ),
    )
    # Sort the serialization to minimize the diff of serialized objects
    fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
        Field(
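For illustration, a minimal sketch of building a Doc with the new field; the import paths and the file path are assumptions, while md5sum and the field names come from the diffs above:

from paperqa.types import Doc
from paperqa.utils import md5sum

content_hash = md5sum("papers/example.pdf")  # hash of the file's contents, not its path
doc = Doc(
    docname="example2024",
    citation="Example et al. (2024)",
    dockey=content_hash,  # aadd() above now defaults dockey to the content hash
    content_hash=content_hash,
)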
@@ -171,10 +184,6 @@ def __hash__(self) -> int:
    return hash((self.name, self.text))


# Sentinel to autopopulate a field within model_validator
AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design


class Context(BaseModel):
    """A class to hold the context of a question."""
@@ -660,11 +669,17 @@ class DocDetails(Doc):
    doc_id: str | None = Field(
        default=None,
        description=(
            "Unique ID for this document. Simple ways to acquire one include"
            " hashing the DOI or a stringifying a UUID."
            "Unique ID for this document. A simple and robust way to acquire one is"
            " hashing the paper content's hash concatenate with the lowercased DOI."
        ),
    )
    file_location: str | os.PathLike | None = Field(
        default=None,
        description=(
            "Optional path to the stored paper, if stored locally"
            " or in a mountable location such as a cloud bucket."
        ),
    )
    file_location: str | os.PathLike | None = None
    license: str | None = Field(
        default=None,
        description=(
@@ -713,10 +728,10 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
if doi.startswith(url_prefix_to_remove):
    doi = doi.replace(url_prefix_to_remove, "")
data["doi"] = doi.lower()
if "doc_id" not in data or not data["doc_id"]:  # keep user defined doc_ids
    data["doc_id"] = encode_id(doi.lower())
elif "doc_id" not in data or not data["doc_id"]:  # keep user defined doc_ids
    data["doc_id"] = encode_id(uuid4())
if not data.get("doc_id"):  # keep user defined doc_ids
    data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
Review comment: We talked about this in person, but let's move this into an optional feature.
Reply: I piloted this today some more btw. Moving … Perhaps a less awkward route is having an …
elif not data.get("doc_id"):  # keep user defined doc_ids
    data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))

if "dockey" in data.get(
    "fields_to_overwrite_from_metadata",
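For context, one possible shape of the opt-in floated in the review comments above; this is entirely hypothetical, and none of these names are part of the PR:

from uuid import uuid4
from paperqa.utils import compute_unique_doc_id, encode_id

def resolve_doc_id(doi: str | None, content_hash: str | None, opt_in: bool) -> str:
    # Hypothetical flag: only fold the content hash into doc_id when opted in
    if opt_in:
        return compute_unique_doc_id(doi, content_hash)
    # Pre-PR behavior: hash the lowercased DOI, else a random UUID
    return encode_id(doi.lower()) if doi else encode_id(uuid4())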
@@ -927,6 +942,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
    data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
    return data

@classmethod
def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
    if (  # Check for missing or autopopulate value, but preserve `None`
        data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
    ):
        data["content_hash"] = None  # Assume we don't have it
        if data.get("file_location"):  # Try to update it
            with contextlib.suppress(FileNotFoundError):
                data["content_hash"] = md5sum(data["file_location"])
    return data

@model_validator(mode="before")
@classmethod
def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
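A small sketch of how the new hook behaves when called directly on a plain dict, using the names from the diff above (the file path is hypothetical):

data = {"file_location": "papers/example.pdf", "content_hash": ""}  # "" is the AUTOPOPULATE_VALUE sentinel
data = DocDetails.populate_content_hash(data)
# content_hash is now md5sum("papers/example.pdf") if the file exists, or None if it
# does not (FileNotFoundError is suppressed); an explicit None supplied by the caller
# is preserved, and no hashing is attempted in that case.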
@@ -946,6 +972,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
        data[possibly_str_field], str
    ):
        data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
data = cls.populate_content_hash(data)
data = cls.lowercase_doi_and_populate_doc_id(data)
data = cls.remove_invalid_authors(data)
data = cls.misc_string_cleaning(data)
@@ -1046,7 +1073,7 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
if self.publication_date and other.publication_date:
    PREFER_OTHER = self.publication_date <= other.publication_date

merged_data = {}
merged_data: dict[str, Any] = {}
# pylint: disable-next=not-an-iterable  # pylint bug: https://github.com/pylint-dev/pylint/issues/10144
for field in type(self).model_fields:
    self_value = getattr(self, field)
@@ -1086,11 +1113,11 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
        )
        else other.authors
    )
    merged_data[field] = best_authors or None  # type: ignore[assignment]
    merged_data[field] = best_authors or None

elif field == "key" and self_value is not None and other_value is not None:
    # if we have multiple keys, we wipe them and allow regeneration
    merged_data[field] = None  # type: ignore[assignment]
    merged_data[field] = None

elif field in {"citation_count", "year", "publication_date"}:
    # get the latest data
@@ -1106,6 +1133,12 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
    )
    else:
        merged_data[field] = max(self_value, other_value)
elif field == "content_hash" and (
    self_value and other_value and self_value != other_value
):
    # If hashes are both present but differ,
    # we don't know which to pick, so just discard the value
    merged_data[field] = None

else:
    # Prefer non-null values, default preference for 'other' object.
@@ -1120,10 +1153,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
    else self_value
)

# Recalculate doc_id if doi has changed
if merged_data["doi"] != self.doi:
    merged_data["doc_id"] = (
        encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None  # type: ignore[attr-defined,assignment]
if (
    merged_data["doi"] != self.doi
    or merged_data["content_hash"] != self.content_hash
):
    # Recalculate doc_id if doi or content hash has changed
    merged_data["doc_id"] = compute_unique_doc_id(
        merged_data["doi"], merged_data.get("content_hash")
    )

# Create and return new DocDetails instance
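Taken together, the merge rules behave roughly like this sketch; the DOI and hash values are made up, and it assumes DocDetails can be constructed from just these fields:

a = DocDetails(doi="10.1000/xyz123", content_hash="aaa111")
b = DocDetails(doi="10.1000/xyz123", content_hash="bbb222")

merged = a + b
assert merged.content_hash is None  # conflicting hashes are discarded
assert merged.doc_id != a.doc_id  # content_hash changed relative to `a`, so doc_id is recomputed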
@@ -17,7 +17,7 @@
from http import HTTPStatus
from pathlib import Path
from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, TypeVar
from uuid import UUID
from uuid import UUID, uuid4

import aiohttp
import httpx
@@ -104,7 +104,7 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float:

def hexdigest(data: str | bytes) -> str:
    if isinstance(data, str):
        return hashlib.md5(data.encode("utf-8")).hexdigest()  # noqa: S324
        data = data.encode("utf-8")
    return hashlib.md5(data).hexdigest()  # noqa: S324
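The refactor keeps behavior identical for both input types, for example:

assert hexdigest("hello") == hexdigest(b"hello")  # str input is UTF-8 encoded before hashing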
@@ -217,6 +217,14 @@ def encode_id(value: str | bytes | UUID, maxsize: int | None = 16) -> str:
    return hashlib.md5(value).hexdigest()[:maxsize]  # noqa: S324


def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
    if doi:
        value_to_encode: str = doi.lower() + (content_hash or "")
    else:
        value_to_encode = content_hash or str(uuid4())
Review comment (Copilot): Using …
Suggested change: …
Reply: We recompute …
    return encode_id(value_to_encode)


def get_year(ts: datetime | None = None) -> str:
    """Get the year from the input datetime, otherwise using the current datetime."""
    if ts is None:
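A quick sketch of the resulting ID semantics (the DOI and hash values are made up):

# Deterministic whenever a DOI and/or content hash is available
assert compute_unique_doc_id("10.1000/XYZ123", "aaa111") == compute_unique_doc_id(
    "10.1000/xyz123", "aaa111"
)  # the DOI is lowercased before hashing
# The same DOI with different contents yields a different doc_id
assert compute_unique_doc_id("10.1000/xyz123", "aaa111") != compute_unique_doc_id(
    "10.1000/xyz123", "bbb222"
)
# With neither a DOI nor a content hash, a random uuid4 is hashed,
# so every call returns a new ID
assert compute_unique_doc_id(None, None) != compute_unique_doc_id(None, None)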
Review comment: The usage of "extra" here is an overloaded term (DocDetails.other, to add to the confusion). Paired with vagueness such as "some extra fields", I decided to rename and reword this whole section.