 from __future__ import annotations
 
 import ast
+import contextlib
 import csv
 import hashlib
 import json
...
 )
 
 from paperqa.utils import (
+    compute_unique_doc_id,
     create_bibtex_key,
     encode_id,
     format_bibtex,
     get_citation_ids,
     get_parenthetical_substrings,
     maybe_get_date,
+    md5sum,
 )
 from paperqa.version import __version__ as pqa_version
 
...
     "docname",
     "dockey",
     "citation",
+    "content_hash",  # Metadata providers won't give this
 }
+# Sentinel to autopopulate a field within model_validator
+AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
 
 
 class Doc(Embeddable):
@@ -73,6 +79,13 @@ class Doc(Embeddable):
     docname: str
     dockey: DocKey
     citation: str
+    content_hash: str | None = Field(
+        default=AUTOPOPULATE_VALUE,
+        description=(
+            "Optional hash of the document's contents (to reiterate, not a file path to"
+            " the document, but the document's contents themselves)."
+        ),
+    )
     # Sort the serialization to minimize the diff of serialized objects
     fields_to_overwrite_from_metadata: Annotated[set[str], PlainSerializer(sorted)] = (
         Field(
@@ -203,10 +216,6 @@ async def get_embeddable_text(self, with_enrichment: bool = False) -> str:
         return "\n\n".join((self.text, *enriched_media))
 
 
-# Sentinel to autopopulate a field within model_validator
-AUTOPOPULATE_VALUE = ""  # NOTE: this is falsy by design
-
-
 class Context(BaseModel):
     """A class to hold the context of a question."""
 
@@ -742,8 +751,8 @@ class DocDetails(Doc):
     doc_id: str | None = Field(
         default=None,
         description=(
-            "Unique ID for this document. Simple ways to acquire one include"
-            " hashing the DOI or stringifying a UUID."
+            "Unique ID for this document. A simple and robust way to acquire one is"
+            " hashing the paper's content hash concatenated with the lowercased DOI."
         ),
     )
     file_location: str | os.PathLike | None = Field(
@@ -811,9 +820,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, Any]:
                 doi = doi.replace(url_prefix_to_remove, "")
             data["doi"] = doi.lower()
             if not data.get("doc_id"):  # keep user defined doc_ids
-                data["doc_id"] = encode_id(doi.lower())
+                data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
         elif not data.get("doc_id"):  # keep user defined doc_ids
-            data["doc_id"] = encode_id(uuid4())
+            data["doc_id"] = compute_unique_doc_id(doi, data.get("content_hash"))
 
         if "dockey" in data.get(
             "fields_to_overwrite_from_metadata",
@@ -1024,6 +1033,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
             data["citation"] = data.get("title") or CITATION_FALLBACK_DATA["title"]
         return data
 
+    @classmethod
+    def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
+        if (  # Check for missing or autopopulate value, but preserve `None`
+            data.get("content_hash", AUTOPOPULATE_VALUE) == AUTOPOPULATE_VALUE
+        ):
+            data["content_hash"] = None  # Assume we don't have it
+            if data.get("file_location"):  # Try to update it
+                with contextlib.suppress(FileNotFoundError):
+                    data["content_hash"] = md5sum(data["file_location"])
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
@@ -1049,6 +1069,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
                 data[possibly_str_field], str
             ):
                 data[possibly_str_field] = ast.literal_eval(data[possibly_str_field])
+        data = cls.populate_content_hash(data)
         data = cls.lowercase_doi_and_populate_doc_id(data)
         data = cls.remove_invalid_authors(data)
         data = cls.misc_string_cleaning(data)
@@ -1209,6 +1230,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     )
                 else:
                     merged_data[field] = max(self_value, other_value)
+            elif field == "content_hash" and (
+                # Hashes are both present but differ
+                (self_value and other_value and self_value != other_value)
+                # One hash is explicitly disabled (not autopopulated)
+                or (self_value is None or other_value is None)
+            ):
+                # We don't know which to pick, so just discard the value
+                merged_data[field] = None
 
             else:
                 # Prefer non-null values, default preference for 'other' object.
@@ -1223,10 +1252,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails:  # noqa: PLR0912
                     else self_value
                 )
 
-        # Recalculate doc_id if doi has changed
-        if merged_data["doi"] != self.doi:
-            merged_data["doc_id"] = (
-                encode_id(merged_data["doi"].lower()) if merged_data["doi"] else None
+        if (
+            merged_data["doi"] != self.doi
+            or merged_data["content_hash"] != self.content_hash
+        ):
+            # Recalculate doc_id if doi or content hash has changed
+            merged_data["doc_id"] = compute_unique_doc_id(
+                merged_data["doi"], merged_data.get("content_hash")
            )
 
        # Create and return new DocDetails instance
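
The compute_unique_doc_id helper imported above is not shown in this diff. Going by the updated doc_id field description and its call sites in lowercase_doi_and_populate_doc_id and __add__, a minimal sketch of its likely behavior follows; the exact hashing and the fallbacks for a missing DOI or content hash are assumptions, not the actual paperqa.utils implementation.

# Hypothetical sketch only: inferred from the doc_id description and the call
# sites in this diff, not the real paperqa.utils.compute_unique_doc_id.
from uuid import uuid4

from paperqa.utils import encode_id


def compute_unique_doc_id(doi: str | None, content_hash: str | None) -> str:
    """Derive a stable doc_id from the content hash and lowercased DOI."""
    if doi and content_hash:
        # Per the doc_id description: hash the content hash concatenated with the lowercased DOI
        return encode_id(f"{content_hash}{doi.lower()}")
    if doi:  # No content hash available: assume the old DOI-only behavior
        return encode_id(doi.lower())
    if content_hash:  # No DOI: assume keying on the content hash alone
        return encode_id(content_hash)
    return encode_id(uuid4())  # Nothing stable to key on: mirror the old UUID fallback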