1
1
from __future__ import annotations
2
2
3
3
import ast
4
+ import contextlib
4
5
import csv
5
6
import logging
6
7
import os
30
31
)
31
32
32
33
from paperqa .utils import (
34
+ compute_unique_doc_id ,
33
35
create_bibtex_key ,
34
36
encode_id ,
35
37
format_bibtex ,
36
38
get_citation_ids ,
37
39
maybe_get_date ,
40
+ md5sum ,
38
41
)
39
42
from paperqa .version import __version__ as pqa_version
40
43
53
56
"docname" ,
54
57
"dockey" ,
55
58
"citation" ,
59
+ "content_hash" , # Metadata providers won't give this
56
60
}
61
+ # Sentinel to autopopulate a field within model_validator
62
+ AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design
57
63
58
64
59
65
class Doc (Embeddable ):
@@ -62,6 +68,13 @@ class Doc(Embeddable):
62
68
docname : str
63
69
dockey : DocKey
64
70
citation : str
71
+ content_hash : str | None = Field (
72
+ default = AUTOPOPULATE_VALUE ,
73
+ description = (
74
+ "Optional hash of the document's contents (to reiterate, not a file path to"
75
+ " the document, but the document's contents itself)."
76
+ ),
77
+ )
65
78
# Sort the serialization to minimize the diff of serialized objects
66
79
fields_to_overwrite_from_metadata : Annotated [set [str ], PlainSerializer (sorted )] = (
67
80
Field (
@@ -160,10 +173,6 @@ def __hash__(self) -> int:
160
173
return hash ((self .name , self .text ))
161
174
162
175
163
- # Sentinel to autopopulate a field within model_validator
164
- AUTOPOPULATE_VALUE = "" # NOTE: this is falsy by design
165
-
166
-
167
176
class Context (BaseModel ):
168
177
"""A class to hold the context of a question."""
169
178
@@ -570,8 +579,8 @@ class DocDetails(Doc):
570
579
doc_id : str | None = Field (
571
580
default = None ,
572
581
description = (
573
- "Unique ID for this document. Simple ways to acquire one include "
574
- " hashing the DOI or a stringifying a UUID ."
582
+ "Unique ID for this document. A simple and robust way to acquire one is "
583
+ " hashing the paper content's hash concatenate with the lowercased DOI ."
575
584
),
576
585
)
577
586
file_location : str | os .PathLike | None = Field (
@@ -630,9 +639,9 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, An
630
639
doi = doi .replace (url_prefix_to_remove , "" )
631
640
data ["doi" ] = doi .lower ()
632
641
if not data .get ("doc_id" ): # keep user defined doc_ids
633
- data ["doc_id" ] = encode_id (doi . lower ( ))
642
+ data ["doc_id" ] = compute_unique_doc_id (doi , data . get ( "content_hash" ))
634
643
elif not data .get ("doc_id" ): # keep user defined doc_ids
635
- data ["doc_id" ] = encode_id ( uuid4 ( ))
644
+ data ["doc_id" ] = compute_unique_doc_id ( doi , data . get ( "content_hash" ))
636
645
637
646
if "dockey" in data .get (
638
647
"fields_to_overwrite_from_metadata" ,
@@ -838,6 +847,17 @@ def populate_bibtex_key_citation(cls, data: dict[str, Any]) -> dict[str, Any]:
838
847
data ["citation" ] = data .get ("title" ) or CITATION_FALLBACK_DATA ["title" ]
839
848
return data
840
849
850
@classmethod
def populate_content_hash(cls, data: dict[str, Any]) -> dict[str, Any]:
    """Fill in ``content_hash`` when the caller left it to be autopopulated.

    A ``content_hash`` that is absent, or equal to ``AUTOPOPULATE_VALUE``,
    is replaced; any other value (including an explicit ``None``) is kept
    untouched.

    Args:
        data: Raw field mapping being validated; it is mutated in place.

    Returns:
        The same ``data`` mapping, with ``content_hash`` set to the
        ``md5sum`` of ``file_location`` when that can be computed, else
        ``None``.
    """
    requested = data.get("content_hash", AUTOPOPULATE_VALUE)
    if requested != AUTOPOPULATE_VALUE:
        # Caller supplied a value (possibly an explicit None): preserve it
        return data

    # Start from "unknown", then upgrade if the file can be hashed
    data["content_hash"] = None
    location = data.get("file_location")
    if not location:
        return data

    # A missing file is tolerated: the hash simply stays None
    with contextlib.suppress(FileNotFoundError):
        data["content_hash"] = md5sum(location)
    return data
860
+
841
861
@model_validator (mode = "before" )
842
862
@classmethod
843
863
def validate_all_fields (cls , data : Mapping [str , Any ]) -> dict [str , Any ]:
@@ -857,6 +877,7 @@ def validate_all_fields(cls, data: Mapping[str, Any]) -> dict[str, Any]:
857
877
data [possibly_str_field ], str
858
878
):
859
879
data [possibly_str_field ] = ast .literal_eval (data [possibly_str_field ])
880
+ data = cls .populate_content_hash (data )
860
881
data = cls .lowercase_doi_and_populate_doc_id (data )
861
882
data = cls .remove_invalid_authors (data )
862
883
data = cls .misc_string_cleaning (data )
@@ -1017,6 +1038,14 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
1017
1038
)
1018
1039
else :
1019
1040
merged_data [field ] = max (self_value , other_value )
1041
+ elif field == "content_hash" and ( # noqa: PLR0916
1042
+ # Hashes are both present but differ
1043
+ (self_value and other_value and self_value != other_value )
1044
+ # One hash is explicitly disabled (not autopopulated)
1045
+ or (self_value is None or other_value is None )
1046
+ ):
1047
+ # We don't know which to pick, so just discard the value
1048
+ merged_data [field ] = None
1020
1049
1021
1050
else :
1022
1051
# Prefer non-null values, default preference for 'other' object.
@@ -1031,10 +1060,13 @@ def __add__(self, other: DocDetails | int) -> DocDetails: # noqa: PLR0912
1031
1060
else self_value
1032
1061
)
1033
1062
1034
- # Recalculate doc_id if doi has changed
1035
- if merged_data ["doi" ] != self .doi :
1036
- merged_data ["doc_id" ] = (
1037
- encode_id (merged_data ["doi" ].lower ()) if merged_data ["doi" ] else None
1063
+ if (
1064
+ merged_data ["doi" ] != self .doi
1065
+ or merged_data ["content_hash" ] != self .content_hash
1066
+ ):
1067
+ # Recalculate doc_id if doi or content hash has changed
1068
+ merged_data ["doc_id" ] = compute_unique_doc_id (
1069
+ merged_data ["doi" ], merged_data .get ("content_hash" )
1038
1070
)
1039
1071
1040
1072
# Create and return new DocDetails instance
0 commit comments