Skip to content

Commit 1f510d0

Browse files
committed
Fixed duplicate keys
1 parent ffb2ef9 commit 1f510d0

File tree

4 files changed

+74
-10
lines changed

4 files changed

+74
-10
lines changed

paperqa/docs.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,34 @@ class Docs:
3939
def __init__(self, chunk_size_limit: int = 3000) -> None:
4040
self.docs = dict()
4141
self.chunk_size_limit = chunk_size_limit
42+
self.keys = set()
4243

4344
def add(self, path: str, citation: str, key: Optional[str] = None) -> bool:
4445
"""Add a document to the collection."""
46+
if path in self.docs:
47+
return False
4548
if key is None:
4649
# get first name and year from citation
47-
author = re.search(r"([A-Z][a-z]+)", citation).group(1)
48-
year = re.search(r"(\d{4})", citation).group(1)
50+
try:
51+
author = re.search(r"([A-Z][a-z]+)", citation).group(1)
52+
except AttributeError:
53+
# no capitalized word found in the citation to use as the author, so no key can be built
54+
return False
55+
try:
56+
year = re.search(r"(\d{4})", citation).group(1)
57+
except AttributeError:
58+
year = ""
4959
key = f"{author}{year}"
60+
suffix = ""
61+
while key + suffix in self.keys:
62+
# move suffix to next letter
63+
if suffix == "":
64+
suffix = "a"
65+
else:
66+
suffix = chr(ord(suffix) + 1)
67+
key += suffix
68+
self.keys.add(key)
69+
5070
data = {"citation": citation, "key": key}
5171
d = gpt_index.SimpleDirectoryReader(input_files=[path]).load_data()
5272
# loose check to see if document was loaded
@@ -124,7 +144,8 @@ def query(self, query: str, k: int = 3, max_sources: int = 5):
124144
else:
125145
answer = qa_chain.run(question=query, context_str=context_str)[1:]
126146
for data in self.docs.values():
127-
if data["key"] in answer:
147+
# match the whole key (followed by a space or a closing parenthesis) so that Callahan2019 is not matched by a citation of Callahan2019a
148+
if data["key"] + " " in answer or data["key"] + ")" in answer:
128149
bib.append(f'{data["key"]}: {data["citation"]}')
129150
bib_str = "\n\n".join(bib)
130151
formatted_answer = f"Question: {query}\n\n{answer}\n"

paperqa/qaprompts.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,14 @@
3232

3333
_qa_prompt = prompts.PromptTemplate(
3434
input_variables=["question", "context_str"],
35-
template="Generate a comprehensive answer (about 100 words) "
35+
template="Write a comprehensive answer (about 100 words) "
3636
"for the question below solely based on the provided context. "
37-
"If the context is insufficient to "
37+
"If the context is insufficient "
3838
'answer, reply "I cannot answer". '
3939
"For each sentence in your answer, indicate which sources most support it "
40-
"via valid citation markers at the end of sentences like (Foo2012). "
40+
"via valid citation markers at the end of sentences, like (Foo2012). "
4141
"Answer in an unbiased, balanced, and scientific tone. "
42-
# "To answer, start by writing out the reasoning and then "
42+
"Try to use the direct quotes, if present, from the context. "
4343
# "write a complete unbiased answer prefixed by \"Answer:\""
4444
"\n--------------------\n"
4545
"{context_str}\n"

paperqa/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.2"
1+
__version__ = "0.0.3"

tests/test_paperqa.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def test_docs():
3333
def test_evidence():
3434
doc_path = "example.txt"
3535
with open(doc_path, "w", encoding="utf-8") as f:
36-
# get wiki page about Obama
36+
# get wiki page about politician
3737
r = requests.get("https://en.wikipedia.org/wiki/Frederick_Bates_(politician)")
3838
f.write(r.text)
3939
docs = paperqa.Docs()
@@ -48,7 +48,7 @@ def test_evidence():
4848
def test_query():
4949
doc_path = "example.txt"
5050
with open(doc_path, "w", encoding="utf-8") as f:
51-
# get wiki page about Obama
51+
# get wiki page about politician
5252
r = requests.get("https://en.wikipedia.org/wiki/Frederick_Bates_(politician)")
5353
f.write(r.text)
5454
docs = paperqa.Docs()
@@ -73,3 +73,46 @@ def test_docs_pickle():
7373
"What is today?", k=1, max_sources=1
7474
) == docs2.get_evidence("What is today?", k=1, max_sources=1)
7575
os.remove(doc_path)
76+
77+
78+
def test_bad_context():
79+
doc_path = "example.txt"
80+
with open(doc_path, "w", encoding="utf-8") as f:
81+
# get wiki page about politician
82+
r = requests.get("https://en.wikipedia.org/wiki/Frederick_Bates_(politician)")
83+
f.write(r.text)
84+
docs = paperqa.Docs()
85+
docs.add(doc_path, "WikiMedia Foundation, 2023, Accessed now")
86+
answer = docs.query("What year was Barack Obama born greatest accomplishment?")
87+
assert (
88+
answer.answer
89+
== "I cannot answer this question due to insufficient information."
90+
)
91+
os.remove(doc_path)
92+
93+
94+
def test_repeat_keys():
95+
doc_path = "example.txt"
96+
with open(doc_path, "w", encoding="utf-8") as f:
97+
# get wiki page about politician
98+
r = requests.get("https://en.wikipedia.org/wiki/Frederick_Bates_(politician)")
99+
f.write(r.text)
100+
docs = paperqa.Docs()
101+
docs.add(doc_path, "WikiMedia Foundation, 2023, Accessed now")
102+
docs.add(doc_path, "WikiMedia Foundation, 2023, Accessed now")
103+
assert len(docs.docs) == 1
104+
105+
# now with different paths
106+
doc_path2 = "example2.txt"
107+
with open(doc_path2, "w", encoding="utf-8") as f:
108+
# reuse the wiki page downloaded above
109+
f.write(r.text)
110+
docs.add(doc_path2, "WikiMedia Foundation, 2023, Accessed now")
111+
assert len(docs.docs) == 2
112+
113+
# check keys
114+
assert docs.docs[doc_path]["key"] == "Wiki2023"
115+
assert docs.docs[doc_path2]["key"] == "Wiki2023a"
116+
117+
os.remove(doc_path)
118+
os.remove(doc_path2)

0 commit comments

Comments
 (0)