Skip to content

Commit 3443579

Browse files
authored
Author self-references (#91)
* feat: support per author analysis of self-references
* feat: include API key, robustify retries and name overlap check
* test: expand test for new cases
* fix: avoid None API key
* fix: async
* ci: use SS API key
* chore: upper bound impact factor
* fixes
1 parent e1bbd3f commit 3443579

File tree

13 files changed: +385 additions, -143 deletions

.github/workflows/test_pypi.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,6 @@ jobs:
2626
python -c "import paperscraper.server_dumps"
2727
python -c "import paperscraper.tests"
2828
python -c "import paperscraper.impact"
29+
python -c "import paperscraper.citations"
2930
3031

.github/workflows/test_tip.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ on: [push, release]
55
jobs:
66
test-source-install:
77
runs-on: ubuntu-latest
8+
env:
9+
SS_API_KEY: ${{ secrets.SS_API_KEY }}
810
strategy:
911
max-parallel: 3
1012
matrix:
@@ -49,6 +51,8 @@ jobs:
4951

5052
test-potential-wheel-install:
5153
runs-on: ubuntu-latest
54+
env:
55+
SS_API_KEY: ${{ secrets.SS_API_KEY }}
5256
steps:
5357
- name: Checkout code
5458
uses: actions/checkout@v2

paperscraper/async_utils.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,20 @@ def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:
4949

5050

5151
def retry_with_exponential_backoff(
52-
*, max_retries: int = 5, base_delay: float = 1.0
52+
*,
53+
max_retries: int = 5,
54+
base_delay: float = 1.0,
55+
factor: float = 1.3,
56+
constant_delay: float = 0.2,
5357
) -> Callable[[F], F]:
5458
"""
5559
Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
5660
5761
Args:
5862
max_retries: how many times to retry before giving up.
59-
base_delay: initial delay in seconds; next delays will be duplication of previous.
63+
base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
64+
factor: multiplier for delay after each retry.
65+
constant_delay: fixed delay before each attempt.
6066
6167
Usage:
6268
@@ -70,18 +76,39 @@ def decorator(func: F) -> F:
7076
@wraps(func)
7177
async def wrapper(*args, **kwargs) -> Any:
7278
delay = base_delay
73-
for attempt in range(max_retries):
79+
last_exception: BaseException | None = None
80+
for attempt in range(1, max_retries + 1):
81+
await asyncio.sleep(constant_delay)
7482
try:
7583
return await func(*args, **kwargs)
7684
except httpx.HTTPStatusError as e:
77-
# only retry on 429
7885
status = e.response.status_code if e.response is not None else None
79-
if status != 429 or attempt == max_retries - 1:
86+
if status != 429:
8087
raise
81-
# backoff
82-
await asyncio.sleep(delay)
83-
delay *= 2
84-
# in theory we never reach here
88+
last_exception = e
89+
sleep_for = delay
90+
if e.response is not None:
91+
ra = e.response.headers.get("Retry-After")
92+
if ra is not None:
93+
try:
94+
sleep_for = float(ra)
95+
except ValueError:
96+
pass
97+
delay *= factor
98+
99+
except httpx.ReadError as e:
100+
last_exception = e
101+
sleep_for = delay
102+
delay *= factor
103+
104+
if attempt == max_retries:
105+
msg = (
106+
f"{func.__name__} failed after {attempt} attempts with "
107+
f"last delay {sleep_for:.2f}s"
108+
)
109+
raise RuntimeError(msg) from last_exception
110+
111+
await asyncio.sleep(sleep_for)
85112

86113
return wrapper
87114

paperscraper/citations/entity/core.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,15 @@
55

66

77
class EntityResult(BaseModel):
8-
num_citations: int
9-
num_references: int
10-
# keys are authors or papers and values are absolute self links
11-
self_citations: Dict[str, int] = {}
12-
self_references: Dict[str, int] = {}
138
# aggregated results
149
self_citation_ratio: float = 0
1510
self_reference_ratio: float = 0
11+
# total number of author citations/references
12+
num_citations: int
13+
num_references: int
14+
# keys are papers and values are percentage of self citations/references
15+
self_citations: Dict[str, float] = {}
16+
self_references: Dict[str, float] = {}
1617

1718

1819
class Entity:

paperscraper/citations/entity/paper.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,33 +68,41 @@ def self_references(self):
6868
Extracts the self references of a paper, for each author.
6969
"""
7070
if isinstance(self.doi, str):
71-
self.ref_result: ReferenceResult = self_references_paper(self.doi)
71+
self.self_ref: ReferenceResult = self_references_paper(self.doi)
7272

7373
def self_citations(self):
7474
"""
7575
Extracts the self citations of a paper, for each author.
7676
"""
7777
if isinstance(self.doi, str):
78-
self.citation_result: CitationResult = self_citations_paper(self.doi)
78+
self.self_cite: CitationResult = self_citations_paper(self.doi)
7979

8080
def get_result(self) -> Optional[PaperResult]:
8181
"""
8282
Provides the result of the analysis.
8383
8484
Returns: PaperResult if available.
8585
"""
86-
if not hasattr(self, "ref_result"):
86+
if not hasattr(self, "self_ref"):
8787
logger.warning(
8888
f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
8989
)
9090
return
91-
elif not hasattr(self, "citation_result"):
91+
elif not hasattr(self, "self_cite"):
9292
logger.warning(
9393
f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
9494
)
9595
return
96-
ref_result = self.ref_result.model_dump()
97-
ref_result.pop("ssid", None)
9896
return PaperResult(
99-
title=self.title, **ref_result, **self.citation_result.model_dump()
97+
title=self.title,
98+
**{
99+
k: v
100+
for k, v in self.self_ref.model_dump().items()
101+
if k not in ["ssid", "title"]
102+
},
103+
**{
104+
k: v
105+
for k, v in self.self_cite.model_dump().items()
106+
if k not in ["title"]
107+
},
100108
)

paperscraper/citations/entity/researcher.py

Lines changed: 86 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
import asyncio
12
import os
2-
from typing import List, Literal, Optional
3+
from typing import Any, List, Literal, Optional, Tuple
34

45
from semanticscholar import SemanticScholar
5-
from tqdm import tqdm
66

77
from ..orcid import orcid_to_author_name
8-
from ..self_references import ReferenceResult
8+
from ..self_citations import CitationResult
9+
from ..self_references import ReferenceResult, self_references_paper
910
from ..utils import author_name_to_ssaid, get_papers_for_author
1011
from .core import Entity, EntityResult
1112

@@ -14,7 +15,27 @@ class ResearcherResult(EntityResult):
1415
name: str
1516
ssid: int
1617
orcid: Optional[str] = None
17-
# TODO: the ratios will be averaged across all papers for that author
18+
19+
def _ordered_items(self) -> List[Tuple[str, Any]]:
20+
# enforce specific ordering
21+
return [
22+
("name", self.name),
23+
("self_reference_ratio", self.self_reference_ratio),
24+
("self_citation_ratio", self.self_citation_ratio),
25+
("num_references", self.num_references),
26+
("num_citations", self.num_citations),
27+
("self_references", self.self_references),
28+
("self_citations", self.self_citations),
29+
("ssid", self.ssid),
30+
("orcid", self.orcid),
31+
]
32+
33+
def __repr__(self) -> str:
34+
inner = ", ".join(f"{k}={v!r}" for k, v in self._ordered_items())
35+
return f"{self.__class__.__name__}({inner})"
36+
37+
def __str__(self) -> str:
38+
return " ".join(f"{k}={v!r}" for k, v in self._ordered_items())
1839

1940

2041
ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
@@ -32,7 +53,7 @@ def __init__(self, input: str, mode: ModeType = "infer"):
3253
Construct researcher object for self citation/reference analysis.
3354
3455
Args:
35-
input: A researcher to search for.
56+
input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
3657
mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
3758
Defaults to "infer".
3859
@@ -53,32 +74,74 @@ def __init__(self, input: str, mode: ModeType = "infer"):
5374
):
5475
mode = "orcid"
5576
else:
56-
mode = "author"
57-
77+
mode = "name"
5878
if mode == "ssaid":
59-
self.author = sch.get_author(input)
79+
self.name = sch.get_author(input)._name
6080
self.ssid = input
6181
elif mode == "orcid":
62-
self.author = orcid_to_author_name(input)
82+
orcid_name = orcid_to_author_name(input)
6383
self.orcid = input
64-
self.ssid = author_name_to_ssaid(input)
65-
elif mode == "author":
66-
self.author = input
67-
self.ssid = author_name_to_ssaid(input)
68-
69-
# TODO: Skip over erratum / corrigendum
70-
self.ssids = get_papers_for_author(self.ssid)
71-
72-
def self_references(self):
84+
self.ssid, self.name = author_name_to_ssaid(orcid_name)
85+
elif mode == "name":
86+
name = input
87+
self.ssid, self.name = author_name_to_ssaid(input)
88+
89+
async def _self_references_async(
90+
self, verbose: bool = False
91+
) -> List[ReferenceResult]:
92+
"""Async version of self_references."""
93+
self.ssids = await get_papers_for_author(self.ssid)
94+
95+
results: List[ReferenceResult] = await self_references_paper(
96+
self.ssids, verbose=verbose
97+
)
98+
# Remove papers with zero references or that are erratum/corrigendum
99+
results = [
100+
r
101+
for r in results
102+
if r.num_references > 0
103+
and "erratum" not in r.title.lower()
104+
and "corrigendum" not in r.title.lower()
105+
]
106+
107+
return results
108+
109+
def self_references(self, verbose: bool = False) -> ResearcherResult:
73110
"""
74111
Sifts through all papers of a researcher and extracts the self references.
75-
"""
76-
# TODO: Asynchronous call to self_references
77-
print("Going through SSIDs", self.ssids)
78112
79-
# TODO: Aggregate results
113+
Args:
114+
verbose: If True, logs detailed information for each paper.
80115
81-
def self_citations(self):
116+
Returns:
117+
A ResearcherResult containing aggregated self-reference data.
118+
"""
119+
reference_results = asyncio.run(self._self_references_async(verbose=verbose))
120+
121+
individual_self_references = {
122+
getattr(result, "title"): getattr(result, "self_references").get(self.name, 0.0)
123+
for result in reference_results
124+
}
125+
reference_ratio = sum(individual_self_references.values()) / max(1, len(
126+
individual_self_references
127+
))
128+
return ResearcherResult(
129+
name=self.name,
130+
ssid=int(self.ssid),
131+
orcid=self.orcid,
132+
num_references=sum(r.num_references for r in reference_results),
133+
num_citations=-1,
134+
self_references=dict(
135+
sorted(
136+
individual_self_references.items(), key=lambda x: x[1], reverse=True
137+
)
138+
),
139+
self_citations={},
140+
self_reference_ratio=round(reference_ratio, 3),
141+
self_citation_ratio=-1.0,
142+
)
143+
144+
def self_citations(self) -> ResearcherResult:
82145
"""
83146
Sifts through all papers of a researcher and finds how often they are self-cited.
84147
"""

paperscraper/citations/self_citations.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
class CitationResult(BaseModel):
2020
ssid: str # semantic scholar paper id
21+
title: str
2122
num_citations: int
2223
self_citations: Dict[str, float] = {}
2324
citation_score: float
@@ -87,6 +88,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
8788

8889
return CitationResult(
8990
ssid=identifier,
91+
title=paper.get("title", ""),
9092
num_citations=total_cites,
9193
self_citations=ratios,
9294
citation_score=avg_score,

0 commit comments

Comments (0)