Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ jobs:
python -c "import paperscraper.server_dumps"
python -c "import paperscraper.tests"
python -c "import paperscraper.impact"
python -c "import paperscraper.citations"


4 changes: 4 additions & 0 deletions .github/workflows/test_tip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ on: [push, release]
jobs:
test-source-install:
runs-on: ubuntu-latest
env:
SS_API_KEY: ${{ secrets.SS_API_KEY }}
strategy:
max-parallel: 3
matrix:
Expand Down Expand Up @@ -49,6 +51,8 @@ jobs:

test-potential-wheel-install:
runs-on: ubuntu-latest
env:
SS_API_KEY: ${{ secrets.SS_API_KEY }}
steps:
- name: Checkout code
uses: actions/checkout@v2
Expand Down
45 changes: 36 additions & 9 deletions paperscraper/async_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,20 @@ def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:


def retry_with_exponential_backoff(
*, max_retries: int = 5, base_delay: float = 1.0
*,
max_retries: int = 5,
base_delay: float = 1.0,
factor: float = 1.3,
constant_delay: float = 0.2,
) -> Callable[[F], F]:
"""
Decorator factory that retries an `async def` on HTTP 429 or `httpx.ReadError`, with exponential backoff.

Args:
max_retries: maximum number of attempts (including the first) before giving up.
base_delay: initial delay in seconds; next delays will be duplication of previous.
base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
factor: multiplier for delay after each retry.
constant_delay: fixed delay before each attempt.

Usage:

Expand All @@ -70,18 +76,39 @@ def decorator(func: F) -> F:
@wraps(func)
async def wrapper(*args, **kwargs) -> Any:
delay = base_delay
for attempt in range(max_retries):
last_exception: BaseException | None = None
for attempt in range(1, max_retries + 1):
await asyncio.sleep(constant_delay)
try:
return await func(*args, **kwargs)
except httpx.HTTPStatusError as e:
# only retry on 429
status = e.response.status_code if e.response is not None else None
if status != 429 or attempt == max_retries - 1:
if status != 429:
raise
# backoff
await asyncio.sleep(delay)
delay *= 2
# in theory we never reach here
last_exception = e
sleep_for = delay
if e.response is not None:
ra = e.response.headers.get("Retry-After")
if ra is not None:
try:
sleep_for = float(ra)
except ValueError:
pass
delay *= factor

except httpx.ReadError as e:
last_exception = e
sleep_for = delay
delay *= factor

if attempt == max_retries:
msg = (
f"{func.__name__} failed after {attempt} attempts with "
f"last delay {sleep_for:.2f}s"
)
raise RuntimeError(msg) from last_exception

await asyncio.sleep(sleep_for)

return wrapper

Expand Down
11 changes: 6 additions & 5 deletions paperscraper/citations/entity/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@


class EntityResult(BaseModel):
num_citations: int
num_references: int
# keys are authors or papers and values are absolute self links
self_citations: Dict[str, int] = {}
self_references: Dict[str, int] = {}
# aggregated results
self_citation_ratio: float = 0
self_reference_ratio: float = 0
# total number of author citations/references
num_citations: int
num_references: int
# keys are papers and values are percentage of self citations/references
self_citations: Dict[str, float] = {}
self_references: Dict[str, float] = {}


class Entity:
Expand Down
22 changes: 15 additions & 7 deletions paperscraper/citations/entity/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,33 +68,41 @@ def self_references(self):
Extracts the self references of a paper, for each author.
"""
if isinstance(self.doi, str):
self.ref_result: ReferenceResult = self_references_paper(self.doi)
self.self_ref: ReferenceResult = self_references_paper(self.doi)

def self_citations(self):
"""
Extracts the self citations of a paper, for each author.
"""
if isinstance(self.doi, str):
self.citation_result: CitationResult = self_citations_paper(self.doi)
self.self_cite: CitationResult = self_citations_paper(self.doi)

def get_result(self) -> Optional[PaperResult]:
"""
Provides the result of the analysis.

Returns: PaperResult if available.
"""
if not hasattr(self, "ref_result"):
if not hasattr(self, "self_ref"):
logger.warning(
f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
)
return
elif not hasattr(self, "citation_result"):
elif not hasattr(self, "self_cite"):
logger.warning(
f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
)
return
ref_result = self.ref_result.model_dump()
ref_result.pop("ssid", None)
return PaperResult(
title=self.title, **ref_result, **self.citation_result.model_dump()
title=self.title,
**{
k: v
for k, v in self.self_ref.model_dump().items()
if k not in ["ssid", "title"]
},
**{
k: v
for k, v in self.self_cite.model_dump().items()
if k not in ["title"]
},
)
109 changes: 86 additions & 23 deletions paperscraper/citations/entity/researcher.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import asyncio
import os
from typing import List, Literal, Optional
from typing import Any, List, Literal, Optional, Tuple

from semanticscholar import SemanticScholar
from tqdm import tqdm

from ..orcid import orcid_to_author_name
from ..self_references import ReferenceResult
from ..self_citations import CitationResult
from ..self_references import ReferenceResult, self_references_paper
from ..utils import author_name_to_ssaid, get_papers_for_author
from .core import Entity, EntityResult

Expand All @@ -14,7 +15,27 @@ class ResearcherResult(EntityResult):
name: str
ssid: int
orcid: Optional[str] = None
# TODO: the ratios will be averaged across all papers for that author

def _ordered_items(self) -> List[Tuple[str, Any]]:
    """Return ``(field name, value)`` pairs in a fixed display order.

    The order is: name, the two aggregated ratios, the two totals, the two
    per-paper mappings, and finally the identifiers (``ssid``, ``orcid``).
    """
    # Single source of truth for the ordering used by __repr__ and __str__.
    display_order = (
        "name",
        "self_reference_ratio",
        "self_citation_ratio",
        "num_references",
        "num_citations",
        "self_references",
        "self_citations",
        "ssid",
        "orcid",
    )
    return [(field, getattr(self, field)) for field in display_order]

def __repr__(self) -> str:
    """Render as ``ClassName(field=value, ...)`` using the fixed field order."""
    rendered = [f"{key}={value!r}" for key, value in self._ordered_items()]
    joined = ", ".join(rendered)
    return self.__class__.__name__ + "(" + joined + ")"

def __str__(self) -> str:
    """Render the ordered fields as space-separated ``field=value`` pairs."""
    parts = []
    for key, value in self._ordered_items():
        # Values are shown with repr() so strings keep their quotes.
        parts.append(f"{key}={value!r}")
    return " ".join(parts)


ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
Expand All @@ -32,7 +53,7 @@ def __init__(self, input: str, mode: ModeType = "infer"):
Construct researcher object for self citation/reference analysis.

Args:
input: A researcher to search for.
input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
mode: One of `name`, `orcid` (ORCID iD), `ssaid` (Semantic Scholar Author ID) or `infer`.
Defaults to "infer".

Expand All @@ -53,32 +74,74 @@ def __init__(self, input: str, mode: ModeType = "infer"):
):
mode = "orcid"
else:
mode = "author"

mode = "name"
if mode == "ssaid":
self.author = sch.get_author(input)
self.name = sch.get_author(input)._name
self.ssid = input
elif mode == "orcid":
self.author = orcid_to_author_name(input)
orcid_name = orcid_to_author_name(input)
self.orcid = input
self.ssid = author_name_to_ssaid(input)
elif mode == "author":
self.author = input
self.ssid = author_name_to_ssaid(input)

# TODO: Skip over erratum / corrigendum
self.ssids = get_papers_for_author(self.ssid)

def self_references(self):
self.ssid, self.name = author_name_to_ssaid(orcid_name)
elif mode == "name":
name = input
self.ssid, self.name = author_name_to_ssaid(input)

async def _self_references_async(
    self, verbose: bool = False
) -> List[ReferenceResult]:
    """Async version of self_references.

    Looks up the papers for ``self.ssid`` (stored on ``self.ssids``), runs
    ``self_references_paper`` on them, and returns only the usable results.

    Args:
        verbose: Forwarded unchanged to ``self_references_paper``.

    Returns:
        The reference results that report at least one reference and whose
        title does not contain "erratum" or "corrigendum" (case-insensitive).
    """
    self.ssids = await get_papers_for_author(self.ssid)
    paper_results: List[ReferenceResult] = await self_references_paper(
        self.ssids, verbose=verbose
    )

    kept: List[ReferenceResult] = []
    for paper in paper_results:
        # Papers without references are dropped before the title is inspected.
        if not paper.num_references > 0:
            continue
        lowered_title = paper.title.lower()
        # Errata and corrigenda are excluded from the analysis.
        if "erratum" in lowered_title or "corrigendum" in lowered_title:
            continue
        kept.append(paper)

    return kept

def self_references(self, verbose: bool = False) -> ResearcherResult:
"""
Sifts through all papers of a researcher and extracts the self references.
"""
# TODO: Asynchronous call to self_references
print("Going through SSIDs", self.ssids)

# TODO: Aggregate results
Args:
verbose: If True, logs detailed information for each paper.

def self_citations(self):
Returns:
A ResearcherResult containing aggregated self-reference data.
"""
reference_results = asyncio.run(self._self_references_async(verbose=verbose))

individual_self_references = {
getattr(result, "title"): getattr(result, "self_references").get(self.name, 0.0)
for result in reference_results
}
reference_ratio = sum(individual_self_references.values()) / max(1, len(
individual_self_references
))
return ResearcherResult(
name=self.name,
ssid=int(self.ssid),
orcid=self.orcid,
num_references=sum(r.num_references for r in reference_results),
num_citations=-1,
self_references=dict(
sorted(
individual_self_references.items(), key=lambda x: x[1], reverse=True
)
),
self_citations={},
self_reference_ratio=round(reference_ratio, 3),
self_citation_ratio=-1.0,
)

def self_citations(self) -> ResearcherResult:
"""
Sifts through all papers of a researcher and finds how often they are self-cited.
"""
Expand Down
2 changes: 2 additions & 0 deletions paperscraper/citations/self_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

class CitationResult(BaseModel):
ssid: str # semantic scholar paper id
title: str
num_citations: int
self_citations: Dict[str, float] = {}
citation_score: float
Expand Down Expand Up @@ -87,6 +88,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio

return CitationResult(
ssid=identifier,
title=paper.get("title", ""),
num_citations=total_cites,
self_citations=ratios,
citation_score=avg_score,
Expand Down
Loading