jannisborn · jannisborn · Nov 26, 2025 · Nov 22, 2025 · Nov 22, 2025 · Nov 22, 2025
diff --git a/.github/workflows/test_pypi.yml b/.github/workflows/test_pypi.yml
@@ -26,5 +26,6 @@ jobs:
           python -c "import paperscraper.server_dumps"
           python -c "import paperscraper.tests"
           python -c "import paperscraper.impact"
+          python -c "import paperscraper.citations"
 
 
diff --git a/.github/workflows/test_tip.yml b/.github/workflows/test_tip.yml
@@ -5,6 +5,8 @@ on: [push, release]
 jobs:
   test-source-install:
     runs-on: ubuntu-latest
+    env:
+      SS_API_KEY: ${{ secrets.SS_API_KEY }}
     strategy:
       max-parallel: 3
       matrix:
@@ -49,6 +51,8 @@ jobs:
 
   test-potential-wheel-install:
     runs-on: ubuntu-latest
+    env:
+      SS_API_KEY: ${{ secrets.SS_API_KEY }}
     steps:
     - name: Checkout code
       uses: actions/checkout@v2

diff --git a/paperscraper/async_utils.py b/paperscraper/async_utils.py
@@ -49,14 +49,20 @@ def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]:
 
 
 def retry_with_exponential_backoff(
-    *, max_retries: int = 5, base_delay: float = 1.0
+    *,
+    max_retries: int = 5,
+    base_delay: float = 1.0,
+    factor: float = 1.3,
+    constant_delay: float = 0.2,
 ) -> Callable[[F], F]:
     """
     Decorator factory that retries an `async def` on HTTP 429, with exponential backoff.
 
     Args:
         max_retries: how many times to retry before giving up.
-        base_delay: initial delay in seconds; next delays will be duplication of previous.
+        base_delay: initial delay in seconds; next delays will be multiplied by `factor`.
+        factor: multiplier for delay after each retry.
+        constant_delay: fixed delay before each attempt.
 
     Usage:
 
@@ -70,18 +76,39 @@ def decorator(func: F) -> F:
         @wraps(func)
         async def wrapper(*args, **kwargs) -> Any:
             delay = base_delay
-            for attempt in range(max_retries):
+            last_exception: BaseException | None = None
+            for attempt in range(1, max_retries + 1):
+                await asyncio.sleep(constant_delay)
                 try:
                     return await func(*args, **kwargs)
                 except httpx.HTTPStatusError as e:
-                    # only retry on 429
                     status = e.response.status_code if e.response is not None else None
-                    if status != 429 or attempt == max_retries - 1:
+                    if status != 429:
                         raise
-                # backoff
-                await asyncio.sleep(delay)
-                delay *= 2
-            # in theory we never reach here
+                    last_exception = e
+                    sleep_for = delay
+                    if e.response is not None:
+                        ra = e.response.headers.get("Retry-After")
+                        if ra is not None:
+                            try:
+                                sleep_for = float(ra)
+                            except ValueError:
+                                pass
+                    delay *= factor
+
+                except httpx.ReadError as e:
+                    last_exception = e
+                    sleep_for = delay
+                    delay *= factor
+
+                if attempt == max_retries:
+                    msg = (
+                        f"{func.__name__} failed after {attempt} attempts with "
+                        f"last delay {sleep_for:.2f}s"
+                    )
+                    raise RuntimeError(msg) from last_exception
+
+                await asyncio.sleep(sleep_for)
 
         return wrapper
 

diff --git a/paperscraper/citations/entity/core.py b/paperscraper/citations/entity/core.py
@@ -5,14 +5,15 @@
 
 
 class EntityResult(BaseModel):
-    num_citations: int
-    num_references: int
-    # keys are authors or papers and values are absolute self links
-    self_citations: Dict[str, int] = {}
-    self_references: Dict[str, int] = {}
     # aggregated results
     self_citation_ratio: float = 0
     self_reference_ratio: float = 0
+    # total number of author citations/references
+    num_citations: int
+    num_references: int
+    # keys are papers and values are percentage of self citations/references
+    self_citations: Dict[str, float] = {}
+    self_references: Dict[str, float] = {}
 
 
 class Entity:

diff --git a/paperscraper/citations/entity/paper.py b/paperscraper/citations/entity/paper.py
@@ -68,33 +68,41 @@ def self_references(self):
         Extracts the self references of a paper, for each author.
         """
         if isinstance(self.doi, str):
-            self.ref_result: ReferenceResult = self_references_paper(self.doi)
+            self.self_ref: ReferenceResult = self_references_paper(self.doi)
 
     def self_citations(self):
         """
         Extracts the self citations of a paper, for each author.
         """
         if isinstance(self.doi, str):
-            self.citation_result: CitationResult = self_citations_paper(self.doi)
+            self.self_cite: CitationResult = self_citations_paper(self.doi)
 
     def get_result(self) -> Optional[PaperResult]:
         """
         Provides the result of the analysis.
 
         Returns: PaperResult if available.
         """
-        if not hasattr(self, "ref_result"):
+        if not hasattr(self, "self_ref"):
             logger.warning(
                 f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first."
             )
             return
-        elif not hasattr(self, "citation_result"):
+        elif not hasattr(self, "self_cite"):
             logger.warning(
                 f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first."
             )
             return
-        ref_result = self.ref_result.model_dump()
-        ref_result.pop("ssid", None)
         return PaperResult(
-            title=self.title, **ref_result, **self.citation_result.model_dump()
+            title=self.title,
+            **{
+                k: v
+                for k, v in self.self_ref.model_dump().items()
+                if k not in ["ssid", "title"]
+            },
+            **{
+                k: v
+                for k, v in self.self_cite.model_dump().items()
+                if k not in ["title"]
+            },
         )
diff --git a/paperscraper/citations/entity/researcher.py b/paperscraper/citations/entity/researcher.py
@@ -1,11 +1,12 @@
+import asyncio
 import os
-from typing import List, Literal, Optional
+from typing import Any, List, Literal, Optional, Tuple
 
 from semanticscholar import SemanticScholar
-from tqdm import tqdm
 
 from ..orcid import orcid_to_author_name
-from ..self_references import ReferenceResult
+from ..self_citations import CitationResult
+from ..self_references import ReferenceResult, self_references_paper
 from ..utils import author_name_to_ssaid, get_papers_for_author
 from .core import Entity, EntityResult
 
@@ -14,7 +15,27 @@ class ResearcherResult(EntityResult):
     name: str
     ssid: int
     orcid: Optional[str] = None
-    # TODO: the ratios will be averaged across all papers for that author
+
+    def _ordered_items(self) -> List[Tuple[str, Any]]:
+        # enforce specific ordering
+        return [
+            ("name", self.name),
+            ("self_reference_ratio", self.self_reference_ratio),
+            ("self_citation_ratio", self.self_citation_ratio),
+            ("num_references", self.num_references),
+            ("num_citations", self.num_citations),
+            ("self_references", self.self_references),
+            ("self_citations", self.self_citations),
+            ("ssid", self.ssid),
+            ("orcid", self.orcid),
+        ]
+
+    def __repr__(self) -> str:
+        inner = ", ".join(f"{k}={v!r}" for k, v in self._ordered_items())
+        return f"{self.__class__.__name__}({inner})"
+
+    def __str__(self) -> str:
+        return " ".join(f"{k}={v!r}" for k, v in self._ordered_items())
 
 
 ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))]
@@ -32,7 +53,7 @@ def __init__(self, input: str, mode: ModeType = "infer"):
         Construct researcher object for self citation/reference analysis.
 
         Args:
-            input: A researcher to search for.
+            input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID.
             mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID).
                 Defaults to "infer".
 
@@ -53,32 +74,74 @@ def __init__(self, input: str, mode: ModeType = "infer"):
             ):
                 mode = "orcid"
             else:
-                mode = "author"
-
+                mode = "name"
         if mode == "ssaid":
-            self.author = sch.get_author(input)
+            self.name = sch.get_author(input).name
             self.ssid = input
         elif mode == "orcid":
-            self.author = orcid_to_author_name(input)
+            orcid_name = orcid_to_author_name(input)
             self.orcid = input
-            self.ssid = author_name_to_ssaid(input)
-        elif mode == "author":
-            self.author = input
-            self.ssid = author_name_to_ssaid(input)
-
-        # TODO: Skip over erratum / corrigendum
-        self.ssids = get_papers_for_author(self.ssid)
-
-    def self_references(self):
+            self.ssid, self.name = author_name_to_ssaid(orcid_name)
+        elif mode == "name":
+            self.ssid, self.name = author_name_to_ssaid(input)
+
+    async def _self_references_async(
+        self, verbose: bool = False
+    ) -> List[ReferenceResult]:
+        """Async version of self_references."""
+        self.ssids = await get_papers_for_author(self.ssid)
+
+        results: List[ReferenceResult] = await self_references_paper(
+            self.ssids, verbose=verbose
+        )
+        # Remove papers with zero references or that are erratum/corrigendum
+        results = [
+            r
+            for r in results
+            if r.num_references > 0
+            and "erratum" not in r.title.lower()
+            and "corrigendum" not in r.title.lower()
+        ]
+
+        return results
+
+    def self_references(self, verbose: bool = False) -> ResearcherResult:
         """
         Sifts through all papers of a researcher and extracts the self references.
-        """
-        # TODO: Asynchronous call to self_references
-        print("Going through SSIDs", self.ssids)
 
-        # TODO: Aggregate results
+        Args:
+            verbose: If True, logs detailed information for each paper.
 
-    def self_citations(self):
+        Returns:
+            A ResearcherResult containing aggregated self-reference data.
+        """
+        reference_results = asyncio.run(self._self_references_async(verbose=verbose))
+
+        individual_self_references = {
+            result.title: result.self_references.get(self.name, 0.0)
+            for result in reference_results
+        }
+        # max(1, ...) avoids a ZeroDivisionError when no paper survives the filtering
+        reference_ratio = sum(individual_self_references.values()) / max(
+            1, len(individual_self_references)
+        )
+        return ResearcherResult(
+            name=self.name,
+            ssid=int(self.ssid),
+            orcid=self.orcid,
+            num_references=sum(r.num_references for r in reference_results),
+            num_citations=-1,
+            self_references=dict(
+                sorted(
+                    individual_self_references.items(), key=lambda x: x[1], reverse=True
+                )
+            ),
+            self_citations={},
+            self_reference_ratio=round(reference_ratio, 3),
+            self_citation_ratio=-1.0,
+        )
+
+    def self_citations(self) -> ResearcherResult:
         """
         Sifts through all papers of a researcher and finds how often they are self-cited.
         """

diff --git a/paperscraper/citations/self_citations.py b/paperscraper/citations/self_citations.py
@@ -18,6 +18,7 @@
 
 class CitationResult(BaseModel):
     ssid: str  # semantic scholar paper id
+    title: str
     num_citations: int
     self_citations: Dict[str, float] = {}
     citation_score: float
@@ -87,6 +88,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio
 
     return CitationResult(
         ssid=identifier,
+        title=paper.get("title", ""),
         num_citations=total_cites,
         self_citations=ratios,
         citation_score=avg_score,