From 43d8378a941485b8c71c6af166af3ccad8e2d0d0 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 22 Nov 2025 13:32:17 +0100 Subject: [PATCH 1/8] feat: support per author analysis of self-references --- paperscraper/citations/entity/core.py | 11 +- paperscraper/citations/entity/researcher.py | 103 +++++++++++++---- paperscraper/citations/self_citations.py | 2 + paperscraper/citations/self_references.py | 121 +++++++++++++------- 4 files changed, 170 insertions(+), 67 deletions(-) diff --git a/paperscraper/citations/entity/core.py b/paperscraper/citations/entity/core.py index b25b035..04b14ca 100644 --- a/paperscraper/citations/entity/core.py +++ b/paperscraper/citations/entity/core.py @@ -5,14 +5,15 @@ class EntityResult(BaseModel): - num_citations: int - num_references: int - # keys are authors or papers and values are absolute self links - self_citations: Dict[str, int] = {} - self_references: Dict[str, int] = {} # aggregated results self_citation_ratio: float = 0 self_reference_ratio: float = 0 + # total number of author citations/references + num_citations: int + num_references: int + # keys are papers and values are percentage of self citations/references + self_citations: Dict[str, float] = {} + self_references: Dict[str, float] = {} class Entity: diff --git a/paperscraper/citations/entity/researcher.py b/paperscraper/citations/entity/researcher.py index 4a27312..533a60d 100644 --- a/paperscraper/citations/entity/researcher.py +++ b/paperscraper/citations/entity/researcher.py @@ -1,11 +1,12 @@ +import asyncio import os -from typing import List, Literal, Optional +from typing import Any, List, Literal, Optional, Tuple from semanticscholar import SemanticScholar -from tqdm import tqdm from ..orcid import orcid_to_author_name -from ..self_references import ReferenceResult +from ..self_citations import CitationResult +from ..self_references import ReferenceResult, self_references_paper from ..utils import author_name_to_ssaid, get_papers_for_author from .core import Entity, EntityResult @@ -14,7 +15,27 @@ class ResearcherResult(EntityResult): name: str ssid: int orcid: Optional[str] = None - # TODO: the ratios will be averaged across all papers for that author + + def _ordered_items(self) -> List[Tuple[str, Any]]: + # enforce specific ordering + return [ + ("name", self.name), + ("self_reference_ratio", self.self_reference_ratio), + ("self_citation_ratio", self.self_citation_ratio), + ("num_references", self.num_references), + ("num_citations", self.num_citations), + ("self_references", self.self_references), + ("self_citations", self.self_citations), + ("ssid", self.ssid), + ("orcid", self.orcid), + ] + + def __repr__(self) -> str: + inner = ", ".join(f"{k}={v!r}" for k, v in self._ordered_items()) + return f"{self.__class__.__name__}({inner})" + + def __str__(self) -> str: + return " ".join(f"{k}={v!r}" for k, v in self._ordered_items()) ModeType = Literal[tuple(MODES := ("name", "orcid", "ssaid", "infer"))] @@ -32,7 +53,7 @@ def __init__(self, input: str, mode: ModeType = "infer"): Construct researcher object for self citation/reference analysis. Args: - input: A researcher to search for. + input: A researcher to search for, identified by name, ORCID iD, or Semantic Scholar Author ID. mode: This can be a `name` `orcid` (ORCID iD) or `ssaid` (Semantic Scholar Author ID). Defaults to "infer". 
@@ -53,32 +74,74 @@ def __init__(self, input: str, mode: ModeType = "infer"): ): mode = "orcid" else: - mode = "author" - + mode = "name" if mode == "ssaid": - self.author = sch.get_author(input) + self.name = sch.get_author(input)._name self.ssid = input elif mode == "orcid": - self.author = orcid_to_author_name(input) + self.name = orcid_to_author_name(input) self.orcid = input self.ssid = author_name_to_ssaid(input) - elif mode == "author": - self.author = input + elif mode == "name": + self.name = input self.ssid = author_name_to_ssaid(input) - # TODO: Skip over erratum / corrigendum - self.ssids = get_papers_for_author(self.ssid) - - def self_references(self): + async def _self_references_async( + self, verbose: bool = False + ) -> List[ReferenceResult]: + """Async version of self_references.""" + self.ssids = await get_papers_for_author(self.ssid) + + results: List[ReferenceResult] = await self_references_paper( + self.ssids, verbose=verbose + ) + # Remove papers with zero references or that are erratum/corrigendum + results = [ + r + for r in results + if r.num_references > 0 + and "erratum" not in r.title.lower() + and "corrigendum" not in r.title.lower() + ] + + return results + + def self_references(self, verbose: bool = False) -> ResearcherResult: """ Sifts through all papers of a researcher and extracts the self references. - """ - # TODO: Asynchronous call to self_references - print("Going through SSIDs", self.ssids) - # TODO: Aggregate results + Args: + verbose: If True, logs detailed information for each paper. - def self_citations(self): + Returns: + A ResearcherResult containing aggregated self-reference data. + """ + reference_results = asyncio.run(self._self_references_async(verbose=verbose)) + + individual_self_references = { + getattr(result, "title"): getattr(result, "self_references")[self.name] + for result in reference_results + } + reference_ratio = sum(individual_self_references.values()) / len( + individual_self_references + ) + return ResearcherResult( + name=self.name, + ssid=int(self.ssid), + orcid=self.orcid, + num_references=sum(r.num_references for r in reference_results), + num_citations=-1, + self_references=dict( + sorted( + individual_self_references.items(), key=lambda x: x[1], reverse=True + ) + ), + self_citations={}, + self_reference_ratio=round(reference_ratio, 3), + self_citation_ratio=-1.0, + ) + + def self_citations(self) -> ResearcherResult: """ Sifts through all papers of a researcher and finds how often they are self-cited. 
""" diff --git a/paperscraper/citations/self_citations.py b/paperscraper/citations/self_citations.py index 57cef83..9d0b704 100644 --- a/paperscraper/citations/self_citations.py +++ b/paperscraper/citations/self_citations.py @@ -18,6 +18,7 @@ class CitationResult(BaseModel): ssid: str # semantic scholar paper id + title: str num_citations: int self_citations: Dict[str, float] = {} citation_score: float @@ -87,6 +88,7 @@ async def _process_single(client: httpx.AsyncClient, identifier: str) -> Citatio return CitationResult( ssid=identifier, + title=paper.get("title", ""), num_citations=total_cites, self_citations=ratios, citation_score=avg_score, diff --git a/paperscraper/citations/self_references.py b/paperscraper/citations/self_references.py index 5014377..014216d 100644 --- a/paperscraper/citations/self_references.py +++ b/paperscraper/citations/self_references.py @@ -1,5 +1,6 @@ import asyncio import logging +import os import re import sys from typing import Any, Dict, List, Literal, Union @@ -7,6 +8,7 @@ import httpx import numpy as np from pydantic import BaseModel +from tqdm import tqdm from ..async_utils import optional_async, retry_with_exponential_backoff from .utils import DOI_PATTERN, find_matching @@ -16,15 +18,20 @@ logging.getLogger("httpx").setLevel(logging.WARNING) ModeType = Literal[tuple(MODES := ("doi", "infer", "ssid"))] +CONCURRENCY_LIMIT = 10 +_SEM = asyncio.Semaphore(CONCURRENCY_LIMIT) + class ReferenceResult(BaseModel): ssid: str # semantic scholar paper id + title: str num_references: int self_references: Dict[str, float] = {} reference_score: float -async def _fetch_reference_data( +@retry_with_exponential_backoff(max_retries=14, base_delay=1.0) +async def _fetch_paper_with_references( client: httpx.AsyncClient, suffix: str ) -> Dict[str, Any]: """ @@ -40,11 +47,24 @@ async def _fetch_reference_data( response = await client.get( f"https://api.semanticscholar.org/graph/v1/paper/{suffix}", params={"fields": "title,authors,references.authors"}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, ) response.raise_for_status() return response.json() +@retry_with_exponential_backoff(max_retries=10, base_delay=1.0, factor=1.4) +async def fetch_references(client: httpx.AsyncClient, paper_id: str) -> list[dict]: + resp = await client.get( + f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references", + params={"fields": "citedPaper.paperId,citedPaper.title,citedPaper.authors"}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, + ) + resp.raise_for_status() + data = resp.json() + return data.get("data", []) or [] + + async def _process_single_reference( client: httpx.AsyncClient, identifier: str ) -> ReferenceResult: @@ -58,47 +78,54 @@ async def _process_single_reference( Returns: A ReferenceResult containing counts and percentages of self-references. 
""" - # Determine prefix for API - if len(identifier) > 15 and identifier.isalnum() and identifier.islower(): - prefix = "" - elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1: - prefix = "DOI:" - else: - prefix = "" - - suffix = f"{prefix}{identifier}" - paper = await _fetch_reference_data(client, suffix) - - # Initialize counters - author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])} - references = paper.get("references", []) - total_refs = len(references) - - # Tally self-references - for ref in references: - matched = find_matching(paper.get("authors", []), ref.get("authors", [])) - for name in matched: - author_counts[name] += 1 - - # Compute percentages per author - ratios: Dict[str, float] = { - name: round((count / total_refs * 100), 2) if total_refs > 0 else 0.0 - for name, count in author_counts.items() - } - - # Compute average score - avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3) - - return ReferenceResult( - ssid=identifier, - num_references=total_refs, - self_references=ratios, - reference_score=avg_score, - ) + async with _SEM: + # Determine prefix for API + if len(identifier) > 15 and identifier.isalnum() and identifier.islower(): + prefix = "" + elif len(re.findall(DOI_PATTERN, identifier, re.IGNORECASE)) == 1: + prefix = "DOI:" + else: + prefix = "" + + suffix = f"{prefix}{identifier}" + paper = await _fetch_paper_with_references(client, suffix) + + # Initialize counters + author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])} + references = paper.get("references", []) or [] + + # # Sometimes references are not included in the initial fetch + # if len(references) == 0: + # await asyncio.sleep(0.2) + # references = await fetch_references(client, paper.get("paperId")) + + total_refs = len(references) + + # Tally self-references + for ref in references: + matched = find_matching(paper.get("authors", []), ref.get("authors", [])) + for name in matched: + author_counts[name] += 1 + + # Compute percentages per author + ratios: Dict[str, float] = { + name: round((count / total_refs * 100), 2) if total_refs > 0 else 0.0 + for name, count in author_counts.items() + } + + # Compute average score + avg_score = round(float(np.mean(list(ratios.values()))) if ratios else 0.0, 3) + + return ReferenceResult( + ssid=identifier, + title=paper.get("title", ""), + num_references=total_refs, + self_references=ratios, + reference_score=avg_score, + ) @optional_async -@retry_with_exponential_backoff(max_retries=4, base_delay=1.0) async def self_references_paper( inputs: Union[str, List[str]], verbose: bool = False ) -> Union[ReferenceResult, List[ReferenceResult]]: @@ -120,15 +147,25 @@ async def self_references_paper( async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client: tasks = [_process_single_reference(client, ident) for ident in identifiers] - results = await asyncio.gather(*tasks) + results: List[ReferenceResult] = [] + + iterator = asyncio.as_completed(tasks) + if verbose: + iterator = tqdm( + iterator, total=len(tasks), desc="Collecting self-references" + ) + + for coro in iterator: + res = await coro + results.append(res) if verbose: for res in results: logger.info( - f'Self-references in "{res.ssid}": N={res.num_references}, ' + f'Self-references in "{res.title}": N={res.num_references}, ' f"Score={res.reference_score}%" ) for author, pct in res.self_references.items(): - logger.info(f" {author}: {pct}% self-reference") + logger.info(f" {author}: {pct}% self-references") 
return results[0] if single_input else results From 2678c564c8d310a62050636cf356a8918ab16cd4 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 22 Nov 2025 13:32:59 +0100 Subject: [PATCH 2/8] feat: include API key, robustify retries and name overlap check --- paperscraper/async_utils.py | 45 +++++++++++++++++----- paperscraper/citations/utils.py | 66 +++++++++++++++++++++++++++------ 2 files changed, 91 insertions(+), 20 deletions(-) diff --git a/paperscraper/async_utils.py b/paperscraper/async_utils.py index a4bc596..816ce27 100644 --- a/paperscraper/async_utils.py +++ b/paperscraper/async_utils.py @@ -49,14 +49,20 @@ def wrapper(*args, **kwargs) -> Union[T, Awaitable[T]]: def retry_with_exponential_backoff( - *, max_retries: int = 5, base_delay: float = 1.0 + *, + max_retries: int = 5, + base_delay: float = 1.0, + factor: float = 1.3, + constant_delay: float = 0.2, ) -> Callable[[F], F]: """ Decorator factory that retries an `async def` on HTTP 429, with exponential backoff. Args: max_retries: how many times to retry before giving up. - base_delay: initial delay in seconds; next delays will be duplication of previous. + base_delay: initial delay in seconds; next delays will be multiplied by `factor`. + factor: multiplier for delay after each retry. + constant_delay: fixed delay before each attempt. Usage: @@ -70,18 +76,39 @@ def decorator(func: F) -> F: @wraps(func) async def wrapper(*args, **kwargs) -> Any: delay = base_delay - for attempt in range(max_retries): + last_exception: BaseException | None = None + for attempt in range(1, max_retries + 1): + await asyncio.sleep(constant_delay) try: return await func(*args, **kwargs) except httpx.HTTPStatusError as e: - # only retry on 429 status = e.response.status_code if e.response is not None else None - if status != 429 or attempt == max_retries - 1: + if status != 429: raise - # backoff - await asyncio.sleep(delay) - delay *= 2 - # in theory we never reach here + last_exception = e + sleep_for = delay + if e.response is not None: + ra = e.response.headers.get("Retry-After") + if ra is not None: + try: + sleep_for = float(ra) + except ValueError: + pass + delay *= factor + + except httpx.ReadError as e: + last_exception = e + sleep_for = delay + delay *= factor + + if attempt == max_retries: + msg = ( + f"{func.__name__} failed after {attempt} attempts with " + f"last delay {sleep_for:.2f}s" + ) + raise RuntimeError(msg) from last_exception + + await asyncio.sleep(sleep_for) return wrapper diff --git a/paperscraper/citations/utils.py b/paperscraper/citations/utils.py index 9710206..4380619 100644 --- a/paperscraper/citations/utils.py +++ b/paperscraper/citations/utils.py @@ -1,4 +1,5 @@ import logging +import os import re import sys from time import sleep @@ -9,6 +10,8 @@ from tqdm import tqdm from unidecode import unidecode +from ..async_utils import retry_with_exponential_backoff + logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger = logging.getLogger(__name__) logging.getLogger("httpx").setLevel(logging.WARNING) @@ -31,6 +34,7 @@ def get_doi_from_title(title: str) -> Optional[str]: response = requests.get( PAPER_URL + "search", params={"query": title, "fields": "externalIds", "limit": 1}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, ) data = response.json() @@ -61,7 +65,9 @@ def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: ): # Make the GET request to Semantic Scholar. 
response = requests.get( - f"{PAPER_URL}{ssid}", params={"fields": "externalIds", "limit": 1} + f"{PAPER_URL}{ssid}", + params={"fields": "externalIds", "limit": 1}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, ) # If successful, try to extract and return the DOI. @@ -88,7 +94,10 @@ def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: """ # Send the GET request to Semantic Scholar - response = requests.get(f"{PAPER_URL}DOI:{doi}") + response = requests.get( + f"{PAPER_URL}DOI:{doi}", + headers={"x-api-key": os.getenv("SS_API_KEY")}, + ) if response.status_code == 200: data = response.json() return {"title": data.get("title"), "ssid": data.get("paperId")} @@ -109,7 +118,9 @@ def author_name_to_ssaid(author_name: str) -> str: """ response = requests.get( - AUTHOR_URL, params={"query": author_name, "fields": "name", "limit": 1} + AUTHOR_URL, + params={"query": author_name, "fields": "name", "limit": 1}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, ) if response.status_code == 200: data = response.json() @@ -145,6 +156,7 @@ def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]: return mode +@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) async def get_papers_for_author(ss_author_id: str) -> List[str]: """ Given a Semantic Scholar author ID, returns a list of all Semantic Scholar paper IDs for that author. @@ -210,22 +222,54 @@ def find_matching( return list(overlap_ids | overlap_names) -def check_overlap(n1: str, n2: str) -> bool: +def check_overlap( + n1: str, + n2: str, + recursive: bool = True, +) -> bool: """ Check whether two author names are identical. - TODO: This can be made more robust + + Heuristics: + - Case insensitive + - If name sets are identical, a match is assumed (e.g. "John Walter" vs "Walter John"). + - Assume the last token is the surname and require: + * same surname + * both have at least one given name + * first given names are compatible (same, or initial vs full) Args: - n1: first name - n2: second name + n1: first name (e.g., "John A. Smith") + n2: second name (e.g., "J. Smith") Returns: bool: Whether names are identical. 
""" - # remove initials and check for name intersection - s1 = {w for w in clean_name(n1).split()} - s2 = {w for w in clean_name(n2).split()} - return len(s2) > 0 and len(s1 | s2) == len(s1) + t1 = [w for w in clean_name(n1).split() if w] + t2 = [w for w in clean_name(n2).split() if w] + + if not t1 or not t2: + return False # One name is empty after cleaning + + if set(t1) == set(t2): + return True # Name sets are identical + + # Assume last token is surname + surname1, given1 = t1[-1], t1[:-1] + surname2, given2 = t2[-1], t2[:-1] + + if surname1 != surname2: + return False # Surnames do not match + + if not given1 or not given2: + return False # One name has no given names + + # Compare only the *first* given name; middle names are optional + return ( + given1[0] == given2[0] + or (len(given1[0]) == 1 and given2[0].startswith(given1[0])) + or (len(given2[0]) == 1 and given1[0].startswith(given2[0])) + ) def clean_name(s: str) -> str: From da84a863d6133c59dce467e09835e75d2cbe5083 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 22 Nov 2025 13:33:16 +0100 Subject: [PATCH 3/8] test: expand test for new cases --- .../citations/tests/test_citations.py | 11 +++++++++ .../citations/tests/test_self_references.py | 23 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/paperscraper/citations/tests/test_citations.py b/paperscraper/citations/tests/test_citations.py index 4a6902c..a21389f 100644 --- a/paperscraper/citations/tests/test_citations.py +++ b/paperscraper/citations/tests/test_citations.py @@ -1,6 +1,7 @@ import logging from paperscraper.citations import get_citations_by_doi +from paperscraper.citations.utils import check_overlap logging.disable(logging.INFO) @@ -13,3 +14,13 @@ def test_citations(self): # Try invalid DOI num = get_citations_by_doi("10.1035348/s42256-023-00639-z") assert isinstance(num, int) and num == 0 + + def test_name_overlap(self): + assert check_overlap("John Smith", "J. Smith") + assert check_overlap("J. Smith", "John Smith") + assert check_overlap("John A. Smith", "J. Smith") + assert check_overlap("John Smith", "John A. Smith") + assert check_overlap("J A. Smith", "J. Smith") + assert not check_overlap("Alice B. Cooper", "Bob A. Cooper") + assert not check_overlap("Alice Cooper", "Bob A. Cooper") + assert check_overlap("John Walter", "Walter John") diff --git a/paperscraper/citations/tests/test_self_references.py b/paperscraper/citations/tests/test_self_references.py index ff07c15..96dc12d 100644 --- a/paperscraper/citations/tests/test_self_references.py +++ b/paperscraper/citations/tests/test_self_references.py @@ -5,6 +5,7 @@ import pytest from paperscraper.citations import self_references_paper +from paperscraper.citations.entity import Researcher from paperscraper.citations.self_references import ReferenceResult logging.disable(logging.INFO) @@ -75,3 +76,25 @@ def test_compare_async_and_sync_performance(self, dois): f"Async execution ({async_duration:.2f}s) is slower than sync execution " f"({sync_duration:.2f}s)" ) + + def test_researcher(self): + """ + Tests calculation of self-references for all papers of an author. 
+ """ + ssaid = "2326988211" + researcher = Researcher(ssaid) + result = researcher.self_references(verbose=True) + assert result.ssid == int(ssaid) + assert isinstance(result.name, str) + assert result.name == "Patrick Soga" + assert isinstance(result.num_references, int) + assert result.num_references > 0 + assert isinstance(result.num_citations, int) + assert result.num_citations == -1 + assert isinstance(result.self_references, Dict) + for title, ratio in result.self_references.items(): + assert isinstance(title, str) + assert isinstance(ratio, float) + assert ratio >= 0 and ratio <= 100 + + assert result.self_reference_ratio >= 0 and result.self_reference_ratio <= 100 From 4e5e6e4d85a1e777d61c6c77af871b74d461eac2 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 22 Nov 2025 14:18:31 +0100 Subject: [PATCH 4/8] fix: avoid None API key --- paperscraper/citations/self_references.py | 10 +++++++-- paperscraper/citations/utils.py | 25 ++++++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/paperscraper/citations/self_references.py b/paperscraper/citations/self_references.py index 014216d..d44e78d 100644 --- a/paperscraper/citations/self_references.py +++ b/paperscraper/citations/self_references.py @@ -18,6 +18,12 @@ logging.getLogger("httpx").setLevel(logging.WARNING) ModeType = Literal[tuple(MODES := ("doi", "infer", "ssid"))] + +SS_API_KEY = os.getenv("SS_API_KEY") +HEADERS: Dict[str, str] = {} +if SS_API_KEY: + HEADERS["x-api-key"] = SS_API_KEY + CONCURRENCY_LIMIT = 10 _SEM = asyncio.Semaphore(CONCURRENCY_LIMIT) @@ -47,7 +53,7 @@ async def _fetch_paper_with_references( response = await client.get( f"https://api.semanticscholar.org/graph/v1/paper/{suffix}", params={"fields": "title,authors,references.authors"}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, + headers=HEADERS, ) response.raise_for_status() return response.json() @@ -58,7 +64,7 @@ async def fetch_references(client: httpx.AsyncClient, paper_id: str) -> list[dic resp = await client.get( f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references", params={"fields": "citedPaper.paperId,citedPaper.title,citedPaper.authors"}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, + headers=HEADERS, ) resp.raise_for_status() data = resp.json() diff --git a/paperscraper/citations/utils.py b/paperscraper/citations/utils.py index 4380619..e755e9c 100644 --- a/paperscraper/citations/utils.py +++ b/paperscraper/citations/utils.py @@ -21,6 +21,12 @@ AUTHOR_URL: str = "https://api.semanticscholar.org/graph/v1/author/search" +SS_API_KEY = os.getenv("SS_API_KEY") +HEADERS: Dict[str, str] = {} +if SS_API_KEY: + HEADERS["x-api-key"] = SS_API_KEY + + def get_doi_from_title(title: str) -> Optional[str]: """ Searches the DOI of a paper based on the paper title @@ -34,7 +40,7 @@ def get_doi_from_title(title: str) -> Optional[str]: response = requests.get( PAPER_URL + "search", params={"query": title, "fields": "externalIds", "limit": 1}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, + headers=HEADERS, ) data = response.json() @@ -46,6 +52,7 @@ def get_doi_from_title(title: str) -> Optional[str]: logger.warning(f"Did not find DOI for title={title}") +@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: """ Given a Semantic Scholar paper ID, returns the corresponding DOI if available. 
@@ -67,7 +74,7 @@ def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: response = requests.get( f"{PAPER_URL}{ssid}", params={"fields": "externalIds", "limit": 1}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, + headers=HEADERS, ) # If successful, try to extract and return the DOI. @@ -76,12 +83,12 @@ def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: doi = data.get("externalIds", {}).get("DOI") return doi attempts += 1 - sleep(10) logger.warning( f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}" ) +@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: """ Given a DOI, retrieves the paper's title and semantic scholar paper ID. @@ -94,10 +101,7 @@ def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: """ # Send the GET request to Semantic Scholar - response = requests.get( - f"{PAPER_URL}DOI:{doi}", - headers={"x-api-key": os.getenv("SS_API_KEY")}, - ) + response = requests.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS) if response.status_code == 200: data = response.json() return {"title": data.get("title"), "ssid": data.get("paperId")} @@ -106,6 +110,7 @@ def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: ) +@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) def author_name_to_ssaid(author_name: str) -> str: """ Given an author name, returns the Semantic Scholar author ID. @@ -222,11 +227,7 @@ def find_matching( return list(overlap_ids | overlap_names) -def check_overlap( - n1: str, - n2: str, - recursive: bool = True, -) -> bool: +def check_overlap(n1: str, n2: str) -> bool: """ Check whether two author names are identical. From 40c4b9e1b21f0410c5fdab057020163c30e058c8 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Sat, 22 Nov 2025 15:15:27 +0100 Subject: [PATCH 5/8] fix: async --- paperscraper/citations/entity/paper.py | 22 +++-- paperscraper/citations/utils.py | 106 ++++++++++++------------- 2 files changed, 68 insertions(+), 60 deletions(-) diff --git a/paperscraper/citations/entity/paper.py b/paperscraper/citations/entity/paper.py index 8530507..ad51a11 100644 --- a/paperscraper/citations/entity/paper.py +++ b/paperscraper/citations/entity/paper.py @@ -68,14 +68,14 @@ def self_references(self): Extracts the self references of a paper, for each author. """ if isinstance(self.doi, str): - self.ref_result: ReferenceResult = self_references_paper(self.doi) + self.self_ref: ReferenceResult = self_references_paper(self.doi) def self_citations(self): """ Extracts the self citations of a paper, for each author. """ if isinstance(self.doi, str): - self.citation_result: CitationResult = self_citations_paper(self.doi) + self.self_cite: CitationResult = self_citations_paper(self.doi) def get_result(self) -> Optional[PaperResult]: """ @@ -83,18 +83,26 @@ def get_result(self) -> Optional[PaperResult]: Returns: PaperResult if available. """ - if not hasattr(self, "ref_result"): + if not hasattr(self, "self_ref"): logger.warning( f"Can't get result since no referencing result for {self.input} exists. Run `.self_references` first." ) return - elif not hasattr(self, "citation_result"): + elif not hasattr(self, "self_cite"): logger.warning( f"Can't get result since no citation result for {self.input} exists. Run `.self_citations` first." 
) return - ref_result = self.ref_result.model_dump() - ref_result.pop("ssid", None) return PaperResult( - title=self.title, **ref_result, **self.citation_result.model_dump() + title=self.title, + **{ + k: v + for k, v in self.self_ref.model_dump().items() + if k not in ["ssid", "title"] + }, + **{ + k: v + for k, v in self.self_cite.model_dump().items() + if k not in ["title"] + }, ) diff --git a/paperscraper/citations/utils.py b/paperscraper/citations/utils.py index e755e9c..f22a5d7 100644 --- a/paperscraper/citations/utils.py +++ b/paperscraper/citations/utils.py @@ -2,7 +2,6 @@ import os import re import sys -from time import sleep from typing import Any, Dict, List, Literal, Optional import httpx @@ -10,7 +9,7 @@ from tqdm import tqdm from unidecode import unidecode -from ..async_utils import retry_with_exponential_backoff +from ..async_utils import optional_async, retry_with_exponential_backoff logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger = logging.getLogger(__name__) @@ -52,8 +51,8 @@ def get_doi_from_title(title: str) -> Optional[str]: logger.warning(f"Did not find DOI for title={title}") -@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) -def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: +@optional_async +async def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: """ Given a Semantic Scholar paper ID, returns the corresponding DOI if available. @@ -63,33 +62,34 @@ def get_doi_from_ssid(ssid: str, max_retries: int = 10) -> Optional[str]: Returns: str or None: The DOI of the paper, or None if not found or in case of an error. """ - logger.warning( - "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput." - ) - attempts = 0 - for attempt in tqdm( - range(1, max_retries + 1), desc=f"Fetching DOI for {ssid}", unit="attempt" - ): - # Make the GET request to Semantic Scholar. - response = requests.get( - f"{PAPER_URL}{ssid}", - params={"fields": "externalIds", "limit": 1}, - headers=HEADERS, + async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client: + logger.warning( + "Semantic Scholar API is easily overloaded when passing SS IDs, provide DOIs to improve throughput." ) + attempts = 0 + for attempt in tqdm( + range(1, max_retries + 1), desc=f"Fetching DOI for {ssid}", unit="attempt" + ): + # Make the GET request to Semantic Scholar. + response = await client.get( + f"{PAPER_URL}{ssid}", + params={"fields": "externalIds", "limit": 1}, + headers=HEADERS, + ) - # If successful, try to extract and return the DOI. - if response.status_code == 200: - data = response.json() - doi = data.get("externalIds", {}).get("DOI") - return doi - attempts += 1 - logger.warning( - f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}" - ) + # If successful, try to extract and return the DOI. + if response.status_code == 200: + data = response.json() + doi = data.get("externalIds", {}).get("DOI") + return doi + attempts += 1 + logger.warning( + f"Did not find DOI for paper ID {ssid}. Code={response.status_code}, text={response.text}" + ) -@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) -def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: +@optional_async +async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None: """ Given a DOI, retrieves the paper's title and semantic scholar paper ID. 
@@ -99,19 +99,19 @@ def get_title_and_id_from_doi(doi: str) -> Dict[str, Any]: Returns: dict or None: A dictionary with keys 'title' and 'ssid'. """ - - # Send the GET request to Semantic Scholar - response = requests.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS) - if response.status_code == 200: - data = response.json() - return {"title": data.get("title"), "ssid": data.get("paperId")} - logger.warning( - f"Could not get authors & semantic scholar ID for DOI={doi}, {response.status_code}: {response.text}" - ) + async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client: + # Send the GET request to Semantic Scholar + response = await client.get(f"{PAPER_URL}DOI:{doi}", headers=HEADERS) + if response.status_code == 200: + data = response.json() + return {"title": data.get("title"), "ssid": data.get("paperId")} + logger.warning( + f"Could not get authors & semantic scholar ID for DOI={doi}, {response.status_code}: {response.text}" + ) -@retry_with_exponential_backoff(max_retries=10, base_delay=1.0) -def author_name_to_ssaid(author_name: str) -> str: +@optional_async +async def author_name_to_ssaid(author_name: str) -> str: """ Given an author name, returns the Semantic Scholar author ID. @@ -121,22 +121,22 @@ def author_name_to_ssaid(author_name: str) -> str: Returns: str or None: The Semantic Scholar author ID or None if no author is found. """ + async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client: + response = await client.get( + AUTHOR_URL, + params={"query": author_name, "fields": "name", "limit": 1}, + headers={"x-api-key": os.getenv("SS_API_KEY")}, + ) + if response.status_code == 200: + data = response.json() + authors = data.get("data", []) + if authors: + # Return the Semantic Scholar author ID from the first result. + return authors[0].get("authorId") - response = requests.get( - AUTHOR_URL, - params={"query": author_name, "fields": "name", "limit": 1}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, - ) - if response.status_code == 200: - data = response.json() - authors = data.get("data", []) - if authors: - # Return the Semantic Scholar author ID from the first result. 
- return authors[0].get("authorId") - - logger.error( - f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}" - ) + logger.error( + f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}" + ) def determine_paper_input_type(input: str) -> Literal["ssid", "doi", "title"]: From ba26945de8dc9c3230eb4642b64076f20c264e97 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Tue, 25 Nov 2025 17:12:16 +0100 Subject: [PATCH 6/8] ci: use SS API key --- .github/workflows/test_pypi.yml | 1 + .github/workflows/test_tip.yml | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/.github/workflows/test_pypi.yml b/.github/workflows/test_pypi.yml index 546df40..938488e 100644 --- a/.github/workflows/test_pypi.yml +++ b/.github/workflows/test_pypi.yml @@ -26,5 +26,6 @@ jobs: python -c "import paperscraper.server_dumps" python -c "import paperscraper.tests" python -c "import paperscraper.impact" + python -c "import paperscraper.citations" diff --git a/.github/workflows/test_tip.yml b/.github/workflows/test_tip.yml index 599f068..7dd2dd6 100644 --- a/.github/workflows/test_tip.yml +++ b/.github/workflows/test_tip.yml @@ -5,6 +5,8 @@ on: [push, release] jobs: test-source-install: runs-on: ubuntu-latest + env: + SS_API_KEY: ${{ secrets.SS_API_KEY }} strategy: max-parallel: 3 matrix: @@ -49,6 +51,8 @@ jobs: test-potential-wheel-install: runs-on: ubuntu-latest + env: + SS_API_KEY: ${{ secrets.SS_API_KEY }} steps: - name: Checkout code uses: actions/checkout@v2 From 3de423b24f47167602a418b88663b0d5321fa964 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Tue, 25 Nov 2025 17:58:32 +0100 Subject: [PATCH 7/8] chore: upper bound impact factor --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a07e1f5..944f024 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ seaborn>=0.11.0 matplotlib>=3.3.2 matplotlib-venn>=0.11.5 bs4>=0.0.1 -impact-factor>=1.1.1 +impact-factor>=1.1.1,<1.1.3 thefuzz>=0.20.0 pytest tldextract diff --git a/setup.py b/setup.py index 8c0e29b..5fd1152 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ "matplotlib", "matplotlib_venn", "bs4", - "impact-factor>=1.1.1", + "impact-factor>=1.1.1,<1.1.3", "thefuzz", "pytest", "tldextract", From c85372754c8f39419ef50d34bf23cfd26fd3ef99 Mon Sep 17 00:00:00 2001 From: Jannis Born Date: Tue, 25 Nov 2025 17:59:36 +0100 Subject: [PATCH 8/8] fixes --- paperscraper/citations/entity/researcher.py | 14 +++++------ paperscraper/citations/self_references.py | 17 ------------- .../citations/tests/test_citations.py | 8 ++++++- .../citations/tests/test_self_references.py | 24 +++++++++++++++++++ paperscraper/citations/utils.py | 11 +++++---- 5 files changed, 44 insertions(+), 30 deletions(-) diff --git a/paperscraper/citations/entity/researcher.py b/paperscraper/citations/entity/researcher.py index 533a60d..e6a256f 100644 --- a/paperscraper/citations/entity/researcher.py +++ b/paperscraper/citations/entity/researcher.py @@ -79,12 +79,12 @@ def __init__(self, input: str, mode: ModeType = "infer"): self.name = sch.get_author(input)._name self.ssid = input elif mode == "orcid": - self.name = orcid_to_author_name(input) + orcid_name = orcid_to_author_name(input) self.orcid = input - self.ssid = author_name_to_ssaid(input) + self.ssid, self.name = author_name_to_ssaid(orcid_name) elif mode == "name": - self.name = input - self.ssid = author_name_to_ssaid(input) + name = input + self.ssid, self.name = 
author_name_to_ssaid(name)
 
     async def _self_references_async(
         self, verbose: bool = False
@@ -119,12 +119,12 @@ def self_references(self, verbose: bool = False) -> ResearcherResult:
         reference_results = asyncio.run(self._self_references_async(verbose=verbose))
 
         individual_self_references = {
-            getattr(result, "title"): getattr(result, "self_references")[self.name]
+            getattr(result, "title"): getattr(result, "self_references").get(self.name, 0.0)
             for result in reference_results
         }
-        reference_ratio = sum(individual_self_references.values()) / len(
+        reference_ratio = sum(individual_self_references.values()) / max(1, len(
             individual_self_references
-        )
+        ))
         return ResearcherResult(
             name=self.name,
             ssid=int(self.ssid),
diff --git a/paperscraper/citations/self_references.py b/paperscraper/citations/self_references.py
index d44e78d..3ddffd0 100644
--- a/paperscraper/citations/self_references.py
+++ b/paperscraper/citations/self_references.py
@@ -59,18 +59,6 @@ async def _fetch_paper_with_references(
     return response.json()
 
 
-@retry_with_exponential_backoff(max_retries=10, base_delay=1.0, factor=1.4)
-async def fetch_references(client: httpx.AsyncClient, paper_id: str) -> list[dict]:
-    resp = await client.get(
-        f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references",
-        params={"fields": "citedPaper.paperId,citedPaper.title,citedPaper.authors"},
-        headers=HEADERS,
-    )
-    resp.raise_for_status()
-    data = resp.json()
-    return data.get("data", []) or []
-
-
 async def _process_single_reference(
     client: httpx.AsyncClient, identifier: str
 ) -> ReferenceResult:
@@ -100,11 +88,6 @@ async def _process_single_reference(
         author_counts: Dict[str, int] = {a["name"]: 0 for a in paper.get("authors", [])}
         references = paper.get("references", []) or []
 
-        # # Sometimes references are not included in the initial fetch
-        # if len(references) == 0:
-        #     await asyncio.sleep(0.2)
-        #     references = await fetch_references(client, paper.get("paperId"))
-
         total_refs = len(references)
 
         # Tally self-references
diff --git a/paperscraper/citations/tests/test_citations.py b/paperscraper/citations/tests/test_citations.py
index a21389f..93bd27f 100644
--- a/paperscraper/citations/tests/test_citations.py
+++ b/paperscraper/citations/tests/test_citations.py
@@ -1,7 +1,7 @@
 import logging
 
 from paperscraper.citations import get_citations_by_doi
-from paperscraper.citations.utils import check_overlap
+from paperscraper.citations.utils import author_name_to_ssaid, check_overlap
 
 logging.disable(logging.INFO)
 
@@ -15,6 +15,12 @@ def test_citations(self):
         num = get_citations_by_doi("10.1035348/s42256-023-00639-z")
         assert isinstance(num, int) and num == 0
 
+    def test_author_name_to_ssaid(self):
+
+        ssaid, name = author_name_to_ssaid("Fabian H Sinz")
+        assert ssaid == "50095217"
+        assert name == "Fabian H Sinz"
+
     def test_name_overlap(self):
         assert check_overlap("John Smith", "J. Smith")
         assert check_overlap("J. Smith", "John Smith")
diff --git a/paperscraper/citations/tests/test_self_references.py b/paperscraper/citations/tests/test_self_references.py
index 96dc12d..95763dd 100644
--- a/paperscraper/citations/tests/test_self_references.py
+++ b/paperscraper/citations/tests/test_self_references.py
@@ -98,3 +98,27 @@ def test_researcher(self):
         assert ratio >= 0 and ratio <= 100
 
         assert result.self_reference_ratio >= 0 and result.self_reference_ratio <= 100
+        print(result)
+
+    def test_researcher_from_orcid(self):
+        """
+        Tests calculation of self-references for all papers of an author resolved from an ORCID iD.
+ """ + orcid = "0000-0003-4221-6988" + researcher = Researcher(orcid) + result = researcher.self_references(verbose=True) + assert result.orcid == orcid + assert isinstance(result.name, str) + assert result.name == "Juan M. Galeazzi" + assert isinstance(result.num_references, int) + assert result.num_references > 0 + assert isinstance(result.num_citations, int) + assert result.num_citations == -1 + assert isinstance(result.self_references, Dict) + for title, ratio in result.self_references.items(): + assert isinstance(title, str) + assert isinstance(ratio, float) + assert ratio >= 0 and ratio <= 100 + + assert result.self_reference_ratio >= 0 and result.self_reference_ratio <= 100 + print(result) \ No newline at end of file diff --git a/paperscraper/citations/utils.py b/paperscraper/citations/utils.py index f22a5d7..fa5c402 100644 --- a/paperscraper/citations/utils.py +++ b/paperscraper/citations/utils.py @@ -2,7 +2,7 @@ import os import re import sys -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Dict, List, Literal, Optional, Tuple import httpx import requests @@ -111,7 +111,7 @@ async def get_title_and_id_from_doi(doi: str) -> Dict[str, str] | None: @optional_async -async def author_name_to_ssaid(author_name: str) -> str: +async def author_name_to_ssaid(author_name: str) -> Optional[Tuple[str, str]]: """ Given an author name, returns the Semantic Scholar author ID. @@ -119,20 +119,21 @@ async def author_name_to_ssaid(author_name: str) -> str: author_name (str): The full name of the author. Returns: - str or None: The Semantic Scholar author ID or None if no author is found. + Tuple[str, str] or None: The SS author ID alongside the SS name (may differ + slightly from input name) or None if no author is found. """ async with httpx.AsyncClient(timeout=httpx.Timeout(20)) as client: response = await client.get( AUTHOR_URL, params={"query": author_name, "fields": "name", "limit": 1}, - headers={"x-api-key": os.getenv("SS_API_KEY")}, + headers=HEADERS, ) if response.status_code == 200: data = response.json() authors = data.get("data", []) if authors: # Return the Semantic Scholar author ID from the first result. - return authors[0].get("authorId") + return authors[0].get("authorId"), authors[0].get("name") logger.error( f"Error in retrieving name from SS Author ID: {response.status_code} - {response.text}"