55These functions are not intended for direct use by end-users, but are consumed
66by the main API interface.
77"""
8+
89from __future__ import annotations
910
1011import random
1819
1920# --- Module Constants ---
2021PUBCHEM_API_BASE = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
21- PUG_VIEW_BASE = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data"
22- REQUEST_TIMEOUT = 15
22+ PUG_VIEW_BASE = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data"
23+ REQUEST_TIMEOUT = 15
2324
2425MAX_RETRIES , INITIAL_BACKOFF , MAX_BACKOFF = 5 , 1 , 16
2526REQUEST_RATE_LIMIT = 5 # Requests per second
3233
3334# --- Session & Caching ---
3435
36+
3537def setup_cache (
3638 cache_name : str = "pubchem_cache" ,
3739 backend : str = "sqlite" ,
@@ -56,13 +58,18 @@ def setup_cache(
5658 """
5759 global _session
5860 _session = requests_cache .CachedSession (
59- cache_name = cache_name ,
60- backend = backend ,
61- expire_after = expire_after ,
62- allowable_codes = [200 , 404 , 503 ], # Cache "not found" and "server busy" responses
61+ cache_name = cache_name ,
62+ backend = backend ,
63+ expire_after = expire_after ,
64+ allowable_codes = [
65+ 200 ,
66+ 404 ,
67+ 503 ,
68+ ], # Cache "not found" and "server busy" responses
6369 ** kw ,
6470 )
6571
72+
6673def get_session () -> requests_cache .CachedSession :
6774 """
6875 Gets the current cached session, initializing it with defaults if necessary.
@@ -80,6 +87,7 @@ def get_session() -> requests_cache.CachedSession:
8087
8188# --- Core Fetching Logic ---
8289
90+
8391def _execute_fetch (url : str ) -> requests .Response :
8492 """
8593 Executes a single GET request using the global session.
@@ -91,7 +99,10 @@ def _execute_fetch(url: str) -> requests.Response:
9199 """
92100 return get_session ().get (url , timeout = REQUEST_TIMEOUT )
93101
94- def _fetch_with_ratelimit_and_retry (url : str ) -> dict [str , Any ] | list [Any ] | str | None :
102+
103+ def _fetch_with_ratelimit_and_retry (
104+ url : str ,
105+ ) -> dict [str , Any ] | list [Any ] | str | None :
95106 """
96107 Performs a GET request with rate-limiting and exponential backoff retry logic.
97108
@@ -135,23 +146,35 @@ def _fetch_with_ratelimit_and_retry(url: str) -> dict[str, Any] | list[Any] | st
135146 return None # Resource not found is a valid, final state.
136147
137148 if resp .status_code == 503 :
138- print (f"[ChemInformant] 503 Server Busy -> retry in { backoff :.1f} s" , file = sys .stderr )
149+ print (
150+ f"[ChemInformant] 503 Server Busy -> retry in { backoff :.1f} s" ,
151+ file = sys .stderr ,
152+ )
139153 else :
140154 resp .raise_for_status () # Trigger for other 4xx/5xx errors
141155
142156 except requests .exceptions .RequestException as e :
143- print (f"[ChemInformant] Network error { e } -> retry in { backoff :.1f} s" , file = sys .stderr )
157+ print (
158+ f"[ChemInformant] Network error { e } -> retry in { backoff :.1f} s" ,
159+ file = sys .stderr ,
160+ )
144161
145162 time .sleep (backoff )
146- backoff = min (MAX_BACKOFF , backoff * 2 ) + random .uniform (0 , 1 ) # Exponential backoff with jitter
163+ backoff = min (MAX_BACKOFF , backoff * 2 ) + random .uniform (
164+ 0 , 1
165+ ) # Exponential backoff with jitter
147166 retries += 1
148167
149- print (f"[ChemInformant] Giving up after { MAX_RETRIES } retries for URL: { url } " , file = sys .stderr )
168+ print (
169+ f"[ChemInformant] Giving up after { MAX_RETRIES } retries for URL: { url } " ,
170+ file = sys .stderr ,
171+ )
150172 return None
151173
152174
153175# --- Public-Facing Helper Functions ---
154176
177+
155178def get_cids_by_name (name : str ) -> list [int ] | None :
156179 """
157180 Fetches PubChem Compound IDs (CIDs) for a given chemical name.
@@ -175,10 +198,11 @@ def get_cids_by_name(name: str) -> list[int] | None:
175198 This function is used internally by get_properties() for name-to-CID resolution.
176199 End users should typically use get_properties() instead.
177200 """
178- url = f"{ PUBCHEM_API_BASE } /compound/name/{ quote (name )} /cids/JSON"
201+ url = f"{ PUBCHEM_API_BASE } /compound/name/{ quote (name )} /cids/JSON"
179202 data = _fetch_with_ratelimit_and_retry (url )
180203 return data .get ("IdentifierList" , {}).get ("CID" ) if isinstance (data , dict ) else None
181204
205+
182206def get_cids_by_smiles (smiles : str ) -> list [int ] | None :
183207 """
184208 Fetches PubChem Compound IDs (CIDs) for a given SMILES string.
@@ -204,11 +228,14 @@ def get_cids_by_smiles(smiles: str) -> list[int] | None:
204228 This function is used internally by get_properties() for SMILES-to-CID resolution.
205229 End users should typically use get_properties() instead.
206230 """
207- url = f"{ PUBCHEM_API_BASE } /compound/smiles/{ quote (smiles )} /cids/JSON"
231+ url = f"{ PUBCHEM_API_BASE } /compound/smiles/{ quote (smiles )} /cids/JSON"
208232 data = _fetch_with_ratelimit_and_retry (url )
209233 return data .get ("IdentifierList" , {}).get ("CID" ) if isinstance (data , dict ) else None
210234
211- def get_batch_properties (cids : list [int ], props : list [str ]) -> dict [int , dict [str , Any ]]:
235+
236+ def get_batch_properties (
237+ cids : list [int ], props : list [str ]
238+ ) -> dict [int , dict [str , Any ]]:
212239 """
213240 Fetches multiple properties for a batch of CIDs in a single request,
214241 handling API pagination automatically.
@@ -258,7 +285,10 @@ def get_batch_properties(cids: list[int], props: list[str]) -> dict[int, dict[st
258285
259286 # Loop as long as the API provides a ListKey for the next page
260287 while list_key :
261- print (f"[ChemInformant] Pagination detected, fetching next page with ListKey: { list_key } " , file = sys .stderr )
288+ print (
289+ f"[ChemInformant] Pagination detected, fetching next page with ListKey: { list_key } " ,
290+ file = sys .stderr ,
291+ )
262292 paginated_url = (
263293 f"{ PUBCHEM_API_BASE } /compound/listkey/{ list_key } "
264294 f"/property/{ ',' .join (props )} /JSON"
@@ -307,7 +337,7 @@ def get_cas_for_cid(cid: int) -> str | None:
307337 It may be slower than standard property queries as it accesses
308338 detailed compound records rather than the property API.
309339 """
310- url = f"{ PUG_VIEW_BASE } /compound/{ cid } /JSON"
340+ url = f"{ PUG_VIEW_BASE } /compound/{ cid } /JSON"
311341 data = _fetch_with_ratelimit_and_retry (url )
312342 if isinstance (data , dict ):
313343 for sec in data .get ("Record" , {}).get ("Section" , []):
@@ -317,11 +347,14 @@ def get_cas_for_cid(cid: int) -> str | None:
317347 for cas_sec in sub .get ("Section" , []):
318348 if cas_sec .get ("TOCHeading" ) == "CAS" :
319349 for info in cas_sec .get ("Information" , []):
320- markup = info .get ("Value" , {}).get ("StringWithMarkup" )
350+ markup = info .get ("Value" , {}).get (
351+ "StringWithMarkup"
352+ )
321353 if markup and isinstance (markup , list ) and markup :
322354 return markup [0 ].get ("String" )
323355 return None
324356
357+
325358def get_synonyms_for_cid (cid : int ) -> list [str ]:
326359 """
327360 Fetches all known synonyms (alternative names) for a given CID.
@@ -347,7 +380,7 @@ def get_synonyms_for_cid(cid: int) -> list[str]:
347380 This function is used internally by get_properties() and get_synonyms().
348381 The first synonym in the list is typically the preferred/most common name.
349382 """
350- url = f"{ PUBCHEM_API_BASE } /compound/cid/{ cid } /synonyms/JSON"
383+ url = f"{ PUBCHEM_API_BASE } /compound/cid/{ cid } /synonyms/JSON"
351384 data = _fetch_with_ratelimit_and_retry (url )
352385 if isinstance (data , dict ):
353386 info_list = data .get ("InformationList" , {}).get ("Information" , [])
0 commit comments