Skip to content

Commit e04a2d6

Browse files
Merge pull request #416 from scholarly-python-package/develop
Release v1.6.1
2 parents 079e8be + 9e9a46d commit e04a2d6

5 files changed

Lines changed: 34 additions & 50 deletions

File tree

.github/workflows/pythonpackage.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ jobs:
4141
pip3 install -r requirements.txt
4242
#- name: Typilus, Suggest Python Type Annotations
4343
# uses: typilus/typilus-action@v0.9
44-
- name: Install Chrome
45-
uses: browser-actions/setup-chrome@latest
46-
continue-on-error: true
47-
with:
48-
fail_ci_if_error: false
4944
- name: Run unittests
5045
id: unittests
5146
continue-on-error: true

README.md

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,18 +37,6 @@ This means your code that uses an earlier version of `scholarly` is guaranteed t
3737

3838
### Optional dependencies
3939

40-
- **geckodriver** or **chrome-driver** provide the browser capabilities that may be needed to fully utilize the library.
41-
Install at least one of `geckodriver` or `chrome-driver` if you need to fetch the complete list of co-authors from an author's profile.
42-
If neither is installed, `scholarly` will fetch only up to 20 co-authors.
43-
44-
To install `geckodriver`, download the latest version from their [Github repo](https://github.com/mozilla/geckodriver/releases) and the executable should be in the system path.
45-
Follow the appropriate installation instructions:
46-
[macOS](https://stackoverflow.com/a/67211136) | [Ubuntu](https://askubuntu.com/a/871077) | [Windows](https://stackoverflow.com/a/56926716)
47-
48-
To install `chrome-driver`, [download](https://chromedriver.chromium.org/downloads) the ChromeDriver binary for your platform and include its location in the PATH environment variable.
49-
See their [getting-started](https://chromedriver.chromium.org/getting-started) page for instructions.
50-
Alternatively, if you are on Ubuntu, you can run [scripts/setup-chrome-ubuntu-latest.sh](scripts/setup-chrome-ubuntu-latest.sh) to install the latest version of ChromeDriver.
51-
5240
- **Tor**:
5341

5442
`scholarly` comes with a handful of APIs to set up proxies to circumvent anti-bot measures.

scholarly/author_parser.py

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
from .publication_parser import PublicationParser
22
import re
33
from .data_types import Author, AuthorSource, PublicationSource, PublicAccess
4-
from selenium.webdriver.common.by import By
54
import codecs
65

76
_CITATIONAUTHRE = r'user=([\w-]*)'
87
_HOST = 'https://scholar.google.com{0}'
98
_PAGESIZE = 100
109
_EMAILAUTHORRE = r'Verified email at '
1110
_CITATIONAUTH = '/citations?hl=en&user={0}'
12-
_COAUTH = ('https://scholar.google.com/citations?user={0}&hl=en'
13-
'#d=gsc_md_cod&u=%2Fcitations%3Fview_op%3Dlist_colleagues'
14-
'%26hl%3Den%26json%3D%26user%3D{0}%23t%3Dgsc_cod_lc')
11+
_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}'
1512
_MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}"
1613

1714

@@ -135,9 +132,9 @@ def _fill_public_access(self, soup, author):
135132
not_available = soup.find('div', class_='gsc_rsb_m_na')
136133
n_available, n_not_available = 0, 0
137134
if available:
138-
n_available = int(available.text.split(" ")[0])
135+
n_available = int(available.text.split(" ")[0].replace(",", ""))
139136
if not_available:
140-
n_not_available = int(not_available.text.split(" ")[0])
137+
n_not_available = int(not_available.text.split(" ")[0].replace(",", ""))
141138

142139
author["public_access"] = PublicAccess(available=n_available,
143140
not_available=n_not_available)
@@ -198,7 +195,13 @@ def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_st
198195
def _get_coauthors_short(self, soup):
199196
"""Get the short list of coauthors from the profile page.
200197
201-
To be called by _fill_coauthors method.
198+
This method fetches the list of coauthors visible from an author's
199+
profile page alone. This may or may not be the complete list of
200+
coauthors.
201+
202+
Note:
203+
-----
204+
This method is to be called by _fill_coauthors method.
202205
"""
203206
coauthors = soup.find_all('span', class_='gsc_rsb_a_desc')
204207
coauthor_ids = [re.findall(_CITATIONAUTHRE,
@@ -215,24 +218,24 @@ def _get_coauthors_short(self, soup):
215218
def _get_coauthors_long(self, author):
216219
"""Get the long (>20) list of coauthors.
217220
218-
Opens the dialog box to get the complete list of coauthors.
219-
To be called by _fill_coauthors method.
221+
This method fetches the complete list of coauthors by opening a new
222+
page filled with the complete coauthor list.
223+
224+
Note:
225+
-----
226+
This method is to be called by _fill_coauthors method.
220227
"""
221-
with self.nav.pm2._get_webdriver() as wd:
222-
wd.get(_COAUTH.format(author['scholar_id']))
223-
# Wait up to 30 seconds for the various elements to be available.
224-
# The wait may be better set elsewhere.
225-
wd.implicitly_wait(30)
226-
coauthors = wd.find_elements(By.CLASS_NAME, 'gs_ai_pho')
227-
coauthor_ids = [re.findall(_CITATIONAUTHRE,
228-
coauth.get_attribute('href'))[0]
229-
for coauth in coauthors]
230-
coauthor_names = [name.text for name in
231-
wd.find_elements(By.CLASS_NAME, 'gs_ai_name')]
232-
coauthor_affils = [affil.text for affil in
233-
wd.find_elements(By.CLASS_NAME, 'gs_ai_aff')]
234-
235-
return coauthor_ids, coauthor_names, coauthor_affils
228+
soup = self.nav._get_soup(_COAUTH.format(author['scholar_id']))
229+
coauthors = soup.find_all('div', 'gs_ai gs_scl')
230+
coauthor_ids = [re.findall(_CITATIONAUTHRE,
231+
coauth('a')[0].get('href'))[0]
232+
for coauth in coauthors]
233+
234+
coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors]
235+
coauthor_affils = [coauth.find(class_="gs_ai_aff").text
236+
for coauth in coauthors]
237+
238+
return coauthor_ids, coauthor_names, coauthor_affils
236239

237240
def _fill_coauthors(self, soup, author):
238241
# If "View All" is not found, scrape the page for coauthors

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.6.0',
8+
version='1.6.1',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
1111
description='Simple access to Google Scholar authors and citations',

test_module.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -518,12 +518,10 @@ def test_coauthors(self):
518518
author = scholarly.search_author_id('PA9La6oAAAAJ')
519519
scholarly.fill(author, sections=['basics', 'coauthors'])
520520
self.assertEqual(author['name'], "Panos Ipeirotis")
521-
self.assertGreaterEqual(len(author['coauthors']), 20)
522-
# Don't break the build if the long list cannot be fetched.
523-
# Chrome/Geckodriver are mentioned only as optional dependencies.
524-
if (len(author['coauthors']) > 20):
525-
self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']])
526-
self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']])
521+
self.assertGreaterEqual(len(author['coauthors']), 66)
522+
# Break the build if the long list cannot be fetched.
523+
self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']])
524+
self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']])
527525

528526
def test_public_access(self):
529527
"""
@@ -533,9 +531,9 @@ def test_public_access(self):
533531
100, thus requiring fetching information from a second page and 2) fill
534532
public access counts without fetching publications.
535533
"""
536-
author = scholarly.search_author_id("7x48vOkAAAAJ")
534+
author = scholarly.search_author_id("f4KlrXIAAAAJ")
537535
scholarly.fill(author, sections=['basics', 'public_access', 'publications'])
538-
self.assertGreaterEqual(author["public_access"]["available"], 110)
536+
self.assertGreaterEqual(author["public_access"]["available"], 1180)
539537
self.assertEqual(author["public_access"]["available"],
540538
sum(pub.get("public_access", None) is True for pub in author["publications"]))
541539
self.assertEqual(author["public_access"]["not_available"],

0 commit comments

Comments
 (0)