Merge pull request #416 from scholarly-python-package/develop

arunkannawadi · web-flow · commit e04a2d635c6a · 2022-05-02T03:26:37.000-04:00
Release v1.6.1
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -41,11 +41,6 @@ jobs:
           pip3 install -r requirements.txt
       #- name: Typilus, Suggest Python Type Annotations
       #  uses: typilus/typilus-action@v0.9
-      - name: Install Chrome
-        uses: browser-actions/setup-chrome@latest
-        continue-on-error: true
-        with:
-          fail_ci_if_error: false
       - name: Run unittests
         id: unittests
         continue-on-error: true
diff --git a/README.md b/README.md
@@ -37,18 +37,6 @@ This means your code that uses an earlier version of `scholarly` is guaranteed t
 
 ### Optional dependencies
 
-- **geckodriver** or **chrome-driver** provide the browser capabilities that may be needed to fully utilize the library.
-Installing at least one of `geckodriver` or `chrome-driver` if you need to fetch the complete list of co-authors from an author's profile.
-If neither installed, `scholarly` will fetch only up to 20 co-authors.
-
-    To install `geckodriver`, download the latest version from their [Github repo](https://github.com/mozilla/geckodriver/releases) and the executable should be in the system path.
-    Follow the appropriate installation instructions:
-    [macOS](https://stackoverflow.com/a/67211136) | [Ubuntu](https://askubuntu.com/a/871077) | [Windows](https://stackoverflow.com/a/56926716)
-
-    To install `chrome-driver`, [download](https://chromedriver.chromium.org/downloads) the ChromeDriver binary for your platform and include its location in the PATH environment variable.
-    See their [getting-started](https://chromedriver.chromium.org/getting-started) page for instructions.
-    Alternatively, if you are on Ubuntu, you can run [scripts/setup-chrome-ubuntu-latest.sh](scripts/setup-chrome-ubuntu-latest.sh) to install the latest version of ChromeDriver.
-
 - **Tor**:
 
     `scholarly` comes with a handful of APIs to set up proxies to circumvent anti-bot measures.
diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py
@@ -1,17 +1,14 @@
 from .publication_parser import PublicationParser
 import re
 from .data_types import Author, AuthorSource, PublicationSource, PublicAccess
-from selenium.webdriver.common.by import By
 import codecs
 
 _CITATIONAUTHRE = r'user=([\w-]*)'
 _HOST = 'https://scholar.google.com{0}'
 _PAGESIZE = 100
 _EMAILAUTHORRE = r'Verified email at '
 _CITATIONAUTH = '/citations?hl=en&user={0}'
-_COAUTH = ('https://scholar.google.com/citations?user={0}&hl=en'
-           '#d=gsc_md_cod&u=%2Fcitations%3Fview_op%3Dlist_colleagues'
-           '%26hl%3Den%26json%3D%26user%3D{0}%23t%3Dgsc_cod_lc')
+_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}'
 _MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}"
 
 
@@ -135,9 +132,9 @@ def _fill_public_access(self, soup, author):
         not_available = soup.find('div', class_='gsc_rsb_m_na')
         n_available, n_not_available = 0, 0
         if available:
-            n_available = int(available.text.split(" ")[0])
+            n_available = int(available.text.split(" ")[0].replace(",", ""))
         if not_available:
-            n_not_available = int(not_available.text.split(" ")[0])
+            n_not_available = int(not_available.text.split(" ")[0].replace(",", ""))
 
         author["public_access"] = PublicAccess(available=n_available,
                                                not_available=n_not_available)
@@ -198,7 +195,13 @@ def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_st
     def _get_coauthors_short(self, soup):
         """Get the short list of coauthors from the profile page.
 
-        To be called by _fill_coauthors method.
+        This method fetches the list of coauthors visible from an author's
+        profile page alone. This may or may not be the complete list of
+        coauthors.
+
+        Note:
+        -----
+        This method is to be called by _fill_coauthors method.
         """
         coauthors = soup.find_all('span', class_='gsc_rsb_a_desc')
         coauthor_ids = [re.findall(_CITATIONAUTHRE,
@@ -215,24 +218,24 @@ def _get_coauthors_short(self, soup):
     def _get_coauthors_long(self, author):
         """Get the long (>20) list of coauthors.
 
-        Opens the dialog box to get the complete list of coauthors.
-        To be called by _fill_coauthors method.
+        This method fetches the complete list of coauthors by opening a new
+        page filled with the complete coauthor list.
+
+        Note:
+        -----
+        This method is to be called by _fill_coauthors method.
         """
-        with self.nav.pm2._get_webdriver() as wd:
-            wd.get(_COAUTH.format(author['scholar_id']))
-            # Wait up to 30 seconds for the various elements to be available.
-            # The wait may be better set elsewhere.
-            wd.implicitly_wait(30)
-            coauthors = wd.find_elements(By.CLASS_NAME, 'gs_ai_pho')
-            coauthor_ids = [re.findall(_CITATIONAUTHRE,
-                            coauth.get_attribute('href'))[0]
-                            for coauth in coauthors]
-            coauthor_names = [name.text for name in
-                              wd.find_elements(By.CLASS_NAME, 'gs_ai_name')]
-            coauthor_affils = [affil.text for affil in
-                               wd.find_elements(By.CLASS_NAME, 'gs_ai_aff')]
-
-            return coauthor_ids, coauthor_names, coauthor_affils
+        soup = self.nav._get_soup(_COAUTH.format(author['scholar_id']))
+        coauthors = soup.find_all('div', 'gs_ai gs_scl')
+        coauthor_ids = [re.findall(_CITATIONAUTHRE,
+                        coauth('a')[0].get('href'))[0]
+                        for coauth in coauthors]
+
+        coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors]
+        coauthor_affils = [coauth.find(class_="gs_ai_aff").text
+                           for coauth in coauthors]
+
+        return coauthor_ids, coauthor_names, coauthor_affils
 
     def _fill_coauthors(self, soup, author):
         # If "View All" is not found, scrape the page for coauthors
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='scholarly',
-    version='1.6.0',
+    version='1.6.1',
     author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
     author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
     description='Simple access to Google Scholar authors and citations',
diff --git a/test_module.py b/test_module.py
@@ -518,12 +518,10 @@ def test_coauthors(self):
         author = scholarly.search_author_id('PA9La6oAAAAJ')
         scholarly.fill(author, sections=['basics', 'coauthors'])
         self.assertEqual(author['name'], "Panos Ipeirotis")
-        self.assertGreaterEqual(len(author['coauthors']), 20)
-        # Don't break the build if the long list cannot be fetch.
-        # Chrome/Geckodriver are mentioned only as optional dependencies.
-        if (len(author['coauthors']) > 20):
-            self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']])
-            self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']])
+        self.assertGreaterEqual(len(author['coauthors']), 66)
+        # Break the build if the long list cannot be fetched.
+        self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']])
+        self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']])
 
     def test_public_access(self):
         """
@@ -533,9 +531,9 @@ def test_public_access(self):
         100, thus requiring fetching information from a second page and 2) fill
         public access counts without fetching publications.
         """
-        author = scholarly.search_author_id("7x48vOkAAAAJ")
+        author = scholarly.search_author_id("f4KlrXIAAAAJ")
         scholarly.fill(author, sections=['basics', 'public_access', 'publications'])
-        self.assertGreaterEqual(author["public_access"]["available"], 110)
+        self.assertGreaterEqual(author["public_access"]["available"], 1180)
         self.assertEqual(author["public_access"]["available"],
                          sum(pub.get("public_access", None) is True for pub in author["publications"]))
         self.assertEqual(author["public_access"]["not_available"],