Prevent 503 Error when scraping Google Scholar


I have written the following code to scrape data from the Google Scholar "security" label page. However, whenever I run it I receive this error:

 Traceback (most recent call last):
  File "/Users/.../Documents/GS_Tag_Scraper/scrape-modified.py", line 53, in <module>
    getProfileFromTag(each)
  File "/Users/.../Documents/GS_Tag_Scraper/scrape-modified.py", line 32, in getProfileFromTag
    page = urllib.request.urlopen(url)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 163, in urlopen
    return opener.open(url, data, timeout)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 472, in open
    response = meth(req, response)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 582, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 504, in error
    result = self._call_chain(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 696, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 472, in open
    response = meth(req, response)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 582, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 510, in error
    return self._call_chain(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 444, in _call_chain
    result = func(*args)
  File "/Users/.../anaconda/lib/python3.5/urllib/request.py", line 590, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable

I presume this is because GS is blocking my requests. How can I prevent this?

The code is:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import csv
import time

# Lists to store the scraped data
name = []
urlList = []

# Open the CSV file and write its header row
outputFile = open('sample.csv', 'w', newline='')
outputWriter = csv.writer(outputFile)
outputWriter.writerow(['Name', 'URL', 'Total Citations', 'h-index', 'i10-index'])

def getStat(url):
    # Given an author's relative URL, return a list of citation statistics.
    url = 'https://scholar.google.pl' + url
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, 'lxml')
    stats = soup.findAll("td", {"class": "gsc_rsb_std"})
    return [stat.text for stat in stats]

def getProfileFromTag(tag):
    url = "http://scholar.google.pl/citations?view_op=search_authors&hl=pl&mauthors=label:" + tag
    while True:
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'lxml')

        mydivs = soup.findAll("h3", {"class": "gsc_1usr_name"})
        for each in mydivs:
            for anchor in each.find_all('a'):
                name.append(anchor.text)
                urlList.append(anchor['href'])
                time.sleep(0.001)
        # "Następna" is the Polish label of the "Next" button
        buttons = soup.findAll("button", {"aria-label": "Następna"})
        if not buttons:
            break
        # Build the next page's URL from the button's onclick handler
        on_click = buttons[0].get('onclick')
        url = 'http://scholar.google.pl' + on_click[17:-1]
        url = url.encode('utf-8').decode('unicode_escape')
    for i, each in enumerate(name):
        stats = getStat(urlList[i])
        outputWriter.writerow([each, urlList[i], stats[0], stats[2], stats[4]])

tags = ['security']
for each in tags:
    getProfileFromTag(each)

There are 2 answers

Answer by AudioBubble

Use requests along with appropriate request headers instead.

import requests
from bs4 import BeautifulSoup

url = 'https://scholar.google.pl/citations?view_op=search_authors&mauthors=label:security'

request_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

with requests.Session() as s:
    r = s.get(url, headers=request_headers)
    soup = BeautifulSoup(r.text, 'lxml')  # parse the page so the selectors below can run

The result you get:

Adrian Perrig    /citations?user=n-Oret4AAAAJ&hl=pl
Vern Paxson      /citations?user=HvwPRJ0AAAAJ&hl=pl
Frans Kaashoek   /citations?user=YCoLskoAAAAJ&hl=pl
Mihir Bellare    /citations?user=2pW1g5IAAAAJ&hl=pl
Xuemin Shen      /citations?user=Bjl3GwoAAAAJ&hl=pl
Helen J. Wang    /citations?user=qhu-DxwAAAAJ&hl=pl
Sushil Jajodia   /citations?user=lOZ1vHIAAAAJ&hl=pl
Martin Abadi     /citations?user=vWTI60AAAAAJ&hl=pl
Jean-Pierre Hubaux   /citations?user=W7YBLlEAAAAJ&hl=pl
Ross Anderson    /citations?user=WgyDcoUAAAAJ&hl=pl

using this:

users = soup.findAll('h3', {'class': 'gsc_oai_name'})
for user in users:
    name = user.a.text.strip()
    link = user.a['href']
    print(name, '\t', link)

You can find the headers that the browser sends by studying the network tab of Chrome's developer tools.
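Even with browser-like headers, Scholar can still answer with 503 if requests arrive too quickly. Below is a minimal sketch of pausing and retrying with a growing delay when a non-200 status comes back; the retry count and delay values are arbitrary illustrative choices, not something taken from the answer above:

import time
import requests

request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'accept-language': 'en-US,en;q=0.8',
}

def fetch(session, url, retries=3, delay=5):
    # Return the page HTML, sleeping and retrying when Scholar responds with an error status.
    for attempt in range(retries):
        r = session.get(url, headers=request_headers)
        if r.status_code == 200:
            return r.text
        # 503/429 usually signals rate limiting; wait longer before each new attempt
        time.sleep(delay * (attempt + 1))
    r.raise_for_status()

with requests.Session() as s:
    html = fetch(s, 'https://scholar.google.pl/citations?view_op=search_authors&mauthors=label:security')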

Answer by Dmitriy Zub

I assume you tried to parse the next-page token. If not, one cause is that you didn't pass the next-page token ID to the follow-up request once it was parsed. Otherwise, you hit the IP rate limit or were served a CAPTCHA; request headers alone are unfortunately not enough.

The ideal solution, besides passing the next-page token, is to use proxies together with a CAPTCHA-solving service.
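For illustration only, this is roughly how a proxy can be passed to requests; the proxy address and credentials below are placeholders, and a real setup would typically rotate residential proxies or route traffic through a CAPTCHA-solving provider:

import requests

# placeholder proxy address and credentials; replace with a real (ideally rotating) proxy
proxies = {
    "http": "http://user:password@proxy.example.com:8080",
    "https": "http://user:password@proxy.example.com:8080",
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582",
}

response = requests.get(
    "https://scholar.google.com/citations",
    params={"view_op": "search_authors", "mauthors": "blizzard", "hl": "en"},
    headers=headers,
    proxies=proxies,
    timeout=30,
)
print(response.status_code)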


Code and example in the online IDE:

from bs4 import BeautifulSoup
import requests, lxml, re


def scrape_all_authors():
    params = {
        "view_op": "search_authors",
        "mauthors": "blizzard",
        "hl": "en",
        "astart": 0
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582",
    }

    authors_is_present = True
    while authors_is_present:

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for author in soup.select(".gs_ai_chpr"):
            name = author.select_one(".gs_ai_name a").text
            link = f'https://scholar.google.com{author.select_one(".gs_ai_name a")["href"]}'
            affiliations = author.select_one(".gs_ai_aff").text
            email = author.select_one(".gs_ai_eml").text
            try:
                cited_by = re.search(r"\d+", author.select_one(".gs_ai_cby").text).group()  # Cited by 17143 -> 17143
            except AttributeError:
                cited_by = None

            print(f"extracting authors at page #{params['astart']}.",
                  name,
                  link,
                  affiliations,
                  email,
                  cited_by, sep="\n")

        # if there is a next page token, extract it and request the next page
        next_button = soup.select_one("button.gs_btnPR")
        if next_button and next_button.get("onclick"):
            params["after_author"] = re.search(r"after_author\\x3d(.*)\\x26", str(next_button["onclick"])).group(1)  # -> XB0HAMS9__8J
            params["astart"] += 10
        else:
            authors_is_present = False

scrape_all_authors()


# output:
'''
extracting authors at page #0.
Johnson Thomas
https://scholar.google.com/citations?hl=en&user=eKLr0EgAAAAJ
Professor of Computer Science, Oklahoma State University
Verified email at cs.okstate.edu
159469
...

extracting authors at page #60.
Bulent Sankur
https://scholar.google.com/citations?hl=en&user=z9FUD8QAAAAJ
Professor of Electrical and Electronics Engineering, Bogazici University
Verified email at boun.edu.tr
16953
'''

Alternatively, you can achieve the same thing with the Google Scholar Profiles API from SerpApi. It's a paid API with a free plan.

The difference is that you only need to iterate over the returned dictionary and grab the data you want, without having to figure out how to scale the number of requests or how to bypass blocks from search engines.

Example code to integrate:

from serpapi import GoogleSearch
import os, json
from urllib.parse import urlsplit, parse_qsl


def serpapi_scrape_all_authors():
    params = {
        "api_key": os.getenv("API_KEY"),      # SerpApi API key
        "engine": "google_scholar_profiles",  # profile results search engine
        "mauthors": "blizzard",               # search query
    }
    search = GoogleSearch(params)

    profile_results_data = []

    profiles_is_present = True
    while profiles_is_present:
        profile_results = search.get_dict()

        for profile in profile_results["profiles"]:

            print(f'Currently extracting {profile["name"]} with {profile["author_id"]} ID.')

            thumbnail = profile["thumbnail"]
            name = profile["name"]
            link = profile["link"]
            author_id = profile["author_id"]
            affiliations = profile["affiliations"]
            email = profile.get("email")
            cited_by = profile.get("cited_by")
            interests = profile.get("interests")

            profile_results_data.append({
                "thumbnail": thumbnail,
                "name": name,
                "link": link,
                "author_id": author_id,
                "email": email,
                "affiliations": affiliations,
                "cited_by": cited_by,
                "interests": interests
            })

            if "next" in profile_results["pagination"]:
                # split URL in parts as a dict() and update search "params" variable to a new page
                search.params_dict.update(dict(parse_qsl(urlsplit(profile_results["pagination"]["next"]).query)))
            else:
                profiles_is_present = False

    return profile_results_data

print(json.dumps(serpapi_scrape_all_authors(), indent=2))


# output:
'''
Currently extracting Adam Lobel with _xwYD2sAAAAJ ID.
...
Currently extracting Vladimir Ivanov with rddjbZcAAAAJ ID.

[
  {
    "thumbnail": "https://scholar.googleusercontent.com/citations?view_op=small_photo&user=_xwYD2sAAAAJ&citpid=3",
    "name": "Adam Lobel",
    "link": "https://scholar.google.com/citations?hl=en&user=_xwYD2sAAAAJ",
    "author_id": "_xwYD2sAAAAJ",
    "email": "Verified email at AdamLobel.com",
    "affiliations": "Blizzard Entertainment",
    "cited_by": 2980,
    "interests": [
      {
        "title": "Gaming",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Agaming",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:gaming"
      },
      {
        "title": "Emotion regulation",
        "serpapi_link": "https://serpapi.com/search.json?engine=google_scholar_profiles&hl=en&mauthors=label%3Aemotion_regulation",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:emotion_regulation"
      }
    ]
  } ... other results
  {
    "thumbnail": "https://scholar.google.com/citations/images/avatar_scholar_56.png",
    "name": "Vladimir Ivanov",
    "link": "https://scholar.google.com/citations?hl=en&user=rddjbZcAAAAJ",
    "author_id": "rddjbZcAAAAJ",
    "email": null,
    "affiliations": "Blizzard Entertainment",
    "cited_by": null,
    "interests": [
      {
        "title": "Machine Learning",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Amachine_learning",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:machine_learning"
      },
      {
        "title": "Reinforcement Learning",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Areinforcement_learning",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:reinforcement_learning"
      },
      {
        "title": "Computer Vision",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Acomputer_vision",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:computer_vision"
      },
      {
        "title": "Cinematics",
        "serpapi_link": "https://serpapi.com/search.json?after_author=V8JcAPb___8J&engine=google_scholar_profiles&hl=en&mauthors=label%3Acinematics",
        "link": "https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=label:cinematics"
      }
    ]
  }
]
'''

If you would like to parse historic organic results from Google Scholar, I have a dedicated blog post, Scrape historic Google Scholar results using Python.

Disclaimer: I work for SerpApi.