How do I scrape all R&B songs from 2018-2023 from Spotify's API?

122 views Asked by At

I am new to programming and web-scraping, but I am trying to scrape audio features and collaborators for all R&B songs released onto Spotify from 2018-2023 using Spotipy. I understand that there are limits as for how many songs one can scrape, but I referenced an answer on how to solve this and tried to adapt the given Python code to my problem.

However, when I run the below script:

import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import re

# Spotify API Credentials and Constants
USER_ID = 'USER_ID'  
CLIENT_ID = 'CLIENT_ID'  
CLIENT_SECRET = 'CLIENT_SECRET'  
REDIRECT_URI = 'http://localhost:3000'  

# Spotify Scopes
SCOPE = [
    'user-library-read',
    'user-follow-read',
    'user-top-read',
    'playlist-read-private',
    'playlist-read-collaborative',
    'playlist-modify-public',
    'playlist-modify-private'
]

# Initialize Spotipy client
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    scope=SCOPE,
    username=USER_ID,
    redirect_uri=REDIRECT_URI,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET))

def get_categories():
    query_limit = 50
    categories = []
    new_offset = 0
    while True:
        results = sp.category_playlists(category_id='0JQ5DAqbMKFEZPnFQSFB1T', limit=query_limit, country='US', offset=new_offset)
        for item in results['playlists']['items']:
            if item and item.get('name') and item.get('tracks', {}).get('href'):
                tokens = re.split(r"[\/]", item['tracks']['href'])
                categories.append({
                    'id': item['id'],
                    'name': item['name'],
                    'url': item['external_urls']['spotify'],
                    'tracks': item['tracks']['href'],
                    'playlist_id': tokens[5],
                    'type': item['type']
                })
        new_offset += query_limit
        next_page = results['playlists']['next']
        if not next_page:
            break
    return categories

def get_songs(categories):
    songs = []
    for category in categories:
        playlist_id = category['playlist_id']
        results = sp.playlist_tracks(playlist_id=playlist_id)
        for item in results['items']:
            track = item['track']
            if track and 'id' in track and track['id']:
                release_date = track['album']['release_date']
                # Filter songs based on release date
                if '2018' <= release_date.split('-')[0] <= '2023':
                    features = sp.audio_features(track['id'])[0]
                    main_artist = track['artists'][0]['name']
                    featured_artists = ', '.join([artist['name'] for artist in track['artists'][1:]]) if len(track['artists']) > 1 else 'N/A'
                    is_featured = len(track['artists']) > 1
                    songs.append({
                        'track_name': track['name'],
                        'album_name': track['album']['name'],
                        'main_artist': main_artist,
                        'features_artist': is_featured,
                        'featured_artist': featured_artists,
                        'release_date': release_date,
                        'popularity_score': track['popularity'],
                        'explicit': track['explicit'],
                        'duration': track['duration_ms'],
                        'danceability': features['danceability'],
                        'energy': features['energy'],
                        'key': features['key'],
                        'loudness': features['loudness'],
                        'mode': features['mode'],
                        'speechiness': features['speechiness'],
                        'acousticness': features['acousticness'],
                        'instrumentalness': features['instrumentalness'],
                        'liveness': features['liveness'],
                        'valence': features['valence'],
                        'tempo': features['tempo'],
                    })
    return songs

# Fetch categories and songs
categories = get_categories()
songs = get_songs(categories)

# Create dataframe
df = pd.DataFrame(songs)
print(df.shape)
df.head()

I either get an error that I've done the Maximum Retries or the following error:

---------------------------------------------------------------------------
TimeoutError                              Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:467, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    463         except BaseException as e:
    464             # Remove the TypeError from the exception chain in
    465             # Python 3 (including for exceptions like SystemExit).
    466             # Otherwise it looks like a bug in the code.
--> 467             six.raise_from(e, None)
    468 except (SocketTimeout, BaseSSLError, SocketError) as e:

File <string>:3, in raise_from(value, from_value)

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:462, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    461 try:
--> 462     httplib_response = conn.getresponse()
    463 except BaseException as e:
    464     # Remove the TypeError from the exception chain in
    465     # Python 3 (including for exceptions like SystemExit).
    466     # Otherwise it looks like a bug in the code.

File ~/anaconda3/lib/python3.11/http/client.py:1378, in HTTPConnection.getresponse(self)
   1377 try:
-> 1378     response.begin()
   1379 except ConnectionError:

File ~/anaconda3/lib/python3.11/http/client.py:318, in HTTPResponse.begin(self)
    317 while True:
--> 318     version, status, reason = self._read_status()
    319     if status != CONTINUE:

File ~/anaconda3/lib/python3.11/http/client.py:279, in HTTPResponse._read_status(self)
    278 def _read_status(self):
--> 279     line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    280     if len(line) > _MAXLINE:

File ~/anaconda3/lib/python3.11/socket.py:706, in SocketIO.readinto(self, b)
    705 try:
--> 706     return self._sock.recv_into(b)
    707 except timeout:

File ~/anaconda3/lib/python3.11/ssl.py:1311, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1308         raise ValueError(
   1309           "non-zero flags not allowed in calls to recv_into() on %s" %
   1310           self.__class__)
-> 1311     return self.read(nbytes, buffer)
   1312 else:

File ~/anaconda3/lib/python3.11/ssl.py:1167, in SSLSocket.read(self, len, buffer)
   1166 if buffer is not None:
-> 1167     return self._sslobj.read(len, buffer)
   1168 else:

TimeoutError: The read operation timed out

During handling of the above exception, another exception occurred:

ReadTimeoutError                          Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    485 try:
--> 486     resp = conn.urlopen(
    487         method=request.method,
    488         url=url,
    489         body=request.body,
    490         headers=request.headers,
    491         redirect=False,
    492         assert_same_host=False,
    493         preload_content=False,
    494         decode_content=False,
    495         retries=self.max_retries,
    496         timeout=timeout,
    497         chunked=chunked,
    498     )
    500 except (ProtocolError, OSError) as err:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:799, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    797     e = ProtocolError("Connection aborted.", e)
--> 799 retries = retries.increment(
    800     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    801 )
    802 retries.sleep()

File ~/anaconda3/lib/python3.11/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    549 if read is False or not self._is_method_retryable(method):
--> 550     raise six.reraise(type(error), error, _stacktrace)
    551 elif read is not None:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
    769         raise value.with_traceback(tb)
--> 770     raise value
    771 finally:

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:715, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    714 # Make the request on the httplib connection object.
--> 715 httplib_response = self._make_request(
    716     conn,
    717     method,
    718     url,
    719     timeout=timeout_obj,
    720     body=body,
    721     headers=headers,
    722     chunked=chunked,
    723 )
    725 # If we're going to release the connection in ``finally:``, then
    726 # the response doesn't need to know about the connection. Otherwise
    727 # it will also try to release it and we'll have a double-release
    728 # mess.

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:469, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    468 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 469     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
    470     raise

File ~/anaconda3/lib/python3.11/site-packages/urllib3/connectionpool.py:358, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
    357 if isinstance(err, SocketTimeout):
--> 358     raise ReadTimeoutError(
    359         self, url, "Read timed out. (read timeout=%s)" % timeout_value
    360     )
    362 # See the above comment about EAGAIN in Python 3. In Python 2 we have
    363 # to specifically catch it and throw the timeout error

ReadTimeoutError: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
Cell In[16], line 95
     93 # Fetch categories and songs
     94 categories = get_categories()
---> 95 songs = get_songs(categories)
     97 # Create dataframe
     98 df = pd.DataFrame(songs)

Cell In[16], line 65, in get_songs(categories)
     63 # Filter songs based on release date
     64 if '2018' <= release_date.split('-')[0] <= '2023':
---> 65     features = sp.audio_features(track['id'])[0]
     66     main_artist = track['artists'][0]['name']
     67     featured_artists = ', '.join([artist['name'] for artist in track['artists'][1:]]) if len(track['artists']) > 1 else 'N/A'

File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:1734, in Spotify.audio_features(self, tracks)
   1732 if isinstance(tracks, str):
   1733     trackid = self._get_id("track", tracks)
-> 1734     results = self._get("audio-features/?ids=" + trackid)
   1735 else:
   1736     tlist = [self._get_id("track", t) for t in tracks]

File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:323, in Spotify._get(self, url, args, payload, **kwargs)
    320 if args:
    321     kwargs.update(args)
--> 323 return self._internal_call("GET", url, payload, kwargs)

File ~/anaconda3/lib/python3.11/site-packages/spotipy/client.py:266, in Spotify._internal_call(self, method, url, payload, params)
    262 logger.debug('Sending %s to %s with Params: %s Headers: %s and Body: %r ',
    263              method, url, args.get("params"), headers, args.get('data'))
    265 try:
--> 266     response = self._session.request(
    267         method, url, headers=headers, proxies=self.proxies,
    268         timeout=self.requests_timeout, **args
    269     )
    271     response.raise_for_status()
    272     results = response.json()

File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File ~/anaconda3/lib/python3.11/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File ~/anaconda3/lib/python3.11/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    530     raise SSLError(e, request=request)
    531 elif isinstance(e, ReadTimeoutError):
--> 532     raise ReadTimeout(e, request=request)
    533 elif isinstance(e, _InvalidHeader):
    534     raise InvalidHeader(e, request=request)

ReadTimeout: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)

Does anyone know how I can modify the below script to achieve my goal or a new approach?

1

There are 1 answers

2
Bench Vue On

Your audio_features() API call too many call error response from Spotify. This indicates that your app has reached our Web API rate limit

Max Retries reached
Failed to upload to call get_songs: http status: 429, code:-1 - /v1/audio-features/?ids=track_id:
 Max Retries, reason: too many 429 error responses
sp.audio_features(track['id'])[0]

So I update your code by removing audio_features() call.

This code get the all of 3360 R&B songs between 2018 and 2023

import spotipy
from spotipy.oauth2 import SpotifyOAuth
import json
import re

SCOPE = ['user-library-read',
    'user-follow-read',
    'user-top-read',
    'playlist-read-private',
    'playlist-read-collaborative',
    'playlist-modify-public',
    'playlist-modify-private']
USER_ID = 'USER_ID'
REDIRECT_URI = 'http://localhost:3000/callback'
CLIENT_ID = 'CLIENT_ID'
CLIENT_SECRET = 'CLIENT_SECRET'

auth_manager = SpotifyOAuth(
    scope=SCOPE,
    username=USER_ID,
    redirect_uri=REDIRECT_URI,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET)

def get_categories():
    try:
        sp = spotipy.Spotify(auth_manager=auth_manager)
        query_limit = 50
        categories=[]
        new_offset = 0
        while True:
            results=sp.category_playlists(category_id='0JQ5DAqbMKFEZPnFQSFB1T', limit = query_limit, country='US', offset=new_offset)
            for item in results['playlists']['items']:
                if (item is not None and item['name'] is not None):
                    tokens = re.split(r"[\/]", item['tracks']['href'])
                    categories.append({
                        'id' : item['id'],
                        'name': item['name'],
                        'url': item['external_urls']['spotify'],
                        'tracks': item['tracks']['href'],
                        'playlist_id': tokens[5],
                        'type': item['type']
                    })
            new_offset = new_offset + query_limit
            next = results['playlists']['next']
            if next is None:
                break
        return categories
    except Exception as e:
        print('Failed to upload to call get_categories: '+ str(e))

def get_songs(categories):
    try:
        sp = spotipy.Spotify(auth_manager=auth_manager)
        songs=[]
        for category in categories:
            if category is None:
                break
            playlist_id = category['playlist_id']
            results=sp.playlist(playlist_id=playlist_id)
            for item in results['tracks']['items']:
                if (item is not None and item['track'] is not None and item['track']['id'] is not None and item['track']['name'] is not None and item['track']['external_urls']['spotify'] is not None):
                    track = item['track']
                    release_date = track['album']['release_date']
                    if '2018' <= release_date.split('-')[0] <= '2023':
                        main_artist = track['artists'][0]['name']
                        featured_artists = ', '.join([artist['name'] for artist in track['artists'][1:]]) if len(track['artists']) > 1 else 'N/A'
                        is_featured = len(track['artists']) > 1
                        songs.append({
                            'track_name': track['name'],
                            'album_name': track['album']['name'],
                            'main_artist': main_artist,
                            'features_artist': is_featured,
                            'featured_artist': featured_artists,
                            'release_date': release_date,
                            'popularity_score': track['popularity'],
                            'explicit': track['explicit'],
                            'duration': track['duration_ms']
                        })
                else:
                    break
            print(len(songs))
        return songs
    except Exception as e:
        print('Failed to upload to call get_songs: '+ str(e))

def save_songs_to_file(songs, file_name):
    try:
        with open(file_name, 'w') as file:
            json.dump(songs, file)
    except Exception as e:
        print('Failed to save songs to file: ' + str(e))

try:
    categories = get_categories()
    songs = get_songs(categories)
    save_songs_to_file(songs, 'all_songs.json')
    print(len(songs)) # -> 3360
except Exception as e:
    print('Failed to run: ' + str(e))

Result

[
  {
    "track_name": "act ii: date @ 8",
    "album_name": "act ii: date @ 8",
    "main_artist": "4batz",
    "features_artist": false,
    "featured_artist": "N/A",
    "release_date": "2023-12-15",
    "popularity_score": 83,
    "explicit": true,
    "duration": 113684
  },
  {
    "track_name": "Sensational (feat. Davido & Lojay)",
    "album_name": "11:11",
    "main_artist": "Chris Brown",
    "features_artist": true,
    "featured_artist": "Davido, Lojay",
    "release_date": "2023-11-10",
    "popularity_score": 78,
    "explicit": true,
    "duration": 231109
  },
  {
    "track_name": "On and On",
    "album_name": "TYLA",
    "main_artist": "Tyla",
    "features_artist": false,
    "featured_artist": "N/A",
    "release_date": "2023-12-01",
    "popularity_score": 68,
    "explicit": false,
    "duration": 167746
  },
  {
    "track_name": "Me & U",
    "album_name": "Me & U",
    "main_artist": "Tems",
    "features_artist": false,
    "featured_artist": "N/A",
    "release_date": "2023-10-05",
    "popularity_score": 82,
    "explicit": false,
    "duration": 192935
  },

... removed most data

  {
    "track_name": "Rolling Thunder",
    "album_name": "Angels",
    "main_artist": "SALTI",
    "features_artist": false,
    "featured_artist": "N/A",
    "release_date": "2020-10-16",
    "popularity_score": 17,
    "explicit": true,
    "duration": 183440
  },
  {
    "track_name": "Love Me Like (feat. DUCKWRTH)",
    "album_name": "Love Me Like (feat. DUCKWRTH)",
    "main_artist": "Rayana Jay",
    "features_artist": true,
    "featured_artist": "Duckwrth",
    "release_date": "2019-03-15",
    "popularity_score": 43,
    "explicit": true,
    "duration": 164745
  },
  {
    "track_name": "Sweet Life",
    "album_name": "Sweet Life",
    "main_artist": "Seinabo Sey",
    "features_artist": true,
    "featured_artist": "waterbaby",
    "release_date": "2021-10-15",
    "popularity_score": 41,
    "explicit": false,
    "duration": 176760
  },
  {
    "track_name": "Thank You",
    "album_name": "Thank You",
    "main_artist": "Maya Delilah",
    "features_artist": false,
    "featured_artist": "N/A",
    "release_date": "2021-09-02",
    "popularity_score": 48,
    "explicit": false,
    "duration": 204592
  }
]

enter image description here