How do I properly paginate the results from the polygon.io API?


I'm trying to download all the minute bars between two dates for a list of stock symbols using polygon.io. According to their documentation, the API is limited to 50,000 results per request.

From their Github issues, I've found the following comment:

The aggregates endpoint does not have the next_url for pagination. Instead, if there are more than 50,000 messages in the response, you will need to query a smaller time frame of data. I recommend querying 1 months worth of minute bars per query.

So here's what I've done so far:

First, build a list of symbols:

from polygon import RESTClient
import os.path
from IPython.display import display
import pandas as pd

key = ''
all_tickers = []
df_list = []
final_df = []
from_ = '2021-05-01'
to = '2022-12-01'


def get_tickers():
    
    with RESTClient(key) as client:
        next_url = None
        while True:
            if next_url is None:
                tickers = client.reference_tickers_v3(type="CS")
            else:
                tickers = client._handle_response("ReferenceTickersV3ApiResponse", next_url, {})
            all_tickers.extend(tickers.results)
            if hasattr(tickers, 'next_url'):
                next_url = tickers.next_url
            else:
                break
    

file_name = 'tickers.csv'
if not os.path.exists(file_name):
    get_tickers()

    all_tickers_copy = pd.DataFrame(all_tickers)
    all_tickers_copy.to_csv(file_name, index=False)
else:
    all_tickers = pd.read_csv(file_name)
    all_tickers = all_tickers['ticker']

Next, build a list of the start and end days of each month between the from_ and to dates:

import pandas as pd

start_date, end_date = from_, to
dtrange = pd.date_range(start=start_date, end=end_date, freq='d')
months = pd.Series(dtrange.month)

starts, ends = months.ne(months.shift(1)), months.ne(months.shift(-1))
df = pd.DataFrame({'month_starting_date': dtrange[starts].strftime('%Y-%m-%d'),
                   'month_ending_date': dtrange[ends].strftime('%Y-%m-%d')})

# as a list of lists (the header row becomes row 0, which is why it is
# skipped below with months[1:]):
months = [df.columns.values.tolist()] + df.values.tolist()
months = pd.DataFrame(months)

I then have a function which loops through my symbols and makes an API request for every month between from_ and to:

def get_daily_agg(from_, to, ticker):
    with RESTClient(key) as client:
        folder_name = 'intraday_bars_gapped_new'
        final_df = pd.DataFrame([])

        try:
            # skip the header and loop through the rows
            for index, row in months[1:].iterrows():
                # save the start and end dates as variables
                from_ = row[0]
                to = row[1]
                print(f'{to} and {from_}')
                r = client.stocks_equities_aggregates(ticker, 1, "minute", from_, to, unadjusted=False, limit='50000')
                print(f'downloading {ticker} from {from_} to {to}')
                df = pd.DataFrame(r.results)
                df = df[['t','v','o','c','h','l', 'vw']]
                df.columns = ['datetime', 'volume','open','close','high', 'low', 'vwap']
                df['datetime'] = pd.to_datetime(df['datetime'],unit='ms')
                df['time'] = df['datetime'].dt.strftime("%H:%M:%S")
                df['date'] = df['datetime'].dt.strftime("%Y-%m-%d")

                # DataFrame.append returns a new frame, so accumulate with concat
                final_df = pd.concat([final_df, df], ignore_index=True)

        except Exception:
            print(f'nothing found for {ticker} from {from_} to {to}')

            
    os.makedirs(folder_name, exist_ok=True)
    final_df.to_csv('{}/{}.csv'.format(folder_name, ticker), index=False)

import glob
from pathlib import Path

folder = "daily_bars_filtered/*.csv"
for fname in glob.glob(folder)[:20]:
    ticker = Path(fname).stem
    get_daily_agg(from_, to, ticker)

My question is: how do I properly paginate the results from the polygon.io API?

There are 3 answers

Answer by WillGetItDunn

Update: this has been addressed. As of Polygon's Python client v1.10.1, the documentation includes examples like the following, where a for loop handles the paginated responses:

client = RESTClient()  # POLYGON_API_KEY environment variable is used

aggs = []
for a in client.list_aggs(
    "AAPL",
    1,
    "minute",
    "2022-01-01",
    "2023-02-03",
    limit=50000,
):
    aggs.append(a)

print(aggs)

(as found in the newer versions of the documentation, in examples/rest/stocks-aggregates_bars.py)

Here we can use list_aggs(), iterate over the paginated results, and process each item (in this example, by appending it to a list).
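
Since the question is specifically about minute bars between two dates, here is a minimal sketch of what that looks like with list_aggs(), assuming polygon-api-client >= 1.10.1 and a POLYGON_API_KEY environment variable; the attribute names (open, high, low, close, volume, vwap, timestamp) come from the client's Agg model, and the ticker and date range are just placeholders matching the question:

# A sketch, not the question's exact code: let list_aggs() follow the pagination
# cursor and reshape the minute bars into the columns used in the question.
from polygon import RESTClient
import pandas as pd

client = RESTClient()  # reads the POLYGON_API_KEY environment variable

rows = []
for a in client.list_aggs("AAPL", 1, "minute", "2021-05-01", "2022-12-01", limit=50000):
    rows.append({
        "datetime": a.timestamp,  # epoch milliseconds
        "volume": a.volume,
        "open": a.open,
        "close": a.close,
        "high": a.high,
        "low": a.low,
        "vwap": a.vwap,
    })

df = pd.DataFrame(rows)
df["datetime"] = pd.to_datetime(df["datetime"], unit="ms")
print(df.head())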

Previously...

Prior to the Python client support documented above, I considered two workarounds for accomplishing this. I'll preserve them here, especially since you may find using HTTP requests (instead of the Python client) more performant and/or more quickly adaptable to API updates.

Using HTTP requests instead of the Python client

The Polygon.io REST API can be accessed via a standard HTTP request, and the response can be parsed as JSON. If there is data, it will be under the JSON key 'results'. If there is an additional page, its URL will be under the JSON key 'next_url'. So you can make an HTTP request for the data, then use a while loop to make additional requests for each 'next_url' (if present).

For example:

from urllib3 import PoolManager
import json
import pandas as pd  # For examples of what to do with the results
def download_aggregates(
    api_key: str,
    ticker: str,
    start_date: str = None,
    end_date: str = None,
    time_span: str = "minute",
    multiplier: int = 1,
    adjusted: bool = False,
):
    # the API docs show lowercase 'true'/'false' for the adjusted flag
    params = (
        f"{ticker}/range/{multiplier}/{time_span}/{start_date}/{end_date}"
        f"?adjusted={str(adjusted).lower()}&sort=asc&limit=50000&apiKey={api_key}"
    )
    http_pool_manager = PoolManager()
    bars = http_pool_manager.request(
        "GET", f"https://api.polygon.io/v2/aggs/ticker/{params}"
    )
    response_json = json.loads(bars.data)

    if "results" not in response_json:
        return

    # Do stuff with results. For example, load the data into a dataframe:
    df = pd.DataFrame(response_json["results"])

    next_url = response_json["next_url"] if "next_url" in response_json else None
    while next_url:
        bars = http_pool_manager.request("GET", next_url + "&apiKey=" + api_key)
        response_json = json.loads(bars.data)

        # Do stuff with results, for example, append them to a dataframe
        if "results" in response_json:
            df = pd.concat(
                [df, pd.DataFrame(response_json["results"])],
                ignore_index=True,
                sort=False,
            )

        next_url = response_json["next_url"] if "next_url" in response_json else None

    return df
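
For example, a hypothetical call for the question's date range might look like this (the API key, ticker, and output filename are placeholders):

# Hypothetical usage of the download_aggregates() helper above; the key,
# ticker and output filename are placeholders, and the date range matches
# the question's from_/to values.
df = download_aggregates(
    api_key="YOUR_API_KEY",
    ticker="AAPL",
    start_date="2021-05-01",
    end_date="2022-12-01",
)
if df is not None:
    df["t"] = pd.to_datetime(df["t"], unit="ms")  # aggregate timestamps are epoch ms
    df.to_csv("AAPL_minute_bars.csv", index=False)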

Multiple requests divided by date ranges

As noted in the question, you can split your requests into date ranges so that each request returns fewer than the limit of 50,000 results. The major problem with this approach is that it can greatly increase the number of requests you need to make compared to using the 'next_url' cursor, especially considering that most polygon.io accounts are limited in the number of requests they can make in a given timeframe.

That said, it is doable. It's easiest to implement a solution that makes a request per month or per two months. Something like:

import calendar
from datetime import date
from typing import cast

from dateutil.rrule import rrule, MONTHLY
from polygon import RESTClient
from urllib3 import HTTPResponse

client = RESTClient(api_key='<your_api_key>')

ticker='<your_ticker>'

start_date = date(2021, 3, 1)
end_date = date(2023, 2, 28)

for d in rrule(freq=MONTHLY, dtstart=start_date, until=end_date):
    from_date = d.strftime("%Y-%m")+"-01"
    to_date = d.strftime("%Y-%m")+f"-{calendar.monthrange(d.year, d.month)[1]}"
    
    bars = cast(
        HTTPResponse,
        client.get_aggs(
            ticker=ticker,
            multiplier=1,
            timespan="minute",
            from_=from_date,
            to=to_date,
            limit=50000,
            raw=True,
        ),
    )

    # Operate on the returned data

You can probably halve the number of requests by fetching two months at a time. From a quick lookup over the last two years of data, the largest set of minute aggregates for a single ticker in a single month was AAPL, with 18,956 records between 2022-03-01 and 2022-03-31, so you can likely collect two months of data per request and still stay below the 50,000-record limit. If you use rrule as I did above, set the interval parameter to 2 (two months), or whatever the equivalent is for the library you are using; a sketch follows below.
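
As a rough sketch of the two-month variant (the API key and ticker are placeholders, and relativedelta is used here to find the last day of each two-month window):

# A sketch of the two-months-per-request variant; <your_api_key> and
# <your_ticker> are placeholders.
from datetime import date
from dateutil.relativedelta import relativedelta
from dateutil.rrule import rrule, MONTHLY
from polygon import RESTClient

client = RESTClient(api_key='<your_api_key>')
ticker = '<your_ticker>'

start_date = date(2021, 3, 1)
end_date = date(2023, 2, 28)

for d in rrule(freq=MONTHLY, interval=2, dtstart=start_date, until=end_date):
    window_start = d.date()
    # last day of the second month in the window, capped at end_date
    window_end = min(window_start + relativedelta(months=2, days=-1), end_date)

    bars = client.get_aggs(
        ticker=ticker,
        multiplier=1,
        timespan="minute",
        from_=window_start.strftime("%Y-%m-%d"),
        to=window_end.strftime("%Y-%m-%d"),
        limit=50000,
    )

    # Operate on the returned data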

But even at two months at a time, this method is horribly inefficient. There are tickers where you could get the entire 2 years of data in fewer than 3 requests via pagination, whereas fetching two months at a time would require 12 requests for the same data.

Answer by FreelanceConsultant

How to do it with standard requests until the Python Client is fixed:

I hate to do this to you, since I also want the Python Client to work properly so that I can use it; however, here is a way of doing it with standard HTTP requests.

I am posting this as an answer because:

  • It is simple
  • The HTTP API appears to be supported well (I haven't found issues with it)
  • however the Python Client appears to have some issues
  • I raised a support request regarding the Python Client; while waiting for a response on that, I am going to use standard HTTP(S) requests, as explained in this answer, because this is faster than waiting for support to reply

I have left some of the debugging comments in so that you can see what it is doing and, if it doesn't work, hopefully get a head start in working out what exactly has gone wrong.

  • In this example, I show how to use the Pagination API with a request to get a list of all the tickers.
  • Since the Pagination API is universal to the whole of Polygon.io API, this should work exactly the same way for all API endpoints.
  • If it doesn't, the most likely cause is that the JSON response is formatted slightly differently, and you will need to figure out how to extract the 'next_url' value.

#!/usr/bin/env python3

import time
import requests
import json


def main():

    api_key = 'PUT YOUR API KEY HERE'

    base_url = 'https://api.polygon.io/v3/reference/tickers'
    params = {
        'market': 'stocks',
        'active': 'true',
        'limit': '1000',
        'apiKey': api_key
    }

    print(f'get: {base_url}')
    response = requests.get(base_url, params=params)

    all_results = []

    # just for debugging and to check some sensible
    # sequence of tickers comes back
    ticker_pairs = []

    # don't expect to need this but useful to have
    # to prevent losing all data in case of failure
    try:

        # NOTE: other status codes might occur if rate
        # limit exceeded, these can be handled by waiting
        # for a period of time and simply re-trying the
        # request
        while response.status_code == 200:

            response_json = response.json()

            # this contains data about the next url
            # we get it using an alternative method
            links = response.links
            print(links)

            results = response_json['results'] if 'results' in response_json else None
            next_url = response_json['next_url'] if 'next_url' in response_json else None
            count = response_json['count'] if 'count' in response_json else None

            if results is None:
                print('results is None')
            if next_url is None:
                print('next_url is None')
            if count is None:
                print('count is none')

            # expect this on the last page, but not before
            if count != 1000:
                print(f'warning: count={count}')

            first_ticker = results[0]['ticker']
            last_ticker = results[-1]['ticker']

            ticker_pair = (first_ticker, last_ticker)
            ticker_pairs.append(ticker_pair)

            # to save individual fields / a subset of fields
            # NOTE: availability of fields may vary between tickers
            # for result in results:
            #     ticker              = result['ticker']
            #     name                = result['name']
            #     market              = result['market']
            #     locale              = result['locale']
            #     primary_exchange    = result['primary_exchange']
            #     type                = result['type']
            #     active              = result['active']
            #     currency_name       = result['currency_name']
            #     cik                 = result['cik']
            #     composite_figi      = result['composite_figi']
            #     share_class_figi    = result['share_class_figi']
            #     last_updated_utc    = result['last_updated_utc']

            # otherwise, to save all fields
            all_results.extend(results)

            # limit rate
            print(f'{ticker_pair} (rate limit)')
            time.sleep(15)

            if next_url is None:
                print(f'next_url is None, break')
                break

            # I don't have a better way of doing this figured out yet
            # Maybe: Parse the url to get the params, and build the
            # url as before?
            next_url_with_api_key = f'{next_url}&apiKey={api_key}'

            print(f'get: {next_url_with_api_key}')
            response = requests.get(next_url_with_api_key)

            print(f'status_code: {response.status_code}')

            if response.status_code != 200:
                print('error')
                print(response)
                # requests.Response objects are not subscriptable; print the body instead
                print(response.text)


    except Exception as exception:
        print(f'{exception}')


    # dump json for later processing
    filename_json = f'tickers.json'
    with open(filename_json, 'w') as ofile:
        json.dump(all_results, ofile, indent=4)

    # save ticker pairs for inspection
    filename_ticker_pairs_json = f'debug_ticker_pairs.json'
    with open(filename_ticker_pairs_json, 'w') as ofile:
        json.dump(ticker_pairs, ofile, indent=4)

    # return data in case you want to make this function
    # part of some other code which processes the returned
    # data rather than reading it back from disk
    return all_results


if __name__ == '__main__':
    main()

You could cut this example down further. You obviously do not need to:

  • save the "ticker pairs", but this is useful debugging information to see if the tickers being returned are in a sensible alphabetic order (they are)
  • dump the data to a JSON file at the end
  • you could dispense with the whole try statement as well, or move it to a higher level of the call stack

The last ticker I get is ZYXI, with the final count being 117.

When I get the Python Client working I will add another answer for this.

Answer by FreelanceConsultant

I posted a previous answer in which I said I had some issues with the Polygon Python Client.

After getting further information from support, it appeared that I was using an older, unofficial, and no longer supported client.

You want to be careful here: if you just search for "Polygon Python Client", the first results recommended may be the wrong (unofficial) client.

What makes matters worse is that the unofficial client took the most obvious name: it is the one you get with pip install polygon. (Don't install this!)

The official client (https://github.com/polygon-io/client-python) should be installed with pip install polygon-api-client.
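
If you are not sure which one you have, a quick check of the installed distributions (a sketch using importlib.metadata, Python 3.8+) is:

# Check which Polygon distribution(s) are installed (Python 3.8+):
# "polygon-api-client" is the official client, "polygon" is the unofficial one.
from importlib.metadata import PackageNotFoundError, version

for dist in ("polygon-api-client", "polygon"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")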


Using requests is probably faster

Having now played with both the requests approach and the Python Client, it seems the Python Client is much more limited in terms of the volume of data you can get out of it. This might not be a concern if you have a paid license. However, if you are just doing some testing with the free-tier API key (as I am), then you might not want to use the Python Client.

On the other hand, the Python Client appears to return more fields per result.

Here's a list of fields returned by the REST method:

  • ticker
  • name
  • market
  • locale
  • primary_exchange
  • type
  • active
  • currency_name
  • cik
  • composite_figi
  • share_class_figi
  • last_updated_utc

And a list of fields on the Ticker model (from polygon.rest.models.tickers import Ticker):

  • active
  • base_currency_name
  • base_currency_symbol
  • cik
  • composite_figi
  • currency_name
  • currency_symbol
  • delisted_utc
  • last_updated_utc
  • locale
  • market
  • name
  • primary_exchange
  • share_class_figi
  • source_feed
  • ticker
  • type

Whether or not all the data is populated in that Ticker structure, I am not sure yet. (I don't have all the data returned to me yet to inspect it.)

The plain-requests approach, as I mentioned, appears to be faster. I think this is because you are able to request 1,000 rows of data with each request, which is around 50k rows/minute with the free tier.

With the Python client, the limit appears to be 10 rows/request.


Python Client Code:

Note: as in my previous answer, I am collecting ticker information here rather than aggregates data; however, the principle is the same, since the RESTClient works the same way regardless of which endpoint you request. Here I show the use of both the get_ticker_types and list_tickers endpoints.

#!/usr/bin/env python3

from polygon import RESTClient
from polygon.rest.models.tickers import Ticker

import json
import time


def main():

    rest_client = RESTClient('YOUR API KEY HERE')

    ticker_types = rest_client.get_ticker_types()

    print('TICKER TYPES')
    for ticker_type in ticker_types:
        print(ticker_type)
    print('END')

    tickers_result = \
        rest_client.list_tickers(
            type="CS",
            limit=1000)

    ticker_dict_list: list[dict] = []

    for ticker_result in tickers_result:

        ticker: Ticker = ticker_result

        ticker_dict = {
            'active': ticker.active,
            'base_currency_name': ticker.base_currency_name,
            'base_currency_symbol': ticker.base_currency_symbol,
            'cik': ticker.cik,
            'composite_figi': ticker.composite_figi,
            'currency_name': ticker.currency_name,
            'currency_symbol': ticker.currency_symbol,
            'delisted_utc': ticker.delisted_utc,
            'last_updated_utc': ticker.last_updated_utc,
            'locale': ticker.locale,
            'market': ticker.market,
            'name': ticker.name,
            'primary_exchange': ticker.primary_exchange,
            'share_class_figi': ticker.share_class_figi,
            'source_feed': ticker.source_feed,
            'ticker': ticker.ticker,
            'type': ticker.type,
        }

        ticker_dict_list.append(ticker_dict)

        count = len(ticker_dict_list)

        print(f'ticker_dict_list count: {count}')

        # rate-limiting to 4 request/minute
        # each loop iteration appears to make 1 request
        # returning up to 10 rows of data
        if count % 10 == 0:
            time.sleep(15)

    with open('tickers.json', 'w') as ofile:
        json.dump(ticker_dict_list, ofile, indent=4)


if __name__ == '__main__':
    main()