I have a task to poll a website every second and check whether fresh news has appeared. The site also requires JavaScript execution, so the parser must render JS. I first tried requests_html, but as I found out the site uses HTTP/2, which that library does not support. I therefore decided to use pyppeteer for scraping. Here is my code:
import asyncio
import datetime
from csv import DictWriter
from time import time
from pyppeteer import launch
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
async def create_browser():
    """Launch a headless Chrome instance and return the browser handle."""
    launch_options = {
        'headless': True,
        'executablePath': '/usr/bin/google-chrome-stable',
    }
    return await launch(launch_options)
async def scrape_page(browser, ua):
    """Fetch the announcements page in a fresh tab and return its HTML.

    Args:
        browser: a pyppeteer Browser obtained from ``create_browser``.
        ua: a ``fake_useragent.UserAgent`` instance; ``ua.random`` is used
            so each request carries a different User-Agent string.

    Returns:
        Tuple ``(html_content, visit_time)`` — the rendered page source and
        the local timestamp taken right after navigation.
    """
    page = await browser.newPage()
    try:
        await page.setUserAgent(ua.random)
        # The trailing int(time()) is a cache-busting query parameter.
        await page.goto(f'https://announcements.bybit.com/en-US/?category=&page=1&{int(time())}')
        visit_time = datetime.datetime.now()
        html_content = await page.content()
    finally:
        # BUG FIX: the original never closed the tab, so a long-running
        # once-per-second polling loop leaked one page per iteration.
        await page.close()
    return html_content, visit_time
class Parsing:
    """Extracts the newest announcement title from the page HTML and appends
    any previously-unseen item to ``data/news.csv``.

    The instance remembers the last title it saw, so calling it repeatedly
    only records genuinely new announcements.
    """

    def __init__(self, domain):
        self.domain = domain           # base URL used to build absolute links
        self.last_news_title = ""      # newest title seen on the previous call

    def __call__(self, html_code, visit_time):
        """Parse ``html_code``; append a CSV row if the top title changed.

        Args:
            html_code: rendered page source (string).
            visit_time: timestamp to record alongside the news item.
        """
        soup = BeautifulSoup(html_code, 'lxml')
        news_data = soup.select_one('a.no-style span:only-child')
        if news_data is None:
            # BUG FIX: the original crashed with AttributeError if the page
            # layout changed or the selector matched nothing.
            return
        news_title = " ".join(news_data.text.strip().split())
        if news_title != self.last_news_title:
            self.last_news_title = news_title
            link = f"{self.domain}{news_data.find_parent('a')['href']}"
            row = {'time': visit_time, 'title': self.last_news_title, 'link': link}
            # newline='' is required by the csv module; utf-8 keeps Cyrillic
            # titles readable in the output file.
            with open('data/news.csv', 'a', newline='', encoding='utf-8') as file:
                writer = DictWriter(file, fieldnames=list(row.keys()), dialect='excel')
                if file.tell() == 0:
                    # BUG FIX: the original never emitted a header row, so
                    # the CSV columns were undocumented.
                    writer.writeheader()
                writer.writerow(row)
            print("Новая новость записана")
        else:
            print("Новых новостей нет")
async def job(parsing, browser, ua):
    """Run one polling iteration: scrape the page and feed it to `parsing`."""
    content, fetched_at = await scrape_page(browser, ua)
    parsing(content, fetched_at)
async def main():
    """Poll the announcements page once per second until interrupted,
    recording fresh news items and printing the elapsed time per cycle."""
    start = time()
    parsing = Parsing("https://announcements.bybit.com")
    our_browser = await create_browser()
    user_agent = UserAgent()
    try:
        while True:
            await job(parsing, our_browser, user_agent)
            await asyncio.sleep(1)
            print(time() - start)
    finally:
        # BUG FIX: the original used a bare `except:` here, which silently
        # swallowed every error (including genuine bugs) with no traceback.
        # `finally` still guarantees the browser is closed, but lets the
        # exception propagate so it is visible.
        await our_browser.close()
# Script entry point: run the polling loop on a fresh event loop.
if __name__ == '__main__':
    asyncio.run(main())
As I measured with time(), each check for new data on the site takes 3–4 seconds, which seems too long to me. I also implemented similar logic with Selenium (synchronously — as far as I understand, it doesn't support async) and it showed approximately the same results. Is my code actually running asynchronously?