I am trying to scrape an API that returns a JSON object, but it only returns JSON the very first time; after that, it returns nothing. I am using the "if-none-match" header together with cookies, but I want to do it without cookies because I have many APIs of this kind to scrape.
Here is my spider code:
import scrapy
from scrapy import Spider, Request
import json
from scrapy.crawler import CrawlerProcess
# Extra request headers sent with every Shopee API call.
#
# Only real HTTP header fields belong here:
#   * 'authority', 'method', 'scheme' (and 'path') are HTTP/2 pseudo-headers
#     that browser dev tools display as ':authority', ':method', ... They are
#     derived from the request URL and must not be sent as ordinary headers.
#   * The former 'if-none-match-' key (note the trailing hyphen) is not a valid
#     header name, so it is dropped rather than sent as junk.
#   * 'accept-encoding' is left to Scrapy's HttpCompressionMiddleware, which
#     only advertises encodings it can actually decode. Hard-coding 'br' makes
#     the server free to answer with brotli, which stays undecoded (and breaks
#     JSON parsing) when no brotli library is installed.
header_data = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'x-shopee-language': 'en',
    'Cache-Control': 'max-age=0',
}
class TestSales(Spider):
    """Fetch one Shopee category listing from the JSON search API.

    ``start_requests`` issues a single GET against the ``search_items``
    endpoint for a hard-coded sub-category, and ``parse`` turns each entry of
    the returned ``items`` list into a plain dict item.
    """

    name = "testsales"
    # Bare domain names only: an entry carrying a URL path (such as
    # 'shopee.com.my/api/') can never match a request host.
    allowed_domains = ['shopee.com', 'shopee.com.my']
    # Browser session cookies captured earlier. They are not attached to any
    # request made by this spider; kept only so existing references keep working.
    cookie_string = {'SPC_U':'-', 'SPC_IA':'-1' , 'SPC_EC':'-' , 'SPC_F':'7jrWAm4XYNNtyVAk83GPknN8NbCMQEIk', 'REC_T_ID':'476673f8-eeb0-11ea-8919-48df374df85c', '_gcl_au':'1.1.1197882328.1599225148', '_med':'refer', '_fbp':'fb.2.1599225150134.114138691', 'language':'en', '_ga':'GA1.3.1167355736.1599225151', 'SPC_SI':'mall.gTmrpiDl24JHLSNwnCw107mao3hd8qGP', 'csrftoken':'2ntG40uuWzOLUsjv5Sn8glBUQjXtbGgo', 'welcomePkgShown':'true', '_gid':'GA1.3.590966412.1602427202', 'AMP_TOKEN':'%24NOT_FOUND', 'SPC_CT_21c6f4cb':'1602508637.vtyz9yfI6ckMZBdT9dlICuAYf7crlEQ6NwQScaB2VXI=', 'SPC_CT_087ee755':'1602508652.ihdXyWUp3wFdBN1FGrKejd91MM8sJHEYCPqcgmKqpdA=', '_dc_gtm_UA-61915055-6':'1', 'SPC_R_T_ID':'vT4Yxil96kYSRG2GIhtzk8fRJldlPJ1/szTbz9sG21nTJr4zDoOnnxFEgYe2Ea+RhM0H8q0m/SFWBMO7ktpU5Kim0CJneelIboFavxAVwb0=', 'SPC_T_IV':'hhHcCbIpVvuchn7SbLYeFw==', 'SPC_R_T_IV':'hhHcCbIpVvuchn7SbLYeFw==', 'SPC_T_ID':'vT4Yxil96kYSRG2GIhtzk8fRJldlPJ1/szTbz9sG21nTJr4zDoOnnxFEgYe2Ea+RhM0H8q0m/SFWBMO7ktpU5Kim0CJneelIboFavxAVwb0='}
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        # The initial download delay
        'AUTOTHROTTLE_START_DELAY': 0.5,
        # The maximum download delay to be set in case of high latencies
        'AUTOTHROTTLE_MAX_DELAY': 10,
        # The average number of requests Scrapy should be sending in parallel to
        # each remote server
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
        # 'DNSCACHE_ENABLED' : 'False',
        # 'COOKIES_ENABLED': 'False',
    }

    def start_requests(self):
        """Yield the single API request for the hard-coded sub-category.

        The category id is the last dot-separated component of the
        sub-category path and is passed to the API as ``match_id``.
        """
        subcat_url = '/Baby-Toddler-Play-cat.27.23785'
        category_id = subcat_url.split('.')[-1]

        # Work on a copy so the module-level header template is never mutated
        # between requests. No 'path' entry is added: it is an HTTP/2
        # pseudo-header derived from the URL, not a real header field.
        headers = dict(header_data)
        headers['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'

        url = (
            'https://shopee.com.my/api/v2/search_items/'
            f'?by=sales&limit=50&match_id={category_id}'
            '&newest=0&order=desc&page_type=search&version=2'
        )
        yield Request(
            url=url,
            headers=headers,
            cb_kwargs={'subcat': 'baby tobbler play cat', 'category': 'baby and toys'},
        )

    def parse(self, response, subcat, category):
        """Yield one dict per product found in the API response.

        Args:
            response: The Scrapy response for the ``search_items`` request.
            subcat: Sub-category label forwarded through ``cb_kwargs``.
            category: Category label forwarded through ``cb_kwargs``.

        Yields:
            A dict with the product name, thumbnail URL, category labels,
            price, sold count, shop location and stock. Nothing is yielded
            when the body is not valid JSON or carries no items.
        """
        try:
            payload = json.loads(response.body)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError; anything else is a real bug and should propagate.
            self.logger.error(
                'Response from %s is not valid JSON: %s', response.url, exc
            )
            self.logger.debug('Raw body: %r', response.body)
            return

        # Treat a missing or null "items" field as an empty result instead of
        # crashing with KeyError/TypeError.
        for product in payload.get('items') or []:
            image_path = product['image']
            yield {
                'name': product['name'],
                'image': f'https://cf.shopee.com.my/file/{image_path}_tn',
                'category': category,
                'subcategory': subcat,
                'monthly_sold': 'pending',
                # Same scaling as before; presumably the API reports prices
                # multiplied by 100000 - confirm against live data.
                'price': float(product['price']) / 100000,
                'total_sold': product['sold'],
                'location': product['shop_location'],
                'stock': product['stock'],
            }
# Start the crawl only when this file is executed as a script, so importing
# the module (e.g. from a test or another spider) has no side effects.
if __name__ == "__main__":
    app = CrawlerProcess()
    app.crawl(TestSales)
    # Blocks until the crawl has finished.
    app.start()
This is the page URL, which you can open in a browser: https://shopee.com.my/Baby-Toddler-Play-cat.27.23785?page=0&sortBy=sales
This is the API URL, which you can also find in the browser's developer tools for that page: https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=23785&newest=0&order=desc&page_type=search&version=2
Please tell me how to handle caching or 'if-none-match', because I can't understand how it works. Thanks in advance!
All you need to generate the API GET requests is the category identifier, which is the match_id parameter, and the start item number, which is the newest parameter.
Using the link template https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id={category_id}&newest={start_item_number}&order=desc&page_type=search&version=2 you can fetch any API category endpoint.
There's no need to manage cookies or even headers in this case. The API is not restrictive at all.
UPDATE:
This worked for me in scrapy shell: