I have a link: https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm
I want to increment the page number at the end of the link, like this: https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP2.htm
and then 3, 4, 5, and so on. My code is:
# -*- coding: utf-8 -*-
import scrapy
class GlassdoorSpider(scrapy.Spider):
    """Crawl Glassdoor.ca "data" job listings for Canada, one page at a time.

    Listing pages share one URL that differs only in the number following
    ``_IP`` (``..._IP1.htm``, ``..._IP2.htm``, ...). ``parse`` follows every
    job link found on a listing page and then requests the next listing page
    by incrementing that number.
    """

    name = 'glassdoor'

    # URL of a listing page; ``{page}`` is the 1-based page number.
    page_url_template = (
        'https://www.glassdoor.ca/Job/'
        'canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP{page}.htm'
    )
    start_urls = [page_url_template.format(page=1)]

    def parse(self, response):
        """Yield a request per job link on this page, then the next page.

        The current page number travels in ``response.meta['page']``; the
        initial request built from ``start_urls`` carries no such key, so it
        defaults to 1.
        """
        page = response.meta.get('page', 1)

        urls = response.css(
            'li.jl > div > div.flexbox > div > a::attr(href)'
        ).extract()
        for url in urls:
            # hrefs are site-relative; urljoin resolves them against the
            # URL of the page they were found on.
            yield scrapy.Request(
                url=response.urljoin(url),
                callback=self.parse_details,
            )

        # Only keep paginating while the current page actually listed jobs;
        # without this guard the crawl would request pages forever.
        # NOTE(review): this assumes an out-of-range page number returns a
        # page with no job links -- confirm against the live site.
        if urls:
            next_page = page + 1
            yield scrapy.Request(
                url=self.page_url_template.format(page=next_page),
                callback=self.parse,
                meta={'page': next_page},
            )

    def parse_details(self, response):
        """Yield the job title(s) scraped from a job detail page."""
        yield {
            'Job_Title': response.css(
                'div.header.cell.info > h2::text'
            ).extract()
        }
        self.log("reached22: " + response.url)
I want to increment the page number inside the variable next_page_url.
You are right that it is not found in the page source in the same place as when you inspect the page. However, you can see that it is present in the page source under
<head>
as
You can extract it using