How can I increment the page number in the link?

I have a link: https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm

I want to increment it like this: https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP2.htm

then 3, 4, 5, and so on. My code is:

# -*- coding: utf-8 -*-
import scrapy


class GlassdoorSpider(scrapy.Spider):
    name = 'glassdoor'
    #allowed_domains = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11.htm']
    start_urls = ['https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP1.htm']

    def parse(self, response):
        #main_url = "https://www.glassdoor.ca"
        urls = response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract()

        for url in urls:
            url = "https://www.glassdoor.ca" + url
            yield scrapy.Request(url=url, callback=self.parse_details)

        next_page_url = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP"
        if next_page_url:
            #next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'Job_Title': response.css('div.header.cell.info > h2::text').extract()
        }
        self.log("reached22: " + response.url)

I want to increment the page number in the variable next_page_url.
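In other words, the pattern I'm after is roughly this, with the page number substituted into the URL before each request:

base = "https://www.glassdoor.ca/Job/canada-data-jobs-SRCH_IL.0,6_IN3_KE7,11_IP{}.htm"
for page in range(1, 4):
    # produces ..._IP1.htm, ..._IP2.htm, ..._IP3.htm
    print(base.format(page))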

There are 3 answers

Tomáš Linhart (best answer)

You're right that it isn't found in the page source in the same place as when you inspect the page. However, you can see it's present in the page source under <head> as:

<link rel="next" href="https://www.monster.ca/jobs/search/?q=data-analyst&amp;page=2" />

You can extract it using

next_page_link = response.xpath('//head/link[@rel="next"]/@href').extract_first()
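A minimal sketch of how this could replace the hard-coded next_page_url in the question's parse method (the job-link selector is the one from the question; response.urljoin resolves a relative href):

def parse(self, response):
    for url in response.css('li.jl > div > div.flexbox > div > a::attr(href)').extract():
        yield scrapy.Request(url="https://www.glassdoor.ca" + url, callback=self.parse_details)

    # follow the <link rel="next"> from <head> instead of building the URL by hand
    next_page_link = response.xpath('//head/link[@rel="next"]/@href').extract_first()
    if next_page_link:
        yield scrapy.Request(url=response.urljoin(next_page_link), callback=self.parse)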
MishaVacic

You need an XPath expression along these lines:

urls = response.xpath('//*[contains(@class,"next")]//@href')

Try it; it should work.
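For example, a sketch of plugging that selector into the spider's parse (whether an element with a class containing "next" actually exists on the target page is an assumption):

next_page = response.xpath('//*[contains(@class,"next")]//@href').extract_first()
if next_page:
    # resolve a possibly relative href against the current page's URL
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)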

parik

To get the second page you can use this:

import requests

headers = {
    'Pragma': 'no-cache',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
    'Connection': 'keep-alive',
    'Cache-Control': 'no-cache',
}
#for the other page, you should change page number
params = (
    ('q', 'data-analyst'),
    ('page', '2'),
)

r = requests.get('https://www.monster.ca/jobs/search/', headers=headers, params=params)
print(r.text)

To get all pages, you should first get the number of the last page, then loop over the rest:

for page_number in range(2, last_page + 1):
    # put page_number in params
    params = (('q', 'data-analyst'), ('page', str(page_number)))
    r = requests.get('https://www.monster.ca/jobs/search/', headers=headers, params=params)

UPDATE 1

ANOTHER SOLUTION


from scrapy import Request

# (both methods below live on the spider class)
def start_requests(self):
    request = Request("https://www.monster.ca/jobs/search/?q=data-analyst", callback=self.get_lastPage)
    yield request

def get_lastPage(self, response):
    headers = {
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://www.monster.ca/jobs/search/?q=data-analyst',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
    }
    # read the last page number from the hidden totalPages input
    last_page = response.css('input#totalPages::attr("value")').extract_first()
    for page_number in range(2, int(last_page) + 1):
        link = "https://www.monster.ca/jobs/search/?q=data-analyst&page=" + str(page_number)
        yield Request(link,
                      headers=headers,
                      callback=self.parse_product)
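parse_product isn't shown in the answer; a minimal hypothetical stub (the extraction logic is a placeholder) would be:

def parse_product(self, response):
    # placeholder: replace with the real per-page extraction logic
    self.log("reached: " + response.url)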