requests_html not returning/rendering the whole web page

281 views Asked by At

I am trying to parse a Google Shopping page, and I want to do it faster than Selenium can. I stumbled across requests_html (the requests-html library) and it has been working pretty well. I have almost everything I need from it, except for one element that it isn't parsing from the page. If you go to this Google Shopping page, you will notice that you can hover over some of the product images and see a second one. I am parsing the information for each product, but when it comes to the two images, for some reason requests_html only retrieves the second (hovered) image and not the first (main) one. I have attached my code below. I have been trying to find a good way to represent the output of requests_html to show what it IS retrieving, but I haven't found a way to print it in a readable manner. To my knowledge, requests_html can render JavaScript on pages, so in my case it is just weird that it is getting everything but the first image. I inspected the page to view its HTML, and the div class of the image I am trying to get is '.gOenxf'. Why is requests_html not rendering the first image of each product?

for google_post in google_initiate(request):
    # Parse the fields needed from each product element returned by
    # google_initiate (loop body omitted in this excerpt).


def google_initiate(request):
    """Fetch the Google Shopping results page and return its product elements.

    A ``SearchForm`` is built from ``request.POST``. When the form is valid,
    the hard-coded shopping search URL is fetched with ``HTMLSession`` and
    every element matching ``.sh-dgr__gr-auto.sh-dgr__grid-result`` is
    returned.

    Args:
        request: The incoming request; only ``request.POST`` is read.

    Returns:
        The result of ``response.html.find(...)`` for the product-grid
        selector, or an empty list when the form is not valid.
    """
    form = SearchForm(request.POST or None)
    if not form.is_valid():
        # ``url`` used to be bound only inside the ``if``, so an invalid form
        # ended in UnboundLocalError after a session had already been opened.
        return []

    url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'

    # The context managers close the response first and then the session,
    # even when the request or the element lookup raises.
    with HTMLSession() as session, session.get(url) as response:
        print(response.html)
        # NOTE: the page is not rendered here, so only elements present in the
        # raw HTTP response can be matched.
        return response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')

UPDATE:

import requests
from requests_html import HTMLSession

# Seller labels that cause the secondary-website field to be blanked.
_EXCLUDED_SELLERS = (
    'Amazon',
    'eBay',
    'Walmart',
    'AliExpress',
    'Craigslist',
    'Facebook Marketplace',
    'Oodle',
)

# Build one tuple per product element and append it to google_final_postings.
# Every field is extracted on a best-effort basis: any failure while reading
# a field falls back to that field's default instead of aborting the loop.
# ``except Exception`` replaces the bare ``except`` so that KeyboardInterrupt
# and SystemExit are no longer swallowed.
for google_post in google_initiate(request):
    post_website = 'Google'
    # Attribute read from the function object itself; presumably assigned
    # elsewhere -- TODO confirm.
    post_parse_page = google_initiate.google_parse_page

    # Product title.
    try:
        post_title = google_post.find('.Xjkr3b', first=True).text
    except Exception:
        post_title = ''

    # Product link: sliced out of the string form of the matched elements,
    # between "href='" and the last single quote.
    try:
        post_url = str(google_post.find('.xCpuod'))
        post_url = 'https://www.google.com' + post_url[post_url.find("href='") + len("href='"):post_url.rfind("'")]
    except Exception:
        post_url = ''

    # Seller label; blanked when it mentions one of the excluded sellers.
    try:
        post_second_website = google_post.find('.aULzUe.IuHnof', first=True).text
        if any(seller in post_second_website for seller in _EXCLUDED_SELLERS):
            post_second_website = ''
    except Exception:
        post_second_website = ''

    # Seller link: text after "href='/url?url=", cut at the first '%'.
    try:
        post_second_url = str(google_post.find('.shntl'))
        post_second_url = post_second_url[post_second_url.find("href='/url?url=") + len("href='/url?url="):post_second_url.rfind("'")]
        if '%' in post_second_url:
            post_second_url = post_second_url.split('%')[0]
    except Exception:
        post_second_url = ''

    # Secondary (hover) image, taken from the data-image-src attribute text.
    try:
        post_second_image_url = str(google_post.find('img'))
        if 'encrypted' in post_second_image_url:
            # NOTE(review): rfind('') returns len(string), so this slice runs
            # to the end of the text; the sibling fields use rfind("'").
            # Left unchanged -- confirm which was intended.
            post_second_image_url = post_second_image_url[post_second_image_url.find("data-image-src='") + len("data-image-src='"):post_second_image_url.rfind('')]
        else:
            post_second_image_url = NO_IMAGE
    except Exception:
        post_second_image_url = ''

    # Price: keep the first whitespace-separated token, pad it to two decimal
    # places, then derive a float sort key from the text after '$'.
    # The original nested two try blocks with identical fallbacks (and the
    # inner ``try:`` line carried stray text that made it a syntax error);
    # a single handler gives the same result.
    try:
        post_price = google_post.find('.a8Pemb.OFFNJ', first=True).text
        post_price = str(post_price.split()[0])
        if '.' not in post_price:
            post_price = post_price + '.00'
        elif len(post_price.split('.')[1]) == 1:
            post_price = post_price + '0'
        elif len(post_price.split('.')[1]) == 0:
            post_price = post_price + '00'
        post_sort_by = post_price.replace(',', '')  # drop thousands separators
        post_sort_by = float(post_sort_by.split('$')[1])
    except Exception:
        post_price = 'n/a'
        post_sort_by = ''

    # Rating text.
    try:
        post_rating = google_post.find('.Rsc7Yb', first=True).text
    except Exception:
        post_rating = ''

    # Rating count: second whitespace-separated token of the matched text.
    try:
        post_rating_quantity = google_post.find('.NzUzee', first=True).text
        post_rating_quantity = str(post_rating_quantity.split()[1])
    except Exception:
        post_rating_quantity = ''

    # Main image, taken from the src attribute text of the '.gOenxf' match.
    try:
        post_image_url = str(google_post.find('.gOenxf'))
        if 'encrypted' in post_image_url:
            post_image_url = post_image_url[post_image_url.find("src='") + len("src='"):post_image_url.rfind("'")]
        else:
            post_image_url = NO_IMAGE
    except Exception:
        post_image_url = ''

    google_final_postings.append((post_title, post_url, post_price, post_image_url, post_rating, post_rating_quantity, post_website, post_second_website, post_second_url, post_second_image_url, post_parse_page, post_sort_by))







def google_initiate(request):
    """Fetch the Google Shopping results page and return its product elements.

    A ``SearchForm`` is built from ``request.POST``. When the form is valid,
    the hard-coded shopping search URL is fetched with ``HTMLSession``, the
    elements matching ``.sh-dgr__gr-auto.sh-dgr__grid-result`` are looked up,
    printed, and returned.

    Args:
        request: The incoming request; only ``request.POST`` is read.

    Returns:
        The result of ``response.html.find(...)`` for the product-grid
        selector, or an empty list when the form is not valid.
    """
    form = SearchForm(request.POST or None)
    if not form.is_valid():
        # ``url`` used to be bound only inside the ``if``, so an invalid form
        # ended in UnboundLocalError after a session had already been opened.
        return []

    url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'

    # The context managers close the response first and then the session,
    # even when the request or the element lookup raises.
    with HTMLSession() as session, session.get(url) as response:
        # The selector must be a single string literal; it was previously
        # broken across two lines, which is a syntax error.
        google_parsed = response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')
        print(google_parsed)
    return google_parsed
1

There is 1 answer

5
kwiknik On

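Assuming the content in question is dynamically injected by JavaScript, you need to call response.html.render() before searching for the element. render() reloads the response in a headless Chromium, executes the page's scripts, and replaces the parsed HTML with the rendered result.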

def google_initiate(request):
    """Fetch and render the Google Shopping page, then return its product elements.

    A ``SearchForm`` is built from ``request.POST``. When the form is valid,
    the hard-coded shopping search URL is fetched with ``HTMLSession``,
    ``response.html.render()`` is called before any lookup, and the elements
    matching ``.sh-dgr__gr-auto.sh-dgr__grid-result`` are returned.

    Args:
        request: The incoming request; only ``request.POST`` is read.

    Returns:
        The result of ``response.html.find(...)`` for the product-grid
        selector, or an empty list when the form is not valid.
    """
    form = SearchForm(request.POST or None)
    if not form.is_valid():
        # ``url`` used to be bound only inside the ``if``, so an invalid form
        # ended in UnboundLocalError after a session had already been opened.
        return []

    url = 'https://www.google.com/search?biw=1866&bih=1043&tbm=shop&q=desk&tbs=mr:1,price:1,ppr_min:,ppr_max:,avg_rating:None'

    # The context managers close the response first and then the session,
    # even when render() or the element lookup raises.
    with HTMLSession() as session, session.get(url) as response:
        # Render before searching so that script-injected elements are
        # present in response.html.
        response.html.render()
        print(response.html)
        return response.html.find('.sh-dgr__gr-auto.sh-dgr__grid-result')

See example in official docs.