next steps in ASPX scraping

44 views Asked by At

I am trying to get data from this website, where I pass in a license ID, go to "details", then "view profile" to get to a profile page. (for example, I need the ID 66244 to get to this profile)

I watched a short video about scraping ASPX pages with httpx, and came up with this messy bit of code: (this repl)

import httpx
from selectolax.parser import HTMLParser


def getLicense(response, number):
  """Yield the form payload for a license-number search postback.

  The hidden ``__VIEWSTATE`` / ``__VIEWSTATEGENERATOR`` values are read from
  the page in ``response`` and echoed back in the payload together with the
  search fields of the lookup form.

  Args:
    response: Response of a GET on the lookup page; only ``response.text``
      is read.
    number: License number to search for. It is converted with ``str()``,
      so both ``66244`` and ``"66244"`` are accepted.

  Yields:
    A single dict of form fields, ready to be passed as ``data=`` to a POST.
  """
  html = HTMLParser(response.text)
  viewstate = html.css_first("input#__VIEWSTATE").attributes['value']
  generator = html.css_first("input#__VIEWSTATEGENERATOR").attributes['value']

  # Shared name prefixes of the lookup control and of its search fields.
  lookup = "ctl00$MainContentPlaceHolder$ucLicenseLookup$"
  search = lookup + "ctl03$"

  formdata = {
      "ctl00$ScriptManager1": lookup + "UpdtPanelGridLookup",
      "__EVENTTARGET": lookup + "UpdtPanelGridLookup",
      "__EVENTARGUMENT": 1,
      "__VIEWSTATE": viewstate,
      "__VIEWSTATEGENERATOR": generator,
      search + "ddCredPrefix": "",
      # Previously hard-coded to "66244", which silently ignored `number`.
      search + "tbLicenseNumber": str(number),
      search + "ddSubCategory": "",
      search + "tbFirstName_Contact": "",
      search + "tbLastName_Contact": "",
      search + "tbDBA_Contact": "",
      search + "tbMaidenName_Contact": "",
      search + "tbCity_ContactAddress": "",
      search + "ddStates": "",
      search + "tbZipCode_ContactAddress": "",
      lookup + "ResizeLicDetailPopupID_ClientState": "0,0",
      "ctl00$OutsidePlaceHolder$ucLicenseDetailPopup$ResizeLicDetailPopupID_ClientState": "0,0",
      "__ASYNCPOST": "false",
  }

  yield formdata




def main():
  """Fetch the lookup page, post a search for license 66244, save the reply.

  The response body of the search POST is written to ``license_data.html``
  in the current working directory, replacing any existing file.
  """
  # No hard-coded "Cookie" header here: a session id copied from a browser
  # goes stale and does not belong to the session that issued the
  # __VIEWSTATE fetched below. The client keeps the cookies set by the GET
  # and sends them with the POST on its own.
  header = {
      "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  }
  url = "https://apps2.colorado.gov/dora/licensing/lookup/"

  # The context manager closes the connection pool even if a request fails.
  with httpx.Client() as client:
    data = client.get(url + "licenselookup.aspx")

    for formdata in getLicense(data, 66244):
      licenseData = client.post(url + "licenselookup.aspx", data=formdata, headers=header)

      # Mode 'w' already truncates, so no seek is needed; an explicit
      # encoding avoids platform-dependent encode errors.
      with open('license_data.html', 'w', encoding='utf-8') as file:
        file.write(licenseData.text)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
  main()

It seems to do the job, and I get a response similar to what I see in my browser. The only problem is that I don't know where to go with the data it returns. I see that the response includes this bit of code:

<a id="ctl00_MainContentPlaceHolder_ucLicenseLookup_gvSearchResults_ctl03_HyperLinkDetail" class="btn btn-primary btn-xs" href="javascript:DisplayLicenceDetail(&#39;1361971;1460049;0;Michael John Abadier;133644907;0&#39;)">Detail</a>

and I need to go to that page, then get a link to the profile from that page. I do not know what I am doing, so it would be very nice if I could get some help.

Thanks!

1

There is 1 answer

0
Andrej Kesely On

Here is an example of how you can get the data from this site using requests/BeautifulSoup:

import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# The search form is fetched from (GET) and posted back to (POST) the same page.
url = "https://apps2.colorado.gov/dora/licensing/lookup/licenselookup.aspx"
post_url = url
detail_url = "https://apps2.colorado.gov/dora/licensing/Lookup/licensedetail.aspx"

# Extracts the single-quoted argument of a "javascript:...('...')" href.
quoted_arg = re.compile(r"'([^']+)'")

with requests.session() as s:
    s.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
        }
    )

    page = s.get(url)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Start from every named <input> on the page so the hidden state fields
    # are echoed back unchanged, then override the search fields below.
    data = {inp["name"]: inp.get("value", "") for inp in soup.select("input[name]")}

    data["ctl00$MainContentPlaceHolder$ucLicenseLookup$ctl03$tbLicenseNumber"] = 66244
    data[
        "ctl00$ScriptManager1"
    ] = "ctl00$MainContentPlaceHolder$ucLicenseLookup$UpdtPanelGridLookup|ctl00$MainContentPlaceHolder$ucLicenseLookup$UpdtPanelGridLookup"
    data[
        "__EVENTTARGET"
    ] = "ctl00$MainContentPlaceHolder$ucLicenseLookup$UpdtPanelGridLookup"
    data["__EVENTARGUMENT"] = 1
    data["__ASYNCPOST"] = "true"
    data[
        "ctl00$MainContentPlaceHolder$ucLicenseLookup$ResizeLicDetailPopupID_ClientState"
    ] = "0,0"
    data[
        "ctl00$OutsidePlaceHolder$ucLicenseDetailPopup$ResizeLicDetailPopupID_ClientState"
    ] = "0,0"

    # <select> elements are not matched by "input[name]", so add them by hand.
    data["ctl00$MainContentPlaceHolder$ucLicenseLookup$ctl03$ddCredPrefix"] = ""
    data["ctl00$MainContentPlaceHolder$ucLicenseLookup$ctl03$ddSubCategory"] = ""
    data["ctl00$MainContentPlaceHolder$ucLicenseLookup$ctl03$ddStates"] = ""

    # Button fields picked up by the generic scrape must not be submitted.
    # pop() instead of del, so a button missing from the form is not an error.
    for button in (
        "ctl00$MainContentPlaceHolder$ucLicenseLookup$btnLookup",
        "ctl00$MainContentPlaceHolder$ucLicenseLookup$btnShowPopup",
        "ctl00$OutsidePlaceHolder$ucLicenseDetailPopup$btnShowPopup",
    ):
        data.pop(button, None)

    headers = {"X-MicrosoftAjax": "Delta=true", "X-Requested-With": "XMLHttpRequest"}

    result = s.post(post_url, data=data, headers=headers)
    result.raise_for_status()
    soup = BeautifulSoup(result.text, "html.parser")

    # Collect the id string from each "javascript:" link in the first table
    # of the response. No table, or an href without a quoted argument,
    # simply contributes no link.
    links = []
    results_table = soup.table
    if results_table is not None:
        for a in results_table.select('a[href*="javascript:"]'):
            match = quoted_arg.search(a["href"])
            if match:
                links.append(match.group(1))

    for link in links:
        detail = s.get(detail_url, params={"id": link})
        detail.raise_for_status()
        soup = BeautifulSoup(detail.content, "html.parser")
        # Index 1 selects the second table parsed from the detail page.
        df = pd.read_html(StringIO(str(soup)))[1]
        print(df)

Prints:

  License Number License Method License Type License Status Original Issue Date Effective Date Expiration Date
0     DR.0066244       Original    Physician         Active          03/23/2021     05/01/2023      04/30/2025

  License Number License Method      License Type License Status Original Issue Date Effective Date Expiration Date  Primary State of Residence Nurse Compact Designation
0     RN.0066244    Endorsement  Registered Nurse        Expired          12/01/1979     10/01/2002      09/30/2004                         NaN              Single State