HTMLUnit button.click() doesn't return a new HtmlPage object

73 views Asked by At

I am and HTMLUnit newbie and am trying to scrape this website for DATACENTER. I am able to get to the first page and scrape the necessary information. Within the page there is an array of buttons that allow you to paginate to get the next page of information. I can successfully find the button, but then executing the button.click() method does not return a new page of data. It is the same data. Below is the test code that I am using. I did note that when I manually paginate that the URL remains the same (https://www.datacenters.com/locations) and the data changes within the page, so there is something else going on in the page that I am not aware of or don't understand. I have searched through other stackoverflow answers for a similar issue but none I have tried seem to work.

TIA for any and all help.

package com.mycompany.datacenter.app;

import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.html.*;
import org.w3c.dom.Node;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class App {


    public static void main(String[] args) throws IOException {
        // get the locations page
        String baseURL = "https://www.datacenters.com";
        String locURL = baseURL + "/locations";

        WebClient client = new WebClient();
        client.getOptions().setTimeout(60000);
        client.getOptions().setRedirectEnabled(true);
        client.getOptions().setJavaScriptEnabled(true);
        client.getOptions().setThrowExceptionOnFailingStatusCode(false);
        client.getOptions().setThrowExceptionOnScriptError(false);
        client.getOptions().setCssEnabled(true);
        client.getOptions().setUseInsecureSSL(true);
        client.setAjaxController(new NicelyResynchronizingAjaxController());

        HtmlPage page = client.getPage(locURL);

        Map<String, DCInfo> dcInfoList = new HashMap<>();
        do {
            // get all the location urls on this page
            DomNodeList<DomNode> list = page.querySelectorAll("a[class^='LocationTile__anchor']");
            for (DomNode node : list) {
                DCInfo dcInfo = new DCInfo();

                // get owner text
                DomNode ownerNode = node.querySelector("div[class^='LocationTile__provider']");
                dcInfo.setDcOwner(ownerNode.getFirstChild().getNodeValue());

                // get location name
                DomNode locationNode = node.querySelector("div[class^='LocationTile__name']");
                dcInfo.setDcName(locationNode.getFirstChild().getNodeValue());

                // get location address
                DomNode locAddressNode = node.querySelector("div[class^='LocationTile__address']");
                dcInfo.setDcAddress(locAddressNode.getFirstChild().getNodeValue());

                // get location specific URL
                Node hRef = node.getAttributes().getNamedItem("href");
                dcInfo.setDcLocationLink(baseURL + hRef.getNodeValue());
                dcInfoList.put(baseURL + '\\' + hRef.getNodeValue(), dcInfo);
                System.out.println(dcInfo);
            }

            // do the pagination
            HtmlButton button = page.querySelector("nav button[class^='Control__control']:last-child");
            HtmlPage newPage = button.click();
            client.waitForBackgroundJavaScript(10000);
            page = newPage;
        } while (page != null);

        client.close();
    }
}
0

There are 0 answers