Loading 30,000 URLs with PHP curl_multi_init returning empty string

184 views Asked by At

I'm trying to load around 30,000 URLs in PHP. To complete this task as quickly as possible I'm trying to use curl_multi_init(). However, it appears to be loading all 30,000 at once, whereas my understanding was that it would process 10 at a time unless otherwise specified by CURLMOPT_MAXCONNECTS.

I believe it's trying to load all 30,000 at once because the code runs for about 8 seconds (the timeout set below) and then returns empty content for most of the URLs, as if the requests failed.

The code runs as expected for a smaller number of domains, e.g. under 100.

How can I ensure it only processes 10 requests at a time?

    // One multi handle drives every transfer in this script.
    $mh = curl_multi_init();

    // Options shared by every transfer, applied in the order listed.
    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 8,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_HEADER         => false,
        CURLOPT_FOLLOWLOCATION => TRUE,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    ];

    // Create one easy handle per URL and attach every one of them to the
    // multi handle before any transfer is started.
    $requests = [];
    foreach ($urls as $i => $url) {
        $handle = curl_init($url);
        foreach ($options as $option => $value) {
            curl_setopt($handle, $option, $value);
        }
        curl_multi_add_handle($mh, $handle);
        $requests[$i] = $handle;
    }

    // Keep calling curl_multi_exec() until it reports no active transfers.
    $active = null;
    while (true) {
        curl_multi_exec($mh, $active);
        if (!$active) {
            break;
        }
    }

    // Collect each body in the same order as $requests, then detach and
    // close the corresponding easy handle.
    $responses = [];
    foreach ($requests as $handle) {
        $body = curl_multi_getcontent($handle);
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
        $responses[] = $body;
    }
1

There is 1 answer

0
SlamJammington On BEST ANSWER

Give this a try. It splits $urls into 100-element arrays and sends a multi request for each group of 100.

// Fetch $urls in batches so that only a bounded number of transfers are
// attached to a multi handle at any one time.
//
// Input:  $urls      - array of URL strings to fetch.
// Output: $responses - list of response bodies appended in the same order
//                      as $urls (whatever curl_multi_getcontent() returns
//                      for each handle, including failed transfers).

// Number of URLs fetched concurrently per batch.
$chunkSize = 100;
$chunks = array_chunk($urls, $chunkSize);

// Accumulates results across ALL batches, so it is initialised once here
// rather than inside the loop (where each batch would wipe the previous one).
$responses = [];

foreach ($chunks as $chunk) {
    $mh = curl_multi_init();
    $requests = [];

    // Attach only this batch's URLs; iterating the full $urls list here
    // would start every transfer at once and repeat them for each batch.
    foreach ($chunk as $i => $url) {
        $requests[$i] = curl_init($url);
        curl_setopt($requests[$i], CURLOPT_RETURNTRANSFER, true);
        curl_setopt($requests[$i], CURLOPT_TIMEOUT, 8);
        curl_setopt($requests[$i], CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($requests[$i], CURLOPT_SSL_VERIFYHOST, false);
        curl_setopt($requests[$i], CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($requests[$i], CURLOPT_HEADER, false);
        curl_setopt($requests[$i], CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($requests[$i], CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36');
        curl_multi_add_handle($mh, $requests[$i]);
    }

    $active = null;

    do {
        $status = curl_multi_exec($mh, $active);
        if ($active) {
            // Wait for socket activity (up to 1 second) instead of
            // spinning on curl_multi_exec() in a tight loop.
            curl_multi_select($mh, 1.0);
        }
        // Stop early if the multi interface itself reports an error.
    } while ($active && $status === CURLM_OK);

    foreach ($requests as $request) {
        $responses[] = curl_multi_getcontent($request);
        curl_multi_remove_handle($mh, $request);
        curl_close($request);
    }

    // Release this batch's multi handle before starting the next batch.
    curl_multi_close($mh);
}