Using Request.js and Cheerio.js in Node/Express return empty array

1k views Asked by At

I'm building a simple scraper using Request.js and Cheerio.js within Express. Right now I'm only looking for the title of each site. Instead of scraping the websites one by one, I put the list of URLs in an array. I loop through them and then use Cheerio.js to find the title of each website. When I console.log the titles, they come out fine, but the array my function returns is empty, and I ultimately want to show the titles on an HTML page. Please note, I'm very new to programming, so if you could provide detailed feedback, that would be incredibly helpful (below is the code I've been working on). Thanks in advance!

// Intended to collect the <title> text of every URL in `urls` and return the titles as an array.
function parseSites(urls) {
var parsedSites = [];
urls.forEach(function(site) {
        // The function passed to `request` receives (err, res, body) for this site.
        request(site, function(err, res, body) {
            if(err) {
                console.log(err);
            } else {
                var $ = cheerio.load(body);
                parsedSites.push($('title').text());
                }
            // NOTE(review): the brace on the next line has no matching opening brace.
            }    
        });             
    });
    // The array is returned right after the loop, independently of whether the
    // callbacks above have pushed their titles yet.
    return parsedSites;
}
2

There are 2 answers

1
charly rl On

First you need to understand the difference between asynchronous and synchronous code. Let's look at an example:

/**
 * Prints the integers 0 through 4 to the console, one per call to console.log.
 */
function testFor() {
    const limit = 5;
    let counter = 0;
    while (counter < limit) {
        console.log(counter);
        counter += 1;
    }
}

-

// Synchronous case: testFor() runs to completion before the next statement,
// so all of its numbers are printed between the two markers.
console.log('start:');
testFor();
console.log('end:');

// Here you get the expected output because this code is synchronous.
//output:
    start:
    0
    1
    2
    3
    4
    end:

-

// Asynchronous case: setTimeout only schedules testFor to run after a delay of
// 1000 ms, so 'end:' is logged before any of the numbers.
console.log('start:');
setTimeout(testFor,1000);
console.log('end:');

// Here you don't get the output you might expect, because setTimeout is asynchronous.
//output:
    start:
    end:
    0
    1
    2
    3
    4
  1. First, console.log('start:'); is called.
  2. Then setTimeout(testFor,1000); is called. It is asynchronous: it only schedules testFor to run 1 second later and returns immediately.
  3. Immediately afterwards, console.log('end:'); is called.
  4. Finally, 1 second later, testFor() is executed and prints 0 1 2 3 4.

The next point is that there is an error in your code!

// The question's function with its brace mismatch repaired.
// It still returns `parsedSites` immediately after starting the requests.
function parseSites(urls) {
    var parsedSites = [];
    urls.forEach(function(site) {
        // The callback is the only place where titles are added to parsedSites.
        request(site, function(err, res, body) {
            if(err) {
                console.log(err);
            } else {
                var $ = cheerio.load(body);
                parsedSites.push($('title').text());
            }
        //} <-- the question's code had this extra closing brace here; it must be removed
        });
    });
    return parsedSites;
}

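So your problem is that 'request' in the forEach loop is an asynchronous function: it calls the callback 'function(err, res, body)' only once there is a response from the web page. The forEach loop merely starts the requests, so 'return parsedSites' runs before any callback has pushed a title, and the caller receives an empty array.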

My solutions for this:

'use strict'

const cheerio = require('cheerio');
const request = require('request');
const async = require('async');

const urls = ['http://stackoverflow.com/','http://hackaday.com/','https://www.raspberrypi.org/','https://cheerio.js.org/'];

//SOLUTION 1: do what you need to do when all calls are done using recursion
// NOTE: this solution and the second one both declare functions named parseSites
// and finalCall, so keep only one of the two solutions in a file.
// Index into `urls` of the next site to request; parseSites advances it.
let i=0;
// Accumulator for the page titles; it is passed along through every parseSites call.
let parsedSites = [];
// Start the chain with the first URL.
parseSites(urls[i],parsedSites);
/**
 * Completion handler: called by parseSites once no URLs remain to be requested.
 *
 * @param {string[]} sites - Page titles collected so far.
 */
function finalCall(sites) {
    const collectedTitles = sites;
    console.log(collectedTitles);
}
/**
 * Requests one page, records its <title>, then continues with the next URL.
 *
 * The sites are processed one at a time: the module-level counter `i` holds the
 * index into the module-level `urls` list of the next site to request. When no
 * URLs remain, `finalCall` is invoked with the collected titles.
 *
 * @param {string} site - URL to request now.
 * @param {string[]} parsedSites - Accumulator that receives each page title.
 * @returns {void} Nothing is returned; the titles are delivered through
 *     `finalCall` because they only become available inside the callbacks.
 */
function parseSites(site,parsedSites) {
    // Nothing to request (e.g. `urls` is empty, so urls[0] is undefined):
    // report what has been collected instead of passing `request` an undefined URL.
    if (site == null) {
        finalCall(parsedSites);
        return;
    }
    // Advance first, so the callback below sees the index of the *next* URL.
    ++i;
    request(site, function(err, res, body) {
        if(err) {
            // A failed site is logged and skipped; the chain keeps going.
            console.log(err);
        } else {
            const $ = cheerio.load(body);
            const title = $('title').text();
            console.log(title);
            parsedSites.push(title);
        }
        if(i<urls.length){
            parseSites(urls[i],parsedSites);// recursive call;
        }
        else{
            finalCall(parsedSites);// when all sites are done.
        }
    });
    // No `return parsedSites` here: the array is only filled inside the callbacks.
}


//SOLUTION 2: do what you need to do when all calls are done using 'async'
// NOTE: this is an alternative to the first solution, not an addition to it: both
// declare functions named parseSites and finalCall, so keep only one in a file.
parseSites(urls);
/**
 * Completion handler: receives the page titles once all requests have succeeded.
 *
 * @param {string[]} sites - Page titles gathered by parseSites.
 */
function finalCall(sites) {
    const gatheredTitles = sites;
    console.log(gatheredTitles);
}
/**
 * Requests every URL and hands the list of page titles to `finalCall`.
 *
 * @param {string[]} urls - Pages to fetch.
 * @returns {void} Nothing is returned; on success the titles are passed to
 *     `finalCall`, and if any request fails the error is logged instead.
 */
function parseSites(urls) {
    // async.map stores each result at the position of its input, so the titles
    // line up with `urls` no matter which response arrives first.
    async.map(urls, function parseSite(site, callback) {
        request(site, function (err, res, body) {
            if (err) {
                callback(err);
                return;
            }
            const $ = cheerio.load(body);
            callback(null, $('title').text());
        });
    }, function (err, parsedSites) {
        if (err) console.log(err);
        else finalCall(parsedSites);
    });
}

Async github page

Async example

1
Balaji V On

Please refer to the code below for a working implementation:

var request = require('request-promise')
    var cheerio = require("cheerio")

    /**
     * Downloads every URL and passes the list of page titles to `callback`.
     *
     * @param {string[]} urls - Pages to fetch; each one is handed to `getPage`.
     * @param {function(string[]): void} callback - Called once with the titles
     *     (in the same order as `urls`) after all downloads have succeeded.
     * @returns {Array} Always an empty array, kept only so existing callers
     *     keep working; the titles are delivered through `callback`.
     */
    function parseSites(urls, callback) {
        var promiseList = urls.map(getPage)

        Promise.all(promiseList).then(function (data) {
            callback(data.map(parse))
        }).catch(function (err) {
            // Without this handler a failed download would surface as an
            // unhandled promise rejection.
            console.error('parseSites failed:', err)
        })

        return [];
    }

    /**
     * Starts a GET request for the given address.
     *
     * @param {string} url - Address of the page to download.
     * @returns {*} Whatever `request.get` returns for that address
     *     (presumably a promise for the page body; confirm against the library in use).
     */
    function getPage(url) {
        var pendingPage = request.get(url);
        return pendingPage;
    }

    /**
     * Extracts the text of the <title> element from a page.
     *
     * @param {string} body - Markup of the downloaded page.
     * @returns {string} Text content of the elements matching 'title'.
     */
    function parse(body) {
        console.log("parsing body");
        const query = cheerio.load(body);
        const titleMatch = query('title');
        return titleMatch.text();
    }

    // Example usage: fetch two pages and print their titles.
    parseSites(['https://www.google.com','https://www.facebook.com'],function(data) {
        // `data` is the array of titles that parseSites passes to this callback.
        console.log(data)
    })