I have been writing a web scraping API on top of Node.js, using Cheerio, node-fetch and fs-extra. In the following piece of code, I call the getReport method for each string in the config.supportedMountains array. Each item is run through the fetchAndStore function, which makes the HTML request, runs the response through the mountain-specific parser, and then stores the JSON result.
// const fs = require('fs-extra');
const _ = require('lodash');
// const Promise = require('promise');
const schedule = require('node-schedule');
const fetchAndStore = require('./fetchAndStore.js');
const config = require('../config.js');

exports.run = function() {
  schedule.scheduleJob('*/20 * * * * *', function() {
    // Get the most recent reports
    // And write them to storage
    _.forEach(config.supportedMountains, function(fName) {
      getReport(fName);
    });
  });
};

/**
 * Gets the lift statuses for every mountain
 * @param {string} fName the file name of the mountain
 * @return {promise} the promise resolved when the file is written
 */
function getReport(fName) {
  return fetchAndStore.run(fName);
}
Here you can see the fetchAndStore file. It takes the fName and requires the corresponding staticData file, which contains the URL used to fetch the page. The HTML request is made and the response is run through the parser. The resulting parsed JSON then goes through a few steps to be stored. The final output should be two files: one stores the report and the other stores the snow history. Most of the logic around the fs.outputJson calls is there to deal with missing files.
const fs = require('fs-extra');
const fetch = require('node-fetch');

exports.run = function(fName) {
  // Get the staticJson
  let staticJson = require(`../staticData/mountains/${fName}.json`);
  // console.log(staticJson.id)
  // Output the report
  return fetch(staticJson.urls.reportFetchUrl).then(function(res) {
    return res.text();
  }).then(function(html) {
    // Run the html through the parser
    let parser = require(`../scrapers/${staticJson.sName}.js`);
    parsed = parser.run(html);
    // Output the report
    return fs.outputJson(
      `data/reports/${staticJson.id}.json`,
      parsed.report
    ).then(function() {
      // console.log(parsed.report.lifts[0].name);
      // Once output is completed
      if (parsed.snowHistory) {
        // If snow history is defined
        // Read the old file
        return fs.readJson(
          `data/snowHistory/${staticJson.id}.json`
        ).then(function(oldJson) {
          // If the date of the old json is todays date
          if (oldJson[0].date === parsed.snowHistory.date) {
            // Replace the first element in array
            oldJson[0] = parsed.snowHistory;
            return fs.outputJson(
              `data/snowHistory/${staticJson.id}.json`,
              oldJson
            );
          } else {
            // If this is a fresh entry
            oldJson.unshift(parsed.snowHistory);
            return fs.outputJson(
              `data/snowHistory/${staticJson.id}.json`,
              oldJson
            );
          }
        }).catch(function(e) {
          // If the old file cannot be read
          if (e.code === 'ENOENT') {
            // If the file does not exist
            // Write brand new file
            return fs.outputJson(
              `data/snowHistory/${staticJson.id}.json`,
              [parsed.snowHistory]
            );
          }
        });
      }
    });
  });
};
For some reason, about a quarter of the time the scraper runs, the data from one execution of fetchAndStore gets mixed up with the data from another execution, meaning the wrong data gets written to the file system. How is this possible? I figured that since I make the calls to fetchAndStore.run() separately, the data would not be able to get mixed up. Any idea why this is happening?
The first thing I see is that parsed is globally scoped. Could that be the issue? As an aside, if you're nesting that deeply, you should really consider breaking this up into a couple of functions.
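To see why that could produce exactly the mixing you describe: parsed = parser.run(html) is an assignment without const or let, so (outside strict mode) it writes to a single shared global. Every in-flight call to fetchAndStore.run() then reads and writes the same variable across its asynchronous steps. Here is a minimal sketch of the effect; fakeFetch is a made-up stand-in for the real fetch and parser, not part of your code:

// Sketch only: fakeFetch simulates a request with random latency
function fakeFetch(name) {
  return new Promise(function(resolve) {
    setTimeout(function() {
      resolve('<html>' + name + '</html>');
    }, Math.random() * 100);
  });
}

function run(name) {
  return fakeFetch(name).then(function(html) {
    parsed = {mountain: name, html: html}; // no const/let: assigns a shared global
    return fakeFetch(name); // another async hop, like fs.outputJson in your code
  }).then(function() {
    // A concurrent run() may have reassigned `parsed` while this one was waiting
    console.log(name, 'ended up writing data for', parsed.mountain);
  });
}

run('alpha');
run('bravo'); // intermittently logs "alpha ended up writing data for bravo"

The fix is to declare the variable locally, e.g. const parsed = parser.run(html);, so each call gets its own copy. Adding 'use strict'; at the top of the module would also have surfaced this immediately, since assigning to an undeclared variable throws a ReferenceError in strict mode.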