I am trying to figure out the exact reason why my headless program retrieves less data than the graphical one.
You can find the repository here. To run the code you must have a TikTok account, because loading the cookies of a logged-in session into the browser gets rid of popups and makes the program easier to write.
Once cloned, run node cookieLoader.js and sign in to your TikTok account in the browser window that opens; then press Enter in the terminal to save the cookies, after which you can run the main program.
Then try this command (headless by default)
node index.js -m undertimeslopper
If the repository no longer exists or you don't want to clone it you can follow these code snippets instead. This snippet will create your tiktok cookies once you login to the site and press enter.
const readline = require('readline');
const { exit } = require("process");
const fs = require('fs');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// Apply the stealth plugin to avoid being detected as a bot
puppeteer.use(StealthPlugin());
// Opens a visible browser on tiktok.com so the user can log in by hand, then
// writes the session cookies to ./cookies.json once Enter is pressed.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  // Reuse the top-level `readline` import instead of shadowing it.
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  try {
    // Open a new page
    const page = await browser.newPage();
    // Navigate to your desired URL
    await page.goto('https://www.tiktok.com');
    // Block here until the user confirms that the login is complete.
    await new Promise((resolve) => {
      rl.question(`Press enter button to save your cookies\n`, resolve);
    });
    const cookies = await page.cookies();
    console.log(cookies);
    // writeFileSync is synchronous, so there is nothing to await.
    fs.writeFileSync('./cookies.json', JSON.stringify(cookies, null, 2));
  } finally {
    // Always release the terminal and the browser, even when a step fails.
    rl.close();
    await browser.close();
  }
  exit();
})().catch((err) => {
  // Report failures explicitly instead of leaving an unhandled rejection.
  console.error(err);
  exit(1);
});
Then you can run the actual program with this snippet.
const chalk = require("chalk");
const fs = require("fs");
const puppeteer = require("puppeteer");
const { exit } = require("process");
const path = require("path");
/**
 * Restore a saved session by loading the cookies stored in `cookies.json`
 * (located next to this script) into the given page.
 *
 * @param {object} page - Puppeteer page; only its `setCookie` method is used.
 * @returns {Promise<void>}
 * @throws {Error} If `cookies.json` cannot be read or is not valid JSON.
 * @throws {TypeError} If the file does not contain a JSON array.
 */
const loadCookie = async (page) => {
  const cookiePath = path.join(__dirname, 'cookies.json');
  // Read asynchronously so the event loop is not blocked while loading.
  const cookieJson = await fs.promises.readFile(cookiePath, 'utf8');
  const cookies = JSON.parse(cookieJson);
  // setCookie takes the cookies as separate arguments, so an array is required.
  if (!Array.isArray(cookies)) {
    throw new TypeError(`${cookiePath} must contain a JSON array of cookies`);
  }
  await page.setCookie(...cookies);
};
/**
 * Build the TikTok profile URL for a username.
 *
 * @param {string} username - Handle, with or without an "@".
 * @returns {string} The site root followed by the handle.
 */
const generateUrlProfile = (username) => {
  const siteRoot = "https://www.tiktok.com/";
  // A name that already contains "@" is appended as-is; otherwise "@" is added.
  const handle = username.includes("@") ? username : `@${username}`;
  return `${siteRoot}${handle}`;
};
/**
 * Collect the URLs of the video/photo posts linked from a TikTok profile page.
 *
 * Launches headless Chromium, loads the saved cookies, opens the profile and
 * keeps scrolling to the bottom until the page height stops growing.
 *
 * @param {string} username - TikTok handle, with or without the leading "@".
 * @returns {Promise<string[]>} De-duplicated post URLs, with "/photo/" links
 *   rewritten to "/video/".
 */
const getListVideoByUsername = async (username) => {
  const delayBeforeRefreshMs = 3000 + 500;
  const delayAfterRefreshMs = 1000;
  const delayBetweenScrollsMs = 1000;
  const scrollGrowthTimeoutMs = 10000;
  // Local timer helper: no `sleep` function is defined in the visible code.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const baseUrl = generateUrlProfile(username);
  const browser = await puppeteer.launch({
    headless: true,
  });

  try {
    const page = await browser.newPage();

    // Skip image downloads; only the anchors' href values are needed.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (request.resourceType() === 'image') request.abort();
      else request.continue();
    });

    await loadCookie(page);
    // Awaited so the user agent is in place before navigation starts.
    await page.setUserAgent(
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4182.0 Safari/537.36"
    );

    try {
      await page.goto(baseUrl);
    } catch (err) {
      // Navigation failure is fatal for the script, as before.
      console.error(err);
      exit();
    }

    await page.keyboard.press('Escape');
    await page.keyboard.press('Escape');

    // Best effort: click the "Refresh" button if the page shows one.
    try {
      await pause(delayBeforeRefreshMs);
      const refreshButtonXPath = "//button[contains(text(),'Refresh')]";
      await page.evaluate((xpath) => {
        const xpathResult = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
        const element = xpathResult.singleNodeValue;
        if (element) {
          element.click();
        }
      }, refreshButtonXPath);
      await pause(delayAfterRefreshMs);
    } catch (error) {
      // Not fatal, but no longer silent.
      console.warn(`[!] Could not click the Refresh button: ${error.message}`);
    }

    await page.keyboard.press('Escape');

    let listVideo = [];
    console.log(chalk.green("[*] Getting list video from: " + username));

    let hasMore = true;
    while (hasMore) {
      listVideo = await page.evaluate(() => {
        const hrefs = Array.from(document.querySelectorAll('a'), (anchor) => anchor.href);
        const postUrls = hrefs
          .filter((href) => href.includes('/video/') || href.includes('/photo/'))
          // Rewrite only the path segment, never a "photo" inside the username.
          .map((href) => href.replace('/photo/', '/video/'));
        // De-duplicate after rewriting so photo/video twins collapse as well.
        return [...new Set(postUrls)];
      });
      console.log(chalk.green(`[*] ${listVideo.length} video found`));

      try {
        const previousHeight = await page.evaluate(() => document.body.scrollHeight);
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        // Resolves once the page has grown, i.e. more content was appended.
        await page.waitForFunction(
          (height) => document.body.scrollHeight > height,
          { timeout: scrollGrowthTimeoutMs },
          previousHeight
        );
      } catch (err) {
        // A timeout is the normal end of the list; log anything else.
        if (err.name !== 'TimeoutError') {
          console.error(err);
        }
        console.log(chalk.red("[X] No more video found"));
        console.log(chalk.green(`[*] Total video found: ${listVideo.length}`));
        hasMore = false;
      }

      if (hasMore) {
        await pause(delayBetweenScrollsMs);
      }
    }

    return listVideo;
  } finally {
    // Close the browser on success and on failure so Chromium is not leaked.
    await browser.close();
  }
};
// Entry point: scrape one profile and report failures instead of leaving a
// floating promise whose rejection would go unhandled.
(async () => {
  try {
    await getListVideoByUsername('undertimeslopper'); // or any valid tiktok username
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  }
})();
The output on my machine is
[*] Getting list video from: undertimeslopper
[*] 35 video found
[*] 69 video found
[*] 69 video found
[X] No more video found
but after I change headless: true to headless: false in the puppeteer.launch call inside getListVideoByUsername, the output is
[*] Getting list video from: undertimeslopper
[*] 35 video found
[*] 69 video found
[*] 102 video found
[*] 137 video found
[*] 158 video found
[X] No more video found
As we can observe, the graphical program performed as intended, scraping all of the user's videos, whilst the headless one only got 69.
This is the core of the problem: I intend to run this script headlessly on a server, and if I can't get all the videos it's worthless.
You don't have to run the code to help me. Essentially I am just looking for ways to debug and see what a headless browser is doing but I included the instructions and output as supplementary information.
This issue was caused either by a stale cache in the `puppeteer` library or simply by the version of the library. Upon upgrading `puppeteer` (changing its version in the project's dependencies from the old release to a newer one),
the issue was resolved and the program ran as intended.