I recently tried to write a scraper capable of downloading 3.5 million
torrent files based on their magnet URL. I decided to start by hacking
an example from libtorrent's tutorial webpage, but while it works well
with just one torrent file, it fails segfaults in create_torrent()
when
I try to download more than one file. Here's my code:
#include <thread>
#include <chrono>
#include <fstream>
#include <sstream>
#include <string>
#include <libtorrent/session.hpp>
#include <libtorrent/add_torrent_params.hpp>
#include <libtorrent/create_torrent.hpp>
#include <libtorrent/torrent_handle.hpp>
#include <libtorrent/alert_types.hpp>
#include <libtorrent/bencode.hpp>
#include <libtorrent/torrent_status.hpp>
#include <libtorrent/torrent_info.hpp>
namespace lt = libtorrent;
using clk = std::chrono::steady_clock;
int torrents_left = 0;
int save_file(std::string const& filename, std::vector<char>& v)
{
FILE* f = std::fopen(filename.c_str(), "wb");
if (f == nullptr)
return -1;
int w = int(std::fwrite(&v[0], 1, v.size(), f));
std::fclose(f);
if (w < 0) return -1;
if (w != int(v.size())) return -3;
return 0;
}
void add_torrent_url(std::string url, lt::session& ses) {
// std::cerr << "DEBUG: Will download '" << url << "'" << std::endl;
lt::add_torrent_params atp;
atp.url = url;
atp.save_path = "."; // save in current dir
ses.async_add_torrent(atp);
torrents_left++;
}
void add_torrents_from_stdin(lt::session& ses) {
std::cerr << "DEBUG: reading stdin." << std::endl;
std::string url;
while(std::getline(std::cin, url)) {
add_torrent_url(url, ses);
}
std::cerr << "DEBUG: done reading stdin." << std::endl;
}
int main(int argc, char const* argv[])
{
lt::settings_pack pack;
pack.set_int(lt::settings_pack::alert_mask
, lt::alert::error_notification
| lt::alert::storage_notification
| lt::alert::status_notification);
lt::session ses(pack);
lt::add_torrent_params atp;
//add_torrent_url(argv[1]);
add_torrent_url("magnet:?xt=urn:btih:3E37CFE29B1049E03F858758A73EFD85BA170BE8", ses);
add_torrent_url("magnet:?xt=urn:btih:8FCDE178E3F9A24EA40856826C4E8A625A931B73", ses);
//add_torrents_from_stdin(ses);
// this is the handle we'll set once we get the notification of it being
// added
lt::torrent_handle h;
for (;;) {
std::vector<lt::alert*> alerts;
ses.pop_alerts(&alerts);
for (lt::alert const* a : alerts) {
if (auto at = lt::alert_cast<lt::add_torrent_alert>(a)) {
h = at->handle;
}
// if we receive the finished alert or an error, we're done
if (lt::alert_cast<lt::torrent_finished_alert>(a)) {
std::cout << "torrent finished or error." << std::endl;
goto done;
}
if (lt::alert_cast<lt::torrent_error_alert>(a)) {
std::cout << a->message() << std::endl;
goto done;
}
if (auto st = lt::alert_cast<lt::state_update_alert>(a)) {
if (st->status.empty()) continue;
// we only have a single torrent, so we know which one
// the status is for
lt::torrent_status const& s = st->status[0];
if (s.state == lt::torrent_status::downloading)
{
std::cout << "Hi!" << std::endl;
std::shared_ptr<const lt::torrent_info> ti = h.torrent_file();
if (ti == 0) {
std::cerr << "ERROR: ti == NULL" << std::endl;
goto done;
}
ses.remove_torrent(h, lt::session::delete_files);
lt::create_torrent new_torrent(*ti);
std::vector<char> out;
lt::bencode(std::back_inserter(out), new_torrent.generate());
std::stringstream ss;
ss << "downloaded/" << (*ti).info_hash() << ".torrent";
save_file(ss.str(), out);
h.pause();
torrents_left--;
std::cerr << "DEBUG: Done (" << torrents_left << " left): " << (*ti).info_hash() << std::endl;
if (torrents_left == 0)
goto done;
}
}
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
// ask the session to post a state_update_alert, to update our
// state output for the torrent
ses.post_torrent_updates();
}
done:
{}
}
I suspect it's related to this part:
// we only have a single torrent, so we know which one
// the status is for
lt::torrent_status const& s = st->status[0];
But according to my debugger, when torrent_file()
gives NULL
, st->status[]
only contains one element anyway.
What's happening here? How do I fix it?
It looks like I made wrong assumptions about what "h" points to in the example. Here's a diff that fixes the code in question: