currently I am developing fasttext go binding, I am stacked with the problem while training on the same file
from cgo code:
go test -run ^TestTrain$ -v
=== RUN TestTrain
Read 1M words
Number of words: 14339
Number of labels: 128
from fasttext cli tool:
$ fasttext supervised -input test_data/train -output test_data/context -lr 0.1 -epoch 10 -wordNgrams 2 -thread 10
Read 1M words
Number of words: 49914
Number of labels: 128
my cgo realization look like this:
extern "C" {
fasttext::FastText ft_model;
bool ft_initialized = false;
int ft_train(const char* model_name, const char* input, const char* output, int epoch, int word_ngrams, int thread, float lr)
{
fasttext::Args args_object;
if (strcmp(model_name, "supervised") == 0) {
args_object.model = fasttext::model_name::sup;
} else if (strcmp(model_name, "cbow") == 0) {
args_object.model = fasttext::model_name::cbow;
} else if (strcmp(model_name, "skipgram") == 0) {
args_object.model = fasttext::model_name::sg;
} else {
return -1;
}
args_object.input = input;
args_object.output = output;
args_object.epoch = epoch;
args_object.wordNgrams = word_ngrams;
args_object.thread = thread;
args_object.lr = lr;
ft_model.train(args_object);
ft_initialized = true;
return 0;
}
}
so as I can research fasttext has two train fuctions, one called from macro and another is method of fasttext class, but they are pretty similar to me as I'm not a professional as c++, can anyone provide some help?
UPD: I use function from macro which create fasttext instance and call the train method, like this:
int train(const char* model_name, const char* input, const char* output, int epoch, int word_ngrams, int thread, float lr) {
const std::vector<std::string> args = {
"fasttext",
std::string(model_name),
"-input",
std::string(input),
"-output",
std::string(output),
"-epoch",
std::to_string(epoch),
"-wordNgrams",
std::to_string(word_ngrams),
"-thread",
std::to_string(thread),
"-lr",
std::to_string(lr)
};
fasttext::Args a = fasttext::Args();
a.parseArgs(args);
std::shared_ptr<fasttext::FastText> fasttext = std::make_shared<fasttext::FastText>();
std::string outputFileName;
if (a.hasAutotune() &&
a.getAutotuneModelSize() != fasttext::Args::kUnlimitedModelSize) {
outputFileName = a.output + ".ftz";
} else {
outputFileName = a.output + ".bin";
}
std::ofstream ofs(outputFileName);
if (!ofs.is_open()) {
throw std::invalid_argument(
outputFileName + " cannot be opened for saving.");
}
ofs.close();
if (a.hasAutotune()) {
fasttext::Autotune autotune(fasttext);
autotune.train(a);
} else {
fasttext->train(a);
}
fasttext->saveModel(outputFileName);
fasttext->saveVectors(a.output + ".vec");
if (a.saveOutput) {
fasttext->saveOutput(a.output + ".output");
}
return 0;
}
It solved my problem, but I still do not understand what is the difference between those two approaches?