Multithreading takes more time than Serial execution in nodejs with addons in c++

344 views Asked by At

I have a Node.js program that uses a C++ addon. The addon implements matrix multiplication in two versions: a serial one and a parallel (multithreaded) one.

When I run the code through Node.js for matrices of size 2000 x 2000, the parallel version shows no speed-up over the serial one. But when I run the same code as a standalone C++ program, the parallel version does show a speed-up.

The code is essentially the same in both cases, which leads me to think that the problem lies with Node.js, for example in the way Node.js works with threads.

I'm using Ubuntu 16.04, Node.js v6.9.1, gcc 5.4.0 and the pthread library.

Another important detail: when I run the Node.js code on my smartphone with Termux, there is a speed-up.

Serial code in C++

//g++ serial.cc -o serial

#include <cstdlib>
#include <iostream>
#include <cmath>
#include <stdio.h>
using namespace std;

// Nanoseconds per second; divides a timespec::tv_nsec difference to turn it
// into seconds. Deliberately defined without a trailing semicolon so the
// macro can appear anywhere inside an expression, not only as the last token
// of a statement.
#define BILLION  1E9

// Input matrices, each stored as n separately allocated rows of n floats.
// Both are allocated and filled by createMatrix().
float **A;
float **B;
// Product matrix A * B, same layout as A and B; allocated by main().
float **result;
// Matrix dimension (rows == columns), taken from the command line.
int n;

// Allocates A and B as n x n matrices and fills them with random digits.
// Reads the global n, so n must be set before calling.
void createMatrix();

// Entry point of the serial benchmark.
//
// Usage: serial <n>
//
// Builds two random n x n matrices with createMatrix(), multiplies them with
// the classic triple loop and prints the elapsed time of the multiplication
// only (matrix generation and allocation are not timed).
//
// Returns 0 on success, EXIT_FAILURE when the size argument is missing or not
// a positive integer, or when the result matrix cannot be allocated.
int main(int argc, char *argv[]){

  // Without this check atoi() would be handed a null pointer.
  if (argc < 2) {
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "serial")
              << " <matrix size>" << std::endl;
    return EXIT_FAILURE;
  }

  n = atoi(argv[1]);
  // A negative n would be converted to a huge unsigned value in the
  // allocation sizes and in the unsigned loop bounds below.
  if (n <= 0) {
    std::cerr << "matrix size must be a positive integer" << std::endl;
    return EXIT_FAILURE;
  }
  createMatrix();

  result = (float**) malloc(n * sizeof(float*));
  if (result == NULL) {
    std::cerr << "out of memory allocating the result matrix" << std::endl;
    return EXIT_FAILURE;
  }
  for (int i = 0; i < n; i++) {
    result[i] = (float*) malloc(n * sizeof(float));
    if (result[i] == NULL) {
      std::cerr << "out of memory allocating the result matrix" << std::endl;
      return EXIT_FAILURE;
    }
  }
  std::cout << "calculating product..." << std::endl;
  struct timespec requestStart, requestEnd;
  // Start of the timed region. CLOCK_MONOTONIC is used because, unlike
  // CLOCK_REALTIME, it cannot jump when the system time is adjusted.
  clock_gettime(CLOCK_MONOTONIC, &requestStart);

  // The kernel is intentionally left exactly as it was so that timings stay
  // comparable with the parallel variant.
  for (unsigned int i = 0; i < n ; ++i){
    for (unsigned int j = 0; j < n ; ++j){
      result[i][j] = 0;
      for (unsigned int k = 0; k < n ; k++) {
        result[i][j]  += (A[i][k] * B[k][j]);
      }
    }
  }

  // End of the timed region.
  clock_gettime(CLOCK_MONOTONIC, &requestEnd);
  const double nanosPerSecond = 1e9;
  double accum = ( requestEnd.tv_sec - requestStart.tv_sec )
      + ( requestEnd.tv_nsec - requestStart.tv_nsec ) / nanosPerSecond;
  printf( "Serial Time taken: %lf\n", accum );

  return 0;
}

// Allocates the global input matrices A and B (n rows of n floats each) and
// fills every element with rand() % 10, i.e. a whole number from 0 to 9.
// A is generated completely before B; rand() is not seeded in this function.
void createMatrix(){

  std::cout << "creating matrix A ..." << std::endl;
  A = static_cast<float**>(malloc(n * sizeof(float*)));
  for (int row = 0; row < n; ++row) {
    float *cells = static_cast<float*>(malloc(n * sizeof(float)));
    A[row] = cells;
    for (int col = 0; col < n; ++col) {
      cells[col] = rand() % 10;
    }
  }

  std::cout << "creating matrix B ..." << std::endl;
  B = static_cast<float**>(malloc(n * sizeof(float*)));
  for (int row = 0; row < n; ++row) {
    float *cells = static_cast<float*>(malloc(n * sizeof(float)));
    B[row] = cells;
    for (int col = 0; col < n; ++col) {
      cells[col] = rand() % 10;
    }
  }
}

Parallel code in c++

//g++ -std=c++11 parallel.cc -o parallel -pthread

#include <cstdlib>
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <pthread.h>
#include <thread>
using namespace std;

// Nanoseconds per second; divides a timespec::tv_nsec difference to turn it
// into seconds. Deliberately defined without a trailing semicolon so the
// macro can appear anywhere inside an expression, not only as the last token
// of a statement.
#define BILLION  1E9

// Input matrices, each stored as n separately allocated rows of n floats.
// Both are allocated and filled by createMatrix().
float **A;
float **B;
// Product matrix A * B, same layout as A and B; allocated by main() and
// written by the runner() threads.
float **result;
// Matrix dimension (rows == columns), taken from the command line.
int n;
// Number of result rows per thread slice (n divided by the thread count,
// rounded down). Set by main() before the threads start, read by runner().
int task_per_thread;

// Allocates A and B as n x n matrices and fills them with random digits.
// Reads the global n, so n must be set before calling.
void createMatrix();

// Thread entry point: computes one horizontal band of result = A * B.
//
// pid is not a real pointer; it carries the zero-based slice index of this
// thread. Slice s owns rows [s * task_per_thread, (s + 1) * task_per_thread),
// and the last slice additionally takes the leftover rows up to n.
//
// The last slice is detected by "one more full slice would no longer fit",
// i.e. task_per_thread * (slice + 2) > n. The comparison must be <=, not <:
// with < and n an exact multiple of the thread count (e.g. n = 2000 with 4
// threads) the second-to-last slice was extended to n as well, so two threads
// computed and wrote the same rows concurrently (a data race) and one thread
// carried twice the work of the others.
//
// NOTE(review): this detection still relies on the leftover row count being
// smaller than task_per_thread, which holds whenever n >= threads * threads.
// Passing each thread its explicit row range would remove that restriction.
void *runner(void *pid) {

  const int slice = (long) pid;
  const int firstRow = task_per_thread * slice;
  const int endRow = (task_per_thread * (slice + 2) <= n)
      ? task_per_thread * (slice + 1)
      : n;
  // Kernel kept identical to the serial version so timings stay comparable.
  for (unsigned int i = firstRow; i < endRow ; ++i){
    for (unsigned int j = 0; j < n ; ++j){
      result[i][j] = 0;
      for (unsigned int k = 0; k < n ; k++) {
        result[i][j]  += (A[i][k] * B[k][j]);
      }
    }
  }
  pthread_exit(NULL);
}

// Entry point of the multithreaded benchmark.
//
// Usage: parallel <n>
//
// Builds two random n x n matrices with createMatrix(), then multiplies them
// on up to 4 pthreads, each running runner() on its own slice of rows, and
// prints the elapsed time from just before the first thread is created until
// the last one has been joined.
//
// Returns 0 on success, EXIT_FAILURE when the size argument is missing or not
// a positive integer, or when the result matrix cannot be allocated. Exits
// with status -1 if a thread cannot be created.
int main(int argc, char *argv[]){

  // Without this check atoi() would be handed a null pointer.
  if (argc < 2) {
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "parallel")
              << " <matrix size>" << std::endl;
    return EXIT_FAILURE;
  }

  n = atoi(argv[1]);
  // A negative n would be converted to a huge unsigned value in the
  // allocation sizes and in the unsigned arithmetic below.
  if (n <= 0) {
    std::cerr << "matrix size must be a positive integer" << std::endl;
    return EXIT_FAILURE;
  }
  createMatrix();

  result = (float**) malloc(n * sizeof(float*));
  if (result == nullptr) {
    std::cerr << "out of memory allocating the result matrix" << std::endl;
    return EXIT_FAILURE;
  }
  for (int i = 0; i < n; i++) {
    result[i] = (float*) malloc(n * sizeof(float));
    if (result[i] == nullptr) {
      std::cerr << "out of memory allocating the result matrix" << std::endl;
      return EXIT_FAILURE;
    }
  }

  // Upper bound on the number of worker threads; const so that the array
  // below has a compile-time size instead of being a variable-length array.
  const unsigned cpus = 4;
  //std::cout << "cpus: "<< cpus << std::endl;
  pthread_t threads[cpus];
  // Never start more threads than there are rows: with n < cpus the slice
  // size n / cpus would be 0 and no row would be computed at all.
  const unsigned workers =
      (static_cast<unsigned>(n) < cpus) ? static_cast<unsigned>(n) : cpus;
  task_per_thread = n / workers;

  std::cout << "calculating product..." << std::endl;
  struct timespec requestStart, requestEnd;
  // Start of the timed region. CLOCK_MONOTONIC is used because, unlike
  // CLOCK_REALTIME, it cannot jump when the system time is adjusted.
  clock_gettime(CLOCK_MONOTONIC, &requestStart);
  for (unsigned i = 0; i < workers; i++) {
    // The slice index travels inside the pointer value; go through long so
    // the integer has pointer width on LP64 and matches the cast in runner().
    const int rc = pthread_create(&threads[i], nullptr, runner,
                                  reinterpret_cast<void*>(static_cast<long>(i)));
    if (rc){
      cout << "Error:unable to create thread," << rc << endl;
      exit(-1);
    }
  }

  for (unsigned i = 0; i < workers; i++) {
    pthread_join(threads[i], nullptr);
  }
  // End of the timed region.
  clock_gettime(CLOCK_MONOTONIC, &requestEnd);
  const double nanosPerSecond = 1e9;
  double accum = ( requestEnd.tv_sec - requestStart.tv_sec )
      + ( requestEnd.tv_nsec - requestStart.tv_nsec ) / nanosPerSecond;
  printf( "Parallel Time taken: %lf\n", accum );

  return 0;
}

// Allocates the global input matrices A and B (n rows of n floats each) and
// fills every element with rand() % 10, i.e. a whole number from 0 to 9.
// A is generated completely before B; rand() is not seeded in this function.
void createMatrix(){

  // Builds one n x n matrix of random digits, row by row.
  auto randomMatrix = []() -> float** {
    float **rows = (float**) malloc(n * sizeof(float*));
    for (int r = 0; r < n; ++r) {
      rows[r] = (float*) malloc(n * sizeof(float));
      for (int c = 0; c < n; ++c) {
        rows[r][c] = rand() % 10;
      }
    }
    return rows;
  };

  std::cout << "creating matrix A ..." << std::endl;
  A = randomMatrix();

  std::cout << "creating matrix B ..." << std::endl;
  B = randomMatrix();
}

Code of the Node.js addon (addon.cc)

#include <cstdlib>
#include <iostream>
#include <stdio.h>
#include <pthread.h>
#include <node.h>
#include <v8.h>

using v8::Exception;
using v8::FunctionCallbackInfo;
using v8::Isolate;
using v8::Local;
using v8::Number;
using v8::Object;
using v8::String;
using v8::Value;
using v8::Array;
using v8::Integer;

using namespace std;

// Nanoseconds per second; divides a timespec::tv_nsec difference to turn it
// into seconds. Deliberately defined without a trailing semicolon so the
// macro can appear anywhere inside an expression, not only as the last token
// of a statement.
#define BILLION  1E9

// Input matrices, each stored as n separately allocated rows of n floats.
// Both are allocated and filled by createMatrix().
float **A;
float **B;
// Product matrix A * B, same layout as A and B; allocated by
// parallelProduct() / serialProduct().
float **result;
// Matrix dimension (rows == columns), taken from the JavaScript argument.
int n;
// Number of result rows per thread slice (n divided by the thread count,
// rounded down). Set by parallelProduct() before the threads start and read
// by runner().
int task_per_thread;

// Allocates A and B as n x n matrices and fills them with random digits.
// Reads the global n, so n must be set before calling.
void createMatrix();

// Thread entry point: computes one horizontal band of result = A * B.
//
// pid is not a real pointer; it carries the zero-based slice index of this
// thread. Slice s owns rows [s * task_per_thread, (s + 1) * task_per_thread),
// and the last slice additionally takes the leftover rows up to n.
//
// The last slice is detected by "one more full slice would no longer fit",
// i.e. task_per_thread * (slice + 2) > n. The comparison must be <=, not <:
// with < and n an exact multiple of the thread count (e.g. n = 2000 with 4
// threads) the second-to-last slice was extended to n as well, so two threads
// computed and wrote the same rows concurrently (a data race) and one thread
// carried twice the work of the others.
//
// NOTE(review): this detection still relies on the leftover row count being
// smaller than task_per_thread, which holds whenever n >= threads * threads.
// Passing each thread its explicit row range would remove that restriction.
void *runner(void *pid) {

  const int slice = (long) pid;
  const int firstRow = task_per_thread * slice;
  const int endRow = (task_per_thread * (slice + 2) <= n)
      ? task_per_thread * (slice + 1)
      : n;
  // Kernel kept identical to serialProduct() so timings stay comparable.
  for (unsigned int i = firstRow; i < endRow ; ++i){
    for (unsigned int j = 0; j < n ; ++j){
      result[i][j] = 0;
      for (unsigned int k = 0; k < n ; k++) {
        result[i][j]  += (A[i][k] * B[k][j]);
      }
    }
  }
  pthread_exit(NULL);
}

void parallelProduct(const FunctionCallbackInfo<Value>& args){
  Isolate* isolate = args.GetIsolate();
    if(args.Length() < 1){
    isolate->ThrowException(Exception::TypeError(
      String::NewFromUtf8(isolate, "Wrong numbers of arguments")));
    return;
  }


  n = args[0]->NumberValue();
  std::cout << "n: "<< n << std::endl;
  createMatrix();

  result = (float**) malloc(n *sizeof(float*));
  for (unsigned int i = 0; i < n; i++) {
    result[i] = (float*) malloc(n *sizeof(float));
  }
  unsigned cpus = 4;
    //std::cout << "cpus: "<< cpus << std::endl;
    pthread_t threads[cpus];
    task_per_thread = n / cpus;
    int rc;

    std::cout << "calculating product..." << std::endl;
    struct timespec requestStart, requestEnd;
    //start execution time
    clock_gettime(CLOCK_REALTIME, &requestStart);
    for(int i=0; i < cpus; i++ ){
    rc = pthread_create(&threads[i], NULL, runner, (void *) i);
    if (rc){
       cout << "Error:unable to create thread," << rc << endl;
       exit(-1);
    }
  }

  for(int i=0; i < cpus; i++ ){
    pthread_join(threads[i],NULL);
  }

    //end execution time
  clock_gettime(CLOCK_REALTIME, &requestEnd);
  double accum = ( requestEnd.tv_sec - requestStart.tv_sec )
      + ( requestEnd.tv_nsec - requestStart.tv_nsec )
      / BILLION;
  printf( "Parallel Time taken: %lf\n", accum );

}

// Allocates the global input matrices A and B (n rows of n floats each) and
// fills every element with rand() % 10, i.e. a whole number from 0 to 9.
// A is generated completely before B; rand() is not seeded in this function.
// Whatever A and B pointed to before the call is overwritten, not freed.
void createMatrix(){

  // Allocates and fills one n x n matrix in place, row by row.
  auto fillRandom = [](float **&target) {
    target = (float**) malloc(n * sizeof(float*));
    for (int row = 0; row < n; ++row) {
      float *cells = (float*) malloc(n * sizeof(float));
      target[row] = cells;
      for (int col = 0; col < n; ++col) {
        cells[col] = rand() % 10;
      }
    }
  };

  std::cout << "creating matrix A ..." << std::endl;
  fillRandom(A);

  std::cout << "creating matrix B ..." << std::endl;
  fillRandom(B);
}

void serialProduct(const FunctionCallbackInfo<Value>& args){
  Isolate* isolate = args.GetIsolate();
    if(args.Length() < 1){
    isolate->ThrowException(Exception::TypeError(
      String::NewFromUtf8(isolate, "Wrong numbers of arguments")));
    return;
  }

  std::cout << "n: "<< args[0]->NumberValue() << std::endl;
  n = args[0]->NumberValue();
  createMatrix();

  result = (float**) malloc(n *sizeof(float*));
  for (unsigned int i = 0; i < n; i++) {
    result[i] = (float*) malloc(n *sizeof(float));
  }
  std::cout << "calculating product..." << std::endl;
  struct timespec requestStart, requestEnd;
  //start execution time
  clock_gettime(CLOCK_REALTIME, &requestStart);

  for (unsigned int i = 0; i < n ; ++i){
    for (unsigned int j = 0; j < n ; ++j){
      result[i][j] = 0;
      for (unsigned int k = 0; k < n ; k++) {
        result[i][j]  += (A[i][k] * B[k][j]);
      }
    }
  }
  //end execution time
  clock_gettime(CLOCK_REALTIME, &requestEnd);
  double accum = ( requestEnd.tv_sec - requestStart.tv_sec )
      + ( requestEnd.tv_nsec - requestStart.tv_nsec )
      / BILLION;
  printf( "Serial Time taken: %lf\n", accum );

}

// Module initialiser: attaches the two native benchmark functions to the
// addon's exports object under the names "serialProduct" and
// "parallelProduct".
void Init(Local<Object> exports) {
  NODE_SET_METHOD(exports, "serialProduct", serialProduct);
  NODE_SET_METHOD(exports, "parallelProduct", parallelProduct);
}

NODE_MODULE(addon, Init)

binding.gyp

{
  "targets": [
    {
      "target_name": "addon",
      "sources": [ "addon.cc" ],
      # Suppress all compiler warnings. The former OS=="linux" condition set
      # this same flag in both its branches, so it is applied unconditionally.
      "cflags": [ "-w" ],
      # Link against pthread. This block used to sit at the top level of the
      # file, next to "targets", instead of inside the target.
      # NOTE(review): GYP documents link_settings as a per-target key, so the
      # top-level copy was presumably ignored - confirm with a verbose build.
      "link_settings": {
        "libraries": [
          "-pthread"
        ]
      }
    }
  ]
}

index.js

// Load the compiled native addon from the Release build directory.
const addon = require('./build/Release/addon');

// Run both benchmarks with matrix size 2000: threaded version first, then
// the serial one.
addon.parallelProduct(2000);
addon.serialProduct(2000);
0

There are 0 answers