How to write fast C++ lazy-evaluation code in Fastor or Xtensor?


I am a newbie in C++, and I have heard that libraries like Eigen, Blaze, Fastor and Xtensor, which use lazy evaluation and SIMD, are fast for vectorized operations.

I measured the time elapsed for some basic numeric operations using the following functions:

(Fastor)

using namespace Fastor;

template<typename T, size_t num>
T func2(Tensor<T,num> &u) {

    Tensor<T,num> z;
    for (auto k=0; k<100; ++k){
        z = u * u;
        z /= exp(u+u);
        z *= 1.;
        z *= sin(u) * cos(z);
    }
    return z(last);
}

(Xtensor)

template<typename T, size_t num>
T func2(xt::xtensor_fixed<T, xt::xshape<num>> &u) {

    xt::xtensor_fixed<T, xt::xshape<num>> z;

    for (auto k=0; k<100; ++k){
        z = u * u;
        z /= xt::exp(u+u);
        z *= 1.;
        z *= xt::sin(u) * xt::cos(z);
    }
    return z(0);
}

Compilation flags:

(Fastor)

-std=c++14 -O3 -march=native -funroll-loops -DNDEBUG -mllvm -inline-threshold=10000000 -ffp-contract=fast  -mfma -I/Path/to/Fastor -DFASTOR_NO_ALIAS -DFASTOR_DISPATCH_DIV_TO_MUL_EXPR

(Xtensor)

 -std=c++14 -O3 -march=native -funroll-loops -DNDEBUG -mllvm -inline-threshold=10000000 -ffp-contract=fast  -mfma -I/Path/to/xsimd/include/ -I/Path/to/xtl/include/ -I/Path/to/xtensor/include/ -I/Path/to/xtensor-blas/include/ -DXTENSOR_USE_XSIMD -lblas -llapack -DHAVE_CBLAS=1

Compiler: Apple LLVM version 10.0.0 (clang-1000.11.45.5)

Processor: 2.6 GHz Intel Core i5

For comparison, I also measured a function written in Python and optimized with numba.vectorize:

import numba
import numpy as np

@numba.vectorize(['float64(float64)'],nopython=True)
def func(x):
    for k in range(100):
        z = x * x
        z /= np.exp(x + x)
        z *= 1.0
        z *= np.sin(x) * np.cos(x)
    return z

The results (in units of µs) are:

---------------------------------------
num     |  Fastor  |  Xtensor | numba
---------------------------------------
100     |  286     |  201     | 13
1000    |  2789    |  1202    | 65
10000   |  29288   |  20468   | 658
100000  |  328033  |  165263  | 3166
---------------------------------------

Am I doing anything wrong? How can Fastor and Xtensor be 50x slower?

How can I make use of expression templates and lazy evaluation with the auto keyword?

Thanks for your help!



@Jérôme Richard Thanks for your help!

It's interesting that Fastor and Xtensor are not able to ignore the redundant for-loop. Anyway, I made a fairer comparison of each numeric operation.

The factor of 2 from SIMD also makes sense.

(Fastor)

template<typename T, size_t num>
T func_exp(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z += exp( u );
    }
    return z(0);
}
template<typename T, size_t num>
T func_sin(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z += sin( u );
    }
    return z(0);
}
template<typename T, size_t num>
T func_cos(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z += cos( u );
    }
    return z(0);
}
template<typename T, size_t num>
T func_add(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z += u;
    }
    return z(0);
}
template<typename T, size_t num>
T func_mul(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z *= u;
    }
    return z(0);
}
template<typename T, size_t num>
T func_div(Tensor<T,num> &u) {
    Tensor<T,num> z=u;
    for (auto k=0; k<100; ++k){
        z /= u;
    }
    return z(0);
}

(Xtensor)

template<typename T, size_t nn>
T func_exp(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z += xt::exp( u );
    }
    return z(0);
}
template<typename T, size_t nn>
T func_sin(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z += xt::sin( u );
    }
    return z(0);
}
template<typename T, size_t nn>
T func_cos(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z += xt::cos( u );
    }
    return z(0);
}
template<typename T, size_t nn>
T func_add(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z += u;
    }
    return z(0);
}
template<typename T, size_t nn>
T func_mul(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z *= u;
    }
    return z(0);
}
template<typename T, size_t nn>
T func_div(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u;
    for (auto k=0; k<100; ++k){
        z /= u;
    }
    return z(0);
}

(Numba)

import numba
from math import exp, sin, cos

@numba.vectorize(['float64(float64)'],nopython=True)
def func_exp(u):
    z = u
    for k in range(100):
        z += exp(u)
    return z
@numba.vectorize(['float64(float64)'],nopython=True)
def func_sin(u):
    z = u
    for k in range(100):
        z += sin(u)
    return z
@numba.vectorize(['float64(float64)'],nopython=True)
def func_cos(u):
    z = u
    for k in range(100):
        z += cos(u)
    return z
@numba.vectorize(['float64(float64)'],nopython=True)
def func_add(u):
    z = u
    for k in range(100):
        z += u
    return z
@numba.vectorize(['float64(float64)'],nopython=True)
def func_mul(u):
    z = u
    for k in range(100):
        z *= u
    return z
@numba.vectorize(['float64(float64)'],nopython=True)
def func_div(u):
    z = u
    for k in range(100):
        z /= u
    return z

The results show:

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
unit [1E-6 sec] |          exp              |         sin               |           cos             |         add           |           mul         |          div          |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
                |   F     |   X     |   N   |   F     |   X     |   N   |   F     |   X     |   N   |   F   |   X   |   N   |   F   |   X   |   N   |   F   |   X   |   N   |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
n=100           | 135/135 | 38/38   | 10    | 162/162 | 65/32   | 9     | 111/110 | 34/58   | 9     | 0.07  | 0.06  | 6.2   | 0.06  | 0.05  | 9.6   | 0.06  | 0.05  | 9.6   |
n=1000          | 850/858 | 501/399 | 110   | 1004/961| 522/491 | 94    | 917/1021| 486/450 | 92    | 20    | 43    | 57    | 22    | 40    | 91    | 279   | 275   | 91    |
n=10000         | 8113    | 4160    | 830   | 10670   | 4052    | 888   | 10094   | 3436    | 1063  | 411   | 890   | 645   | 396   | 922   | 1011  | 2493  | 2735  | 914   |
n=100000        | 84032   | 46173   | 8743  | 104808  | 48203   | 8745  | 102868  | 53948   | 8958  | 6138  | 18803 | 5672  | 6039  | 13851 | 9204  | 23404 | 33485 | 9149  |
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

Entries like 135/135 show the result without/with -ffast-math.

It turns out that

  1. Fastor/Xtensor perform really badly in exp, sin, cos, which was surprising.
  2. Fastor/Xtensor scale worse than Numba in +=, *=, /=.

Is this the nature of Fastor/Xtensor?


I modified the expression to

template<typename T, size_t num>
auto func_exp2(Tensor<T,num> &u) {
    Tensor<T,num> z = u + 100. * exp(u);
    return z;
}

template<typename T, size_t nn>
auto func_exp2(xt::xtensor_fixed<T, xt::xshape<nn>> &u) {
    xt::xtensor_fixed<T, xt::xshape<nn>> z=u + 100.*xt::exp(u);
    return z;
}

@numba.vectorize(['float64(float64)'],nopython=True)
def func_exp2(u):
    z = u + 100 * exp(u)
    return z

and it gives

-----------------------------------------------------------------
unit [1E-6 sec] |     Fastor    |     Xtensor   |      Numba    |
-----------------------------------------------------------------
n=100           |     0.100     |     0.066     |       1.8     |
n=1000          |     0.073     |     0.057     |       3.6     |
n=10000         |     0.086     |     0.089     |      26.7     |
n=100000        |     0.056     |     0.065     |     275.7     |
-----------------------------------------------------------------

What's happening?

  1. Why are Fastor/Xtensor not able to reduce the for-loop to a simple 100*exp(u) via lazy evaluation?
  2. Why do Fastor/Xtensor get faster as the tensor size increases?

There are 2 answers

Jérôme Richard (best answer):

The reason the Numba implementation is much faster is that it does not compute the same thing as the other two.

Indeed, the Python version does not read z in the expression np.sin(x) * np.cos(x). As a result, the Numba JIT is clever enough to execute the loop only once, which explains the factor of 100 between Fastor and Numba. You can check this by replacing range(100) with range(10000000000) and observing the same timings.

Finally, XTensor is faster than Fastor in this benchmark because it seems to use its own fast SIMD implementation of exp/sin/cos, while Fastor seems to use a scalar implementation from libm, which explains the factor of 2 between XTensor and Fastor.


Answer to the update:

Fastor/Xtensor perform really badly in exp, sin, cos, which was surprising.

No. We cannot conclude that from this benchmark. What you are comparing is the ability of the compilers to optimize your code. In this case, Numba does better than plain C++ compilers because it deals with high-level, SIMD-aware code, while the C++ compilers have to deal with the huge amount of low-level, template-based code coming from the Fastor/Xtensor libraries. Theoretically, I think it should be possible for a C++ compiler to apply the same kind of high-level optimization as Numba, but it is just harder. Moreover, note that Numpy tends to create/allocate temporary arrays while Fastor/Xtensor should not.

In practice, Numba is faster because u is a constant, and so are exp(u), sin(u) and cos(u). Thus, Numba precomputes the expression (it is evaluated only once) and still performs the sum in the loop. The following code gives the same timing:

@numba.vectorize(['float64(float64)'],nopython=True)
def func_exp(u):
    z = u
    tmp = exp(u)
    for k in range(100):
        z += tmp
    return z

I guess the C++ implementations do not perform this optimization due to the lazy evaluation. It may be a good idea to report this optimization issue on the two GitHub projects.
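
For reference, the same hoisting can be written by hand on the C++ side. A minimal sketch for the Fastor variant of func_exp from the update (the name func_exp_hoisted is made up for illustration):

template<typename T, size_t num>
T func_exp_hoisted(Tensor<T,num> &u) {
    Tensor<T,num> z = u;
    Tensor<T,num> tmp = exp(u);    // loop-invariant: evaluated once, before the loop
    for (auto k=0; k<100; ++k){
        z += tmp;                  // only the cheap addition remains inside the loop
    }
    return z(0);
}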

Moreover, note that u + u + ... + u is not strictly equal to 100 * u because floating-point addition is not associative. While -ffast-math helps to overcome this problem, compilers may still fail to perform this kind of optimization due to conflicting optimization passes. For example, too many iterations can prevent loop unrolling, which in turn can prevent the factorization of the expression.
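
As a small, self-contained illustration of this non-associativity (plain C++, independent of any tensor library):

#include <cstdio>

int main() {
    double u = 0.1;
    double sum = 0.0;
    for (int k = 0; k < 100; ++k) sum += u;   // u + u + ... + u, 100 times
    double prod = 100.0 * u;                  // the "factorized" form
    // the two printed values typically differ in the last digits
    std::printf("%.17g\n%.17g\n", sum, prod);
    return 0;
}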

I strongly advise you to perform more realistic benchmarks.

Fastor/Xtensor scale worse than Numba in +=, *=, /=.

Numba may replace the division by a constant with a multiplication in this case (i.e. 1/u may be precomputed). Besides that, note that Fastor and Numba are relatively close to each other.
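
If one wants that transformation by hand on the C++ side, the reciprocal can be hoisted explicitly. A minimal sketch for the Fastor variant (the name func_div_recip is made up, and it assumes scalar/tensor division is available, like the scalar multiplications used elsewhere in the question):

template<typename T, size_t num>
T func_div_recip(Tensor<T,num> &u) {
    Tensor<T,num> z = u;
    Tensor<T,num> inv_u = T(1) / u;   // precompute the reciprocal once
    for (auto k=0; k<100; ++k){
        z *= inv_u;                   // multiply instead of dividing every iteration
    }
    return z(0);
}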

Why are Fastor/Xtensor not able to reduce the for-loop to a simple 100*exp(u) via lazy evaluation?

I think lazy evaluation does not mean expressions are automatically factorized/optimized. It rather means that the result is computed only when it is required. However, expression factorization could be a good feature to add in future Fastor/Xtensor releases (it appears to be missing so far).

Why do Fastor/Xtensor get faster as the tensor size increases?

I think they are just as fast, not faster (the timing variations are probably noise). Thus, I guess the expressions are not actually computed. This is likely due to the lazy evaluation, since z is never read. Try return z(0); rather than return z; (the former forces the expression to be evaluated).
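
In other words, a minimal sketch of that change, applied to the Fastor variant of func_exp2 from the update:

template<typename T, size_t num>
T func_exp2(Tensor<T,num> &u) {
    Tensor<T,num> z = u + 100. * exp(u);
    return z(0);   // returning one element ensures the computed result is actually used
}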

Christopher Mauer:

I think you misunderstand how lazy evaluation works. C++ is a statically-typed language, and Python is not. When you perform an operation that produces an expression template, it produces a new type.

This code:

using namespace Fastor;

template<typename T, size_t num>
T func2(Tensor<T,num> &u) {

    Tensor<T,num> z;
    for (auto k=0; k<100; ++k){
        z = u * u;
        z /= exp(u+u);
        z *= 1.;
        z *= sin(u) * cos(z);
    }
    return z(last);
}

Does not produce what you think it does. z = u * u builds an expression template representing u * u, then immediately evaluates it and assigns the result to z, because z is of type Tensor<T, num>. For the expression template to carry over the loop, the type of z would have to change with each iteration! This is possible in Python because Python is dynamically typed. Fastor and xtensor assume you are trying to evaluate the expression at each step, which ruins any chance they have of performing reductions (something many libraries such as Blaze, Eigen, Fastor, xtensor, etc. do; Fastor's documentation even states that it will attempt to automatically reduce the expression using einsum notation and reductions where possible, and its implementation has a relatively sophisticated cost model for how simple the library really is).

To do this in C++, you need to unroll the loop and not assign to z until you're ready to evaluate. You can do this using std::make_index_sequence:

#include <utility>   // std::index_sequence, std::make_index_sequence
// note: fold expressions require C++17, so compile with -std=c++17 instead of -std=c++14

// Comma fold: the expression appears once per index, and the last copy is returned.
template<std::size_t ... Is, typename T>
constexpr auto unroll(std::index_sequence<Is...>, T&& expr) noexcept -> decltype(auto)
{
    return ((Is, expr), ...);
}

// Plus fold: sums one copy of the expression per index into a single large expression template.
template<std::size_t ... Is, typename T>
constexpr auto unroll_add(std::index_sequence<Is...>, T&& expr) noexcept -> decltype(auto)
{
    return ((Is, expr) + ...);
}


template<typename T, size_t num>
T func2(Tensor<T,num> &u) {

    Tensor<T,num> z = unroll(std::make_index_sequence<100>{},
        u * u
        / exp(u+u)
        * 1.
        * sin(u) * cos(z)
    );
    return z(last);
}

template<typename T, size_t num>
T func_exp(Tensor<T,num> &u) {
    Tensor<T,num> z = u + unroll_add(std::make_index_sequence<100>{},
        exp( u )
    );
    return z(0);
}

With expression templates, you can perform multi-step operations like this without immediately invoking the expression:

auto&& a = u + u;      // expression type, e.g. add<T&, T&>
auto&& b = a * u;      // expression type, e.g. mul<add<T&, T&>, T&>
Tensor<T, num> c = b;  // the expression is evaluated here, when assigned to a concrete Tensor