Why doesn't std::valarray get more attention from C++ compilers? Expression templates mean big speedups for certain math-heavy tasks


Code like this (I'm not the author, but I appreciate the work):

// 22 cycles per pixel mandelbrot (cascadelake)
#include <algorithm>
#include <array>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string_view>
#include <valarray>

namespace {

constexpr int kWidth = 2000;
constexpr int kHeight = 2000;
constexpr int kChunkSize = 256;

using Pixel = std::uint8_t;
using Image = std::array<Pixel, kWidth * kHeight>;
using FloatChunk = std::valarray<float>;
using IterChunk = std::valarray<Pixel>;

// Mapping of integer (x,y) to complex(Re,Im):
//  re = 3. * x / width - 2.25
//  im = (1.5 * height - 3. * y) / width

Image Mandelbrot(float x_scale, float y_scale, float x_mid, float y_mid) {
  std::valarray<Pixel> image_temp(kWidth*kHeight + kChunkSize - 1);
  FloatChunk mb_c_re(kChunkSize);
  FloatChunk mb_c_im(kChunkSize);
  FloatChunk mb_z_re(kChunkSize);
  FloatChunk mb_z_im(kChunkSize);
  IterChunk mb_i(kChunkSize);
  IterChunk ones(kChunkSize);
  ones = 1;
  const int pix_offset = (kHeight - kHeight / 2 + 1) * kWidth - kWidth / 2;

  // Don't bother worrying about the seam at the right.
  // Overcompute and overwrite.
  for (int y = kHeight / 2 - kHeight; y < kHeight / 2; ++y) {
    mb_c_im = y_scale * y + y_mid;
    for (int x = kWidth / 2 - kWidth; x < kWidth / 2; x += kChunkSize) {
      for (int xx = 0; xx < kChunkSize; ++xx) {
        mb_c_re[xx] = x_scale * (x + xx) + x_mid;
      }
      mb_z_re = 0.f;
      mb_z_im = 0.f;
      mb_i = 0;

      for (int i = 0; i < 35; ++i) {
        const auto mb_z_re_2 = mb_z_re * mb_z_re;
        const auto mb_z_im_2 = mb_z_im * mb_z_im;
        const auto mb_z_norm = mb_z_re_2 + mb_z_im_2;
        const FloatChunk n_mb_z_re = mb_z_re_2 - mb_z_im_2 + mb_c_re;
        const FloatChunk n_mb_z_im = 2.0f * mb_z_re * mb_z_im + mb_c_im;
        const auto mask = mb_z_norm < 4.0f;  // pixels that have not escaped yet
        mb_i[mask] += ones[mask];            // masked increment of the iteration count

        mb_z_re = n_mb_z_re;
        mb_z_im = n_mb_z_im;
//      mb_z_re[mask] = n_mb_z_re[mask];
//      mb_z_im[mask] = n_mb_z_im[mask];
        if (!mask.sum()) break;
      }
  
      mb_i[mb_i >= 34] = 0;
      mb_i = mb_i * 7 + (mb_i * 3) / 4;
  
      image_temp[std::slice(y*kWidth + x + pix_offset, kChunkSize, 1)] = mb_i;
    }
  }
  
  Image image{};
  std::copy(std::begin(image_temp), std::begin(image_temp) + kWidth * kHeight,
            image.begin()); 
  return image;
}     

}  // namespace

#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

// optional wrapper if you don't want to just use __rdtsc() everywhere
inline
uint64_t readTSC() {
    // _mm_lfence();  // optionally wait for earlier insns to retire before reading the clock
    uint64_t tsc = __rdtsc();
    // _mm_lfence();  // optionally block later instructions until rdtsc retires
    return tsc;
}

int main() {
  auto t1 = readTSC();
  // scale = 3/width, matching the (x, y) -> (Re, Im) mapping documented above
  auto image = Mandelbrot(3.0f / kWidth, 3.0f / kWidth, -3.0f / 4, 0);
  auto t2 = readTSC();  
  std::cout << (t2 - t1)/(kWidth*kHeight) << " cycles per pixel" << std::endl;

  // write to file at once
  std::ofstream fout;
  fout.open("mandelbrot.ppm", std::ios::binary);  // P5 is a binary format
  if (fout.is_open()) {
    fout << "P5\n" << kWidth << ' ' << kHeight << " 255\n"
         << std::string_view(reinterpret_cast<char *>(image.data()), image.size());
    fout.close();
  } else {
    std::cout << "Could not open the file!\n";
  }

  return 0;
}

has no explicit vectorization, so it is relatively more readable, yet it retains a good portion of the speedup gained from explicit vectorization with intrinsics or GNU vector extensions.
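For comparison, here is a minimal sketch (my own, not taken from the code above) of what one explicitly vectorized escape-time step could look like with GNU vector extensions; the type names `v8sf`/`v8si` are local typedefs and the 8-wide (AVX) width is just an assumed choice:

```c++
// Hypothetical illustration only: one z = z*z + c step for 8 pixels at once,
// written with GNU vector extensions (GCC/Clang) instead of std::valarray.
typedef float v8sf __attribute__((vector_size(32)));  // 8 packed floats
typedef int   v8si __attribute__((vector_size(32)));  // 8 packed 32-bit ints

inline v8si mandel_step(v8sf& z_re, v8sf& z_im, v8sf c_re, v8sf c_im, v8si& iters) {
  const v8sf re2 = z_re * z_re;
  const v8sf im2 = z_im * z_im;
  const v8sf four = {4, 4, 4, 4, 4, 4, 4, 4};
  const v8si active = (re2 + im2) < four;  // lane mask: -1 where |z|^2 < 4, else 0
  iters -= active;                         // subtracting -1 increments active lanes
  const v8sf new_re = re2 - im2 + c_re;
  z_im = 2.0f * z_re * z_im + c_im;
  z_re = new_re;
  return active;                           // caller can stop once all lanes are 0
}
```

A caller would repeat `mandel_step` until the returned mask is all zero, which is essentially what the valarray version's `if (!mask.sum()) break;` does, only with the lane handling written out by hand.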

Are there plans for future C++ versions to give std::valarray more importance (so that its performance approaches that of explicit intrinsics)? And what about element types made of multiple numbers, such as std::complex? Will std::valarray support them?
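For the std::complex part of the question: as far as I can tell, `std::valarray<std::complex<float>>` already compiles and supports elementwise arithmetic today; whether compilers vectorize it well is the open part. A minimal sketch (my own, with arbitrary values):

```c++
#include <complex>
#include <iostream>
#include <valarray>

int main() {
  using C = std::complex<float>;
  std::valarray<C> z(C(0.0f, 0.0f), 8);    // 8 elements, all 0+0i
  std::valarray<C> c(C(-0.75f, 0.1f), 8);  // 8 copies of an arbitrary constant

  for (int i = 0; i < 35; ++i)
    z = z * z + c;                         // elementwise complex z = z^2 + c

  std::cout << z[0] << '\n';
}
```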

