Why are the initial auto-vectorization loads from an aligned std::array scalar? (g++/clang++)

I have trouble understanding what prevents compilers from using vector loads for the initial reads from a std::array<uint64_t,...>.

I know that gcc can produce debug information with -fopt-info-vec-*. I can't find anything in the verbose log that would indicate why both compilers make the same sub-optimal decision to use scalar loads for the initial elements.

On the other hand, I don't know how to make clang provide detailed information about vectorization problems. -Rpass-analysis=loop-vectorize only reports that the loop in init isn't worth interleaving. Of course my intrinsics version is proof that the loop could be vectorized, but the required transformations are probably too complex to expect from a compiler.

I could of course implement the hot paths using intrinsics, but that requires duplicating the same logic for each CPU architecture. I would much prefer to write standard C++ code that the compiler can vectorize near perfectly. Then it becomes simple to compile the same code multiple times with different flags using the target_clones attribute, or macros and the target attribute.
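
As an illustration of the multi-versioning I have in mind, here is a minimal sketch using GCC's target_clones on the init loop from the test case below (the clone list is just an example):

    __attribute__((target_clones("avx2", "default")))
    void init_clones(uint64_t initial)
    {
        /* gcc compiles one version of the body per listed target and
         * selects one at load time through an ifunc resolver. */
        for (unsigned i = 0; i < pack.size(); i++)
            pack[i] = 1UL << (i + initial);
    }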

How can I make the compiler tell me why the loads failed to vectorize?

I suspect gcc might already print that information; I just don't know what I'm looking for.
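
For reference, these are the reporting options I know of for both compilers (the report file name is just my own choice):

    # GCC: report vectorized loops, missed opportunities and analysis notes
    g++ -O3 -march=native -fvect-cost-model=unlimited \
        -fopt-info-vec-all=vec-report.txt vectest.cc -o vectest

    # Clang: remarks for performed, missed and analyzed loop vectorization
    clang++ -O3 -march=native -Rpass=loop-vectorize \
        -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize \
        vectest.cc -o vectest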

Why does auto-vectorization fail for the initial loads?

    /**
     * This is a test case with the abstraction layers removed from my actual
     * code. My real code has one extra problem: access to pack loses the
     * alignment information. But that wasn't the only issue; the compilers
     * still generate suboptimal machine code even with the alignment
     * information present. I fail to understand why loads are treated
     * differently from stores to the same address when auto-vectorization
     * is used.
     *
     * I tested gcc 6.2 and clang 3.9
     * g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
     * clang++ -O3 -g -march=native vectest.cc -o vectest
     */


    #include <array>
    #include <cstdint>

    alignas(32) std::array<uint64_t, 52> pack;
    alignas(32) uint64_t board[4];

    __attribute__((noinline))
    static void init(uint64_t initial)
    {
        /* Clang seems to prefer a large constant table and an unrolled copy,
         * which should perform worse outside a micro benchmark: L1 misses
         * and memory bandwidth are a bigger bottleneck than ALU instruction
         * execution. But of course this code won't be compiled into a hot
         * path, so I don't care how it is compiled as long as it works
         * correctly.
         *
         * The most interesting detail from clang is that the vectorized
         * stores are generated correctly, like:
    4005db:       vpsllvq %ymm2,%ymm1,%ymm2
    4005e0:       vmovdqa %ymm2,0x200a78(%rip)        # 601060 <pack>
    4005e8:       vpaddq 0x390(%rip),%ymm0,%ymm2        # 400980 <_IO_stdin_used+0x60>
    4005f0:       vpsllvq %ymm2,%ymm1,%ymm2
    4005f5:       vmovdqa %ymm2,0x200a83(%rip)        # 601080 <pack+0x20>
    4005fd:       vpaddq 0x39b(%rip),%ymm0,%ymm2        # 4009a0 <_IO_stdin_used+0x80>
         *
         * gcc prefers a scalar loop.
         */

        for (unsigned i = 0; i < pack.size(); i++) {
            pack[i] = 1UL << (i + initial);
        }
    }

    #include "immintrin.h"
    __attribute__((noinline))
    static void expected_init(uint64_t initial)
    {
        /** Just an intrinsics implementation of init that would IMO be the
         * ideal optimization.
         */
    #if __AVX2__
        unsigned i;
        union {
            uint64_t *mem;
            __m256i *avx;
        } conv;
        conv.mem = &pack[0];
        __m256i t = _mm256_set_epi64x(
                1UL << 3,
                1UL << 2,
                1UL << 1,
                1UL << 0
                );
        /* initial is just an extra random number to prevent constant array
         * initialization
         */
        t = _mm256_slli_epi64(t, initial);
        for(i = 0; i < pack.size()/4; i++) {
            _mm256_store_si256(&conv.avx[i], t);
            t = _mm256_slli_epi64(t, 4);
        }
    #endif
    }

    __attribute__((noinline))
    static void iter_or()
    {
        /** initial load (clang):
    4006f0:       vmovaps 0x200988(%rip),%xmm0        # 601080 <pack+0x20>
    4006f8:       vorps  0x200960(%rip),%xmm0,%xmm0        # 601060 <pack>
    400700:       vmovaps 0x200988(%rip),%xmm1        # 601090 <pack+0x30>
    400708:       vorps  0x200960(%rip),%xmm1,%xmm1        # 601070 <pack+0x10>
    400710:       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
        * expected:
    400810:       vmovaps 0x200868(%rip),%ymm0        # 601080 <pack+0x20>
    400818:       vorps  0x200840(%rip),%ymm0,%ymm0        # 601060 <pack>
    400820:       vorps  0x200878(%rip),%ymm0,%ymm0        # 6010a0 <pack+0x40>
        */

        auto iter = pack.begin();
        uint64_t n(*iter++),
             e(*iter++),
             s(*iter++),
             w(*iter++);
        for (;iter != pack.end();) {
            n |= *iter++;
            e |= *iter++;
            s |= *iter++;
            w |= *iter++;
        }
        /** The store is correctly vectorized into a single instruction */
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    __attribute__((noinline))
    static void index_or()
    {
        /** Clang compiles this to the same code as the iterator variant. gcc
         * goes completely insane. I don't even want to try to guess what all
         * the permutation stuff is trying to achieve.
         */
        unsigned i;
        uint64_t n(pack[0]),
             e(pack[1]),
             s(pack[2]),
             w(pack[3]);
        for (i = 4 ; i < pack.size(); i+=4) {
            n |= pack[i+0];
            e |= pack[i+1];
            s |= pack[i+2];
            w |= pack[i+3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }

    #include "immintrin.h"

    __attribute__((noinline))
    static void expected_result()
    {
        /** An intrinsics implementation of what I would expect
         * auto-vectorization to turn my C++ code into. I simply can't
         * understand why both compilers fail to achieve the result I expect.
         */
    #if __AVX2__
        union {
            uint64_t *mem;
            __m256i *avx;
        } conv;
        conv.mem = &pack[0];
        unsigned i;
        __m256i res = _mm256_load_si256(&conv.avx[0]);
        for (i = 1; i < pack.size()/4; i++) {
            __m256i temp = _mm256_load_si256(&conv.avx[i]);
            res = _mm256_or_si256(res, temp);
        }
        conv.mem = board;
        _mm256_store_si256(conv.avx, res);
    #endif
    }

    int main(int c, char **v)
    {
        (void)v;
        expected_init(c - 1);
        init(c - 1);

        iter_or();
        index_or();
        expected_result();
    }

1 Answer

Answered by Pauli Nieminen:

It appears that gcc and clang both fail to vectorize the initial load from outside the loop. If I change the code to zero the temporary variables first and then OR in values starting from the first element, both compilers do a better job. Clang generates good unrolled vector code (the only bottleneck is the single ymm register, with every instruction depending on the previous one). GCC generates somewhat worse code, with an extra initial vpxor and a pretty bad loop doing only one vpor per iteration.
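
For reference, the variant I mean looks roughly like this (a minimal sketch using the globals from the question; index_or_from_zero is just my name for it here):

    __attribute__((noinline))
    static void index_or_from_zero()
    {
        /* Start the accumulators at zero and OR in every element inside
         * the loop, instead of peeling the first four loads out of it. */
        uint64_t n = 0, e = 0, s = 0, w = 0;
        for (unsigned i = 0; i < pack.size(); i += 4) {
            n |= pack[i + 0];
            e |= pack[i + 1];
            s |= pack[i + 2];
            w |= pack[i + 3];
        }
        board[0] = n;
        board[1] = e;
        board[2] = s;
        board[3] = w;
    }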

I also tested a few alternative implementations, where the micro benchmark winner would be clang's unrolled code improved with alternating registers:

/* only reduce (calling this function from a for loop):
 * ST 7.3 cycles (ST=single thread)
 * SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
 * shuffle+reduce (calling Fisher-Yates shuffle and then this function):
 * ST 222 cycles
 * SMT 383 cycles 
 */
    "vmovaps 0x00(%0), %%ymm0\n"
    "vmovaps 0x20(%0), %%ymm1\n"
    "vpor 0x40(%0), %%ymm0, %%ymm0\n"
    "vpor 0x60(%0), %%ymm1, %%ymm1\n"
    "vpor 0x80(%0), %%ymm0, %%ymm0\n"
    "vpor 0xA0(%0), %%ymm1, %%ymm1\n"
    "vpor 0xC0(%0), %%ymm0, %%ymm0\n"
    "vpor 0xE0(%0), %%ymm1, %%ymm1\n"
    "vpor 0x100(%0), %%ymm0, %%ymm0\n"
    "vpor 0x120(%0), %%ymm1, %%ymm1\n"
    "vpor 0x140(%0), %%ymm0, %%ymm0\n"
    "vpor 0x160(%0), %%ymm1, %%ymm1\n"
    "vpor 0x180(%0), %%ymm0, %%ymm0\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%1)\n"

The Clang unrolled loop has timings like:

/* only reduce:
 * ST 9.8 cycles
 * SMT 21.8 cycles
 * shuffle+reduce:
 * ST 223 cycles
 * SMT 385 cycles
 */

But the numbers where SMT reduced performance for the unrolled code looked suspicious. I decided to try to write a better version of the GCC-style loop, which was still clearly slower than the unrolled code. Then I decided to break the instruction dependencies by using two registers and unrolling the loop once. That resulted in slightly faster shuffle+reduce code than unrolling completely.

size_t end = pack.size() - 3*4;
asm (
/* The best SMT option outside micro optimization.
 * This allows executing two vpor instructions same time and
 * reduces loop count to half with single unroll
 *
 * only reduce:
 * ST 13.0 cycles
 * SMT 20.0 cycles
 * shuffle+reduce:
 * ST 221 cycles
 * SMT 380 cycles
 */
    "vmovaps 0x180(%[pack]), %%ymm0\n"
    "vmovaps 0x160(%[pack]), %%ymm1\n"
    "vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "1:\n"
    "vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
    "vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
    "sub $8, %[cnt]\n"
    "jne 1b\n"

    "vpor %%ymm0, %%ymm1, %%ymm0\n"
    "vmovaps %%ymm0, 0x00(%[out])\n"
    : [cnt]"+r"(end)
    : [pack]"r"(begin), [out]"r"(hands_));

But the differences are surprisingly small when the code is run after a Fisher-Yates shuffle. Even the gcc version, with a clear loss in the reduce-only benchmark (16.4/38.8), runs the shuffle+reduce test at close to the same speed (228/387).