What is the benefit of using SIMD to pre-calculate the branching results?

83 views Asked by AceSrc At 20 March 2024 at 11:05

I noticed that when the following code is compiled with LLVM, the loop is unrolled and the results of eight consecutive `ident != -1` comparisons are computed with SIMD. After that, the computed branch conditions are extracted one by one to perform the branching. What is the benefit of using SIMD here when the number of branches is not reduced? It seems that it just turns the `cmp` instructions into `extractelement` instructions.

/* Element type processed by foo(): a short tag followed by int payload. */
struct T {
  short ident;          /* tag; foo() leaves the value -1 untouched */
  int other_fields[10]; /* remaining payload; foo() never accesses it */
};

/*
 * Set the ident of each of the first n elements of data to 1, except for
 * elements whose ident is -1, which are left unchanged.
 *
 * n    - number of elements to visit; nothing happens when n <= 0
 * data - array holding at least n elements
 */
void foo(int n, struct T *data) {
    for (int idx = 0; idx < n; idx++) {
        /* -1 is the one tag value that must be preserved. */
        if (data[idx].ident == -1) {
            continue;
        }
        data[idx].ident = 1;
    }
}

The LLVM IR:

9:                                                ; preds = %66, %7
  %10 = phi i64 [ 0, %7 ], [ %67, %66 ]
  %11 = or i64 %10, 1
  %12 = or i64 %10, 2
  %13 = or i64 %10, 3
  %14 = or i64 %10, 4
  %15 = or i64 %10, 5
  %16 = or i64 %10, 6
  %17 = or i64 %10, 7
  %18 = getelementptr inbounds %struct.T, ptr %1, i64 %10
  %19 = getelementptr inbounds %struct.T, ptr %1, i64 %11
  %20 = getelementptr inbounds %struct.T, ptr %1, i64 %12
  %21 = getelementptr inbounds %struct.T, ptr %1, i64 %13
  %22 = getelementptr inbounds %struct.T, ptr %1, i64 %14
  %23 = getelementptr inbounds %struct.T, ptr %1, i64 %15
  %24 = getelementptr inbounds %struct.T, ptr %1, i64 %16
  %25 = getelementptr inbounds %struct.T, ptr %1, i64 %17
  %26 = load i16, ptr %18, align 4, !tbaa !5
  %27 = load i16, ptr %19, align 4, !tbaa !5
  %28 = load i16, ptr %20, align 4, !tbaa !5
  %29 = load i16, ptr %21, align 4, !tbaa !5
  %30 = load i16, ptr %22, align 4, !tbaa !5
  %31 = load i16, ptr %23, align 4, !tbaa !5
  %32 = load i16, ptr %24, align 4, !tbaa !5
  %33 = load i16, ptr %25, align 4, !tbaa !5
  %34 = insertelement <8 x i16> poison, i16 %26, i64 0
  %35 = insertelement <8 x i16> %34, i16 %27, i64 1
  %36 = insertelement <8 x i16> %35, i16 %28, i64 2
  %37 = insertelement <8 x i16> %36, i16 %29, i64 3
  %38 = insertelement <8 x i16> %37, i16 %30, i64 4
  %39 = insertelement <8 x i16> %38, i16 %31, i64 5
  %40 = insertelement <8 x i16> %39, i16 %32, i64 6
  %41 = insertelement <8 x i16> %40, i16 %33, i64 7
  %42 = icmp ne <8 x i16> %41, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %43 = extractelement <8 x i1> %42, i64 0
  br i1 %43, label %44, label %45

44:                                               ; preds = %9
  store i16 1, ptr %18, align 4, !tbaa !5
  br label %45
45:                                               ; preds = %44, %9
  %46 = extractelement <8 x i1> %42, i64 1
  br i1 %46, label %47, label %48

47:                                               ; preds = %45
  store i16 1, ptr %19, align 4, !tbaa !5
  br label %48

48:                                               ; preds = %47, %45
  %49 = extractelement <8 x i1> %42, i64 2
  br i1 %49, label %50, label %51

50:                                               ; preds = %48
  store i16 1, ptr %20, align 4, !tbaa !5
  br label %51

51:                                               ; preds = %50, %48
  %52 = extractelement <8 x i1> %42, i64 3
  br i1 %52, label %53, label %54

53:                                               ; preds = %51
  store i16 1, ptr %21, align 4, !tbaa !5
  br label %54

54:                                               ; preds = %53, %51
  %55 = extractelement <8 x i1> %42, i64 4
  br i1 %55, label %56, label %57

56:                                               ; preds = %54
  store i16 1, ptr %22, align 4, !tbaa !5
  br label %57
...

Original Q&A

TechQA.

What is the benefit of using SIMD to pre-calculate the branching results?

There are 0 answers

Related Questions in C++

Related Questions in LLVM

Related Questions in SIMD

Popular Questions

Trending Questions