Integer Bilinear interpolation optimization

348 views Asked by At

My code was very much bottlenecked by bilinear interpolation so I wrote a version (ScaleBlerpI) that does not use floating point math. This is already 1.5 1.85 times faster but I am wondering how I could make it even faster.

Any hints are appreciated.

func ScaleBlerpI(src, dst *ValueFieldI) {
    mx := uint64((src.Width - 1) * math.MaxUint32 / dst.Width)
    my := uint64((src.Height - 1) * math.MaxUint32 / dst.Height)

    for y := uint64(0); y < uint64(dst.Height); y++ {
        for x := uint64(0); x < uint64(dst.Width); x++ {
            gx := (x * mx) >> 32            // eq. / math.MaxUint32
            tx := (x * mx) & math.MaxUint32 // eq. % (math.MaxUint32 + 1) or % 2^32
            gy := (y * my) >> 32
            ty := (y * my) & math.MaxUint32

            srcX, srcY := int(gx), int(gy)
            rgba00 := src.GetComponent(srcX, srcY)
            rgba10 := src.GetComponent(srcX+1, srcY)
            rgba01 := src.GetComponent(srcX, srcY+1)
            rgba11 := src.GetComponent(srcX+1, srcY+1)
            result := []uint32{
                blerpI(rgba00[0], rgba10[0], rgba01[0], rgba11[0], tx, ty),
                blerpI(rgba00[1], rgba10[1], rgba01[1], rgba11[1], tx, ty),
                blerpI(rgba00[2], rgba10[2], rgba01[2], rgba11[2], tx, ty),
            }
            dst.SetComponent(int(x), int(y), result)
        }
    }
}

func lerpI(s, e uint32, f uint64) uint32 {
    // basically s * (1 - f) + b * f
    return uint32(
        (uint64(s)*(math.MaxUint32-f) + uint64(e)*f) /
            math.MaxUint32)
}
func blerpI(c00, c10, c01, c11 uint32, tx, ty uint64) uint32 {
    return lerpI(
        lerpI(c00, c10, tx),
        lerpI(c01, c11, tx),
        ty,
    )
}
type ValueFieldI struct {
    Width, Height int
    ComponentSize int
    Values        []uint32
}

func (vf *ValueFieldI) GetComponent(x, y int) []uint32 {
    componentIdx := x + y*vf.Width
    return vf.Values[componentIdx*vf.ComponentSize : componentIdx*vf.ComponentSize+vf.ComponentSize]
}

func (vf *ValueFieldI) SetComponent(x, y int, c []uint32) {
    copy(vf.GetComponent(x, y), c)
}

Profiling has shown me that the most time is lost on blerpI, src.GetComponent and dst.SetComponent

Edit 1

Replaced

    // basically s * (1 - f) + e * f
    return uint32(
        (uint64(s)*(math.MaxUint32-f) + uint64(e)*f) /
            math.MaxUint32)

With

    // basically s + f*(e-s)
    return s + uint32((f*(uint64(e)-uint64(s)))>>32)

Integer version is now 1.85 times faster.

Edit 2

Benchmark:

func BenchmarkBlerpIRand(b *testing.B) {
    src := &ValueFieldI{
        Width:         37,
        Height:        37,
        ComponentSize: 3,
        Values:        make([]uint32, 37*37*3),
    }

    for i := range src.Values {
        src.Values[i] = rand.Uint32()
    }

    dst := &ValueFieldI{
        Width:         37 * 8,
        Height:        37 * 8,
        ComponentSize: 3,
        Values:        make([]uint32, 37*8*37*8*3),
    }

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        ScaleBlerpI(src, dst)
    }
}
0

There are 0 answers