Issue Moving Inline Assembly to Standalone File

65 views Asked by At

I've been trying to port over a few functions from inline assembly to standalone assembly files. I've had success with all but one; below is the inline followed by my current standalone version. The moment it attempts to use onesByte in the standalone version it fails to return the same results. I'm asking here because I'm hoping someone knows how the compiler handles standalone asm files differently than inline assembly.

Inline version:

// 128-bit constant with every byte equal to 0x01, declared 16-byte aligned.
// checkOscillation5_SSE2 uses it directly as the memory operand of psubusb
// to subtract 1 (with unsigned saturation) from each byte of an XMM register.
__declspec(align(16)) const __int64 onesByte[2] = { 0x0101010101010101, 0x0101010101010101 };

// Builds a per-byte 0xFF/0x00 mask in dstp from five source rows, 16 bytes at
// a time, using SSE2 in 32-bit MSVC inline assembly.
//
// The five inputs are treated as two groups: A = {p2p, s1p, n2p} and
// B = {p1p, n1p}. For each byte position the output is 0xFF when
//   (max(A) - min(A)) <= thresh - 1   and
//   (max(B) - min(B)) <= thresh - 1   and
//   (max(A) <= sat(min(B) - 1)  or  max(B) <= sat(min(A) - 1)),
// where sat() is unsigned saturation at 0; otherwise the output is 0x00.
//
// Parameters:
//   p2p, p1p, s1p, n1p, n2p - source row pointers (read with aligned loads)
//   dstp                    - destination row pointer (written with movdqa)
//   stride                  - byte step added to every pointer after each row
//   width                   - bytes per row; handled in 16-byte steps, so the
//                             loop always processes at least 16 bytes and
//                             rounds the row up to a multiple of 16
//   height                  - number of rows; the loop body runs before the
//                             count is tested
//   thresh                  - threshold; only the low byte of (thresh - 1)
//                             is used
//
// Every memory access uses movdqa or a 128-bit memory operand, so all row
// pointers and the stride must keep addresses 16-byte aligned.
// The arguments p2p, dstp, height and thresh are modified in place.
void TComb::checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p, const uint8_t *s1p,
    const uint8_t *n1p, const uint8_t *n2p, uint8_t *dstp, int stride, int width, int height, int thresh)
{
    __asm
    {
        // Row pointers: eax = p2p, ebx = p1p, edx = s1p, edi = n1p, esi = n2p.
        mov eax, p2p
        mov ebx, p1p
        mov edx, s1p
        mov edi, n1p
        mov esi, n2p
        // xmm6 = all zero, used as the comparison operand for pcmpeqb below.
        pxor xmm6, xmm6
        // xmm7 = low byte of (thresh - 1) replicated into all 16 bytes.
        dec thresh
        movd xmm7, thresh
        punpcklbw xmm7, xmm7
        punpcklwd xmm7, xmm7
        punpckldq xmm7, xmm7
        punpcklqdq xmm7, xmm7
    yloop :
        // ecx = byte offset within the current row.
        xor ecx, ecx
        align 16
    xloop :
        // xmm0 / xmm1 = min / max of group A (p2p, s1p, n2p).
        // xmm2 / xmm3 = min / max of group B (p1p, n1p).
        movdqa xmm0, [eax + ecx]
        movdqa xmm2, [ebx + ecx]
        movdqa xmm1, xmm0
        movdqa xmm3, xmm2
        pminub xmm0, [edx + ecx]
        pmaxub xmm1, [edx + ecx]
        pminub xmm2, [edi + ecx]
        pmaxub xmm3, [edi + ecx]
        pminub xmm0, [esi + ecx]
        pmaxub xmm1, [esi + ecx]
        // xmm4 = sat(range(B) - (thresh - 1)), xmm5 = sat(range(A) - (thresh - 1));
        // a zero byte means that group's range is within the threshold.
        movdqa xmm4, xmm3
        movdqa xmm5, xmm1
        psubusb xmm4, xmm2
        psubusb xmm5, xmm0
        psubusb xmm4, xmm7
        psubusb xmm5, xmm7
        // Lower both minima by one (saturating), then test whether the other
        // group's maximum does not exceed it: a zero byte in xmm1 means
        // max(A) <= sat(min(B) - 1); a zero byte in xmm3 means
        // max(B) <= sat(min(A) - 1).
        psubusb xmm2, onesByte
        psubusb xmm0, onesByte
        psubusb xmm1, xmm2
        psubusb xmm3, xmm0
        // Turn "byte == 0" into 0xFF masks.
        pcmpeqb xmm1, xmm6
        pcmpeqb xmm3, xmm6
        pcmpeqb xmm4, xmm6
        pcmpeqb xmm5, xmm6
        // eax is borrowed for the destination pointer; it is reloaded with
        // p2p right after the store.
        mov eax, dstp
        // result = (A below B or B below A) and both ranges within threshold.
        por xmm1, xmm3
        pand xmm4, xmm5
        pand xmm1, xmm4
        movdqa[eax + ecx], xmm1
        add ecx, 16
        mov eax, p2p
        // Signed compare of the offset against width.
        cmp ecx, width
        jl xloop
        // Advance all six row pointers by stride. p2p and dstp live in their
        // argument slots, so they are updated in memory through eax.
        mov eax, stride
        add ebx, stride
        add p2p, eax
        add edx, stride
        add edi, stride
        add dstp, eax
        add esi, stride
        mov eax, p2p
        dec height
        jnz yloop
    }
}

Standalone version:

;-----------------------------------------------------------------------
; void checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p,
;         const uint8_t *s1p, const uint8_t *n1p, const uint8_t *n2p,
;         uint8_t *dstp, int stride, int width, int height, int thresh)
;
; Syntax: MASM, 32-bit flat model, C naming and calling convention
;         (set by ".model flat,c").
; NOTE(review): a C++ caller presumably has to declare this function
;         extern "C" as a free function - confirm against the caller.
;
; Purpose: for every byte position, with A = {p2p, s1p, n2p} and
;         B = {p1p, n1p}, store 0FFh to dstp when
;           max(A) - min(A) <= thresh - 1  and
;           max(B) - min(B) <= thresh - 1  and
;           (max(A) <= sat(min(B) - 1)  or  max(B) <= sat(min(A) - 1)),
;         otherwise store 00h.  sat() is unsigned saturation at 0.
;
; In:     stack arguments as listed above (width is spelled width_ here).
; Out:    none (result written through dstp).
; Clobb:  eax, ecx, edx, xmm0-xmm7, flags.  ebx, esi, edi are saved and
;         restored by the USES clause.  The argument slots p2p, dstp,
;         height and thresh are modified on the stack.
; Align:  every access is movdqa or a 128-bit memory operand, so all row
;         pointers and stride must keep addresses 16-byte aligned.
;         Rows are processed in 16-byte steps; at least one step and one
;         row are always executed.
;-----------------------------------------------------------------------

; CPU level is stated explicitly, ahead of .xmm and .model, so the flat
; model and the SSE2 mnemonics do not depend on an implied default.
.686
.xmm
.model flat,c

.data

align 16

; 16 bytes of 01h: memory operand for the saturating "minus one" below.
onesByte qword 2 dup(0101010101010101h)

; FIX: switch to the code segment before the procedure.  Without this
; directive everything below is assembled into the data segment, where
; the "align 16" in front of xloop is filled with data padding instead
; of NOPs and that padding is executed on the way from yloop to xloop.
.code

    public checkOscillation5_SSE2

checkOscillation5_SSE2 proc uses ebx esi edi p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword

    ; Row pointers: eax = p2p, ebx = p1p, edx = s1p, edi = n1p, esi = n2p.
    mov         eax, p2p
    mov         ebx, p1p
    mov         edx, s1p
    mov         edi, n1p
    mov         esi, n2p

    ; xmm6 = 0 (compare operand); xmm7 = low byte of (thresh - 1) in
    ; all 16 byte lanes.
    pxor        xmm6, xmm6
    dec         thresh
    movd        xmm7, thresh
    punpcklbw   xmm7, xmm7
    punpcklwd   xmm7, xmm7
    punpckldq   xmm7, xmm7
    punpcklqdq  xmm7, xmm7

yloop:
    xor         ecx, ecx                ; ecx = byte offset in the row

    align 16                            ; NOP padding now that we are in .code
xloop:
    ; xmm0/xmm1 = min/max of A (p2p, s1p, n2p)
    ; xmm2/xmm3 = min/max of B (p1p, n1p)
    movdqa      xmm0, [eax+ecx]
    movdqa      xmm2, [ebx+ecx]
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm2
    pminub      xmm0, [edx+ecx]
    pmaxub      xmm1, [edx+ecx]
    pminub      xmm2, [edi+ecx]
    pmaxub      xmm3, [edi+ecx]
    pminub      xmm0, [esi+ecx]
    pmaxub      xmm1, [esi+ecx]

    ; xmm4 = sat(range(B) - (thresh-1)), xmm5 = sat(range(A) - (thresh-1));
    ; zero means the range is within the threshold.
    movdqa      xmm4, xmm3
    movdqa      xmm5, xmm1
    psubusb     xmm4, xmm2
    psubusb     xmm5, xmm0
    psubusb     xmm4, xmm7
    psubusb     xmm5, xmm7

    ; Lower both minima by one (saturating), then check whether the
    ; other group's maximum stays at or below it.
    psubusb     xmm2, oword ptr onesByte
    psubusb     xmm0, oword ptr onesByte
    psubusb     xmm1, xmm2              ; 0 => max(A) <= sat(min(B) - 1)
    psubusb     xmm3, xmm0              ; 0 => max(B) <= sat(min(A) - 1)

    ; Convert "byte == 0" into 0FFh masks.
    pcmpeqb     xmm1, xmm6
    pcmpeqb     xmm3, xmm6
    pcmpeqb     xmm4, xmm6
    pcmpeqb     xmm5, xmm6

    mov         eax, dstp               ; eax borrowed for the store
    por         xmm1, xmm3
    pand        xmm4, xmm5
    pand        xmm1, xmm4
    movdqa      [eax+ecx], xmm1
    add         ecx, 16
    mov         eax, p2p                ; restore eax = p2p row
    cmp         ecx, width_
    jl          xloop                   ; signed compare, as in the inline version

    ; Next row: advance all six pointers by stride.  p2p and dstp live in
    ; their argument slots and are updated in memory through eax.
    mov         eax, stride
    add         ebx, stride
    add         p2p, eax
    add         edx, stride
    add         edi, stride
    add         dstp, eax
    add         esi, stride
    mov         eax, p2p
    dec         height
    jnz         yloop

    ret                                 ; MASM expands this to the PROC epilogue

checkOscillation5_SSE2 endp

; A MASM source file must be closed with END.
end

Any help or insight into the matter would be greatly appreciated.

0

There are 0 answers