I've been trying to port a few functions from inline assembly to standalone assembly files. I've had success with all but one; below is the inline version, followed by my current standalone version. As soon as the standalone version attempts to use onesByte, it fails to return the same results as the inline version. I'm asking here because I'm hoping someone knows how standalone asm files are handled differently from inline assembly by the compiler.
Inline version:
// Sixteen bytes of 0x01, declared 16-byte aligned so the constant can be used
// directly as a 128-bit memory operand (see `psubusb xmm, onesByte` below).
__declspec(align(16)) const __int64 onesByte[2] = { 0x0101010101010101, 0x0101010101010101 };
// Writes a per-byte mask (0xFF or 0x00) to dstp, processing 16 bytes per step with SSE2.
//
// For every byte position the code forms two groups of samples:
//   group A = { p2p, s1p, n2p }      group B = { p1p, n1p }
// and stores 0xFF when BOTH of the following hold, otherwise 0x00:
//   1. max - min of group A and max - min of group B are each <= (low byte of thresh - 1)
//   2. max(A) <= min(B) - 1   OR   max(B) <= min(A) - 1   (the "- 1" is an unsigned
//      saturating subtract, so a minimum of 0 stays 0)
//
// Parameters:
//   p2p, p1p, s1p, n1p, n2p  source row pointers, read 16 bytes at a time
//   dstp                     destination row pointer, written 16 bytes at a time
//   stride                   byte offset added to all six pointers after each row
//   width                    bytes per row; the inner loop steps by 16 while ecx < width (signed)
//   height                   row count; decremented in place until it reaches zero
//   thresh                   threshold; decremented in place, then its low byte is broadcast
//
// Notes:
//   - Every load/store uses movdqa or an SSE memory operand, which require 16-byte
//     aligned addresses.
//   - Both loops test their condition at the bottom, so one 16-byte column and one row
//     are always processed.
//   - The by-value arguments p2p, dstp, height and thresh are modified in place.
void TComb::checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p, const uint8_t *s1p,
const uint8_t *n1p, const uint8_t *n2p, uint8_t *dstp, int stride, int width, int height, int thresh)
{
__asm
{
// Row base pointers: eax = p2p, ebx = p1p, edx = s1p, edi = n1p, esi = n2p.
// eax is also borrowed for dstp and stride later, so p2p/dstp live in their argument slots.
mov eax, p2p
mov ebx, p1p
mov edx, s1p
mov edi, n1p
mov esi, n2p
// xmm6 = all zero, used as the comparison operand for pcmpeqb.
pxor xmm6, xmm6
// xmm7 = low byte of (thresh - 1) replicated into all 16 byte lanes.
dec thresh
movd xmm7, thresh
punpcklbw xmm7, xmm7
punpcklwd xmm7, xmm7
punpckldq xmm7, xmm7
punpcklqdq xmm7, xmm7
yloop :
// ecx = byte offset within the current row.
xor ecx, ecx
align 16
xloop :
// xmm0/xmm1 start as p2p, xmm2/xmm3 start as p1p.
movdqa xmm0, [eax + ecx]
movdqa xmm2, [ebx + ecx]
movdqa xmm1, xmm0
movdqa xmm3, xmm2
// xmm0 = min(p2p, s1p, n2p), xmm1 = max(p2p, s1p, n2p)
// xmm2 = min(p1p, n1p),      xmm3 = max(p1p, n1p)
pminub xmm0, [edx + ecx]
pmaxub xmm1, [edx + ecx]
pminub xmm2, [edi + ecx]
pmaxub xmm3, [edi + ecx]
pminub xmm0, [esi + ecx]
pmaxub xmm1, [esi + ecx]
// xmm4 = (max - min of group B) -sat (thresh - 1); zero when the spread is within the threshold.
// xmm5 = same for group A.
movdqa xmm4, xmm3
movdqa xmm5, xmm1
psubusb xmm4, xmm2
psubusb xmm5, xmm0
psubusb xmm4, xmm7
psubusb xmm5, xmm7
// Lower both minima by one (saturating at 0) before the cross-group comparison.
psubusb xmm2, onesByte
psubusb xmm0, onesByte
// xmm1 = max(A) -sat (min(B) - 1), xmm3 = max(B) -sat (min(A) - 1).
psubusb xmm1, xmm2
psubusb xmm3, xmm0
// Turn each "difference saturated to zero" into a 0xFF lane, anything else into 0x00.
pcmpeqb xmm1, xmm6
pcmpeqb xmm3, xmm6
pcmpeqb xmm4, xmm6
pcmpeqb xmm5, xmm6
// eax temporarily becomes the destination pointer for the store.
mov eax, dstp
// result = (A below B OR B below A) AND (both spreads within threshold)
por xmm1, xmm3
pand xmm4, xmm5
pand xmm1, xmm4
movdqa[eax + ecx], xmm1
add ecx, 16
// Restore eax = p2p for the next 16-byte column.
mov eax, p2p
cmp ecx, width
jl xloop
// Advance all six row pointers by stride; p2p and dstp are updated in memory via eax = stride.
mov eax, stride
add ebx, stride
add p2p, eax
add edx, stride
add edi, stride
add dstp, eax
add esi, stride
// Reload eax with the advanced p2p, then loop until height reaches zero.
mov eax, p2p
dec height
jnz yloop
}
}
Standalone version:
;-----------------------------------------------------------------------
; Standalone MASM (ml.exe, 32-bit, Intel syntax) port of the inline-asm
; routine checkOscillation5_SSE2.
;
; C-equivalent signature (cdecl, from ".model flat,c"):
;   void checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p,
;                               const uint8_t *s1p, const uint8_t *n1p,
;                               const uint8_t *n2p, uint8_t *dstp,
;                               int stride, int width, int height, int thresh);
;
; For every byte position, with group A = {p2p, s1p, n2p} and
; group B = {p1p, n1p}, stores 0FFh to dstp when
;   (max - min of A) and (max - min of B) are both <= low byte of (thresh - 1)
;   AND (max(A) <= min(B) -sat 1  OR  max(B) <= min(A) -sat 1),
; otherwise stores 00h.
;
; In:    stack arguments as listed on the PROC line
; Out:   mask bytes written to dstp; no return value is set
; Saved: ebx, esi, edi (via USES)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
; Notes: all loads/stores use movdqa or SSE memory operands, which need
;        16-byte aligned addresses. Both loops test at the bottom, so one
;        column and one row are always processed. The argument slots
;        p2p, dstp, height and thresh are modified in place.
;-----------------------------------------------------------------------
        .xmm
        .model  flat,c

        .data
        align   16
; Sixteen bytes of 01h, used as a 128-bit psubusb memory operand.
; NOTE(review): the operand must be 16-byte aligned at run time; confirm
; the data segment's alignment honours the ALIGN 16 above.
onesByte qword  2 dup(0101010101010101h)

; Switch to the code segment. Without this directive the procedure is
; assembled into .data directly behind onesByte, i.e. the instructions
; (and the padding emitted by ALIGN 16 in front of xloop) end up in a
; data segment instead of executable, NOP-padded code.
        .code

        public  checkOscillation5_SSE2

checkOscillation5_SSE2 proc uses ebx esi edi p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword
        ; Row base pointers. eax is reused for dstp and stride below, so
        ; p2p and dstp are advanced in their argument slots.
        mov     eax,p2p
        mov     ebx,p1p
        mov     edx,s1p
        mov     edi,n1p
        mov     esi,n2p

        pxor    xmm6,xmm6               ; xmm6 = 0, compare operand for pcmpeqb

        ; xmm7 = low byte of (thresh - 1) in all 16 byte lanes
        dec     thresh
        movd    xmm7,thresh
        punpcklbw xmm7,xmm7
        punpcklwd xmm7,xmm7
        punpckldq xmm7,xmm7
        punpcklqdq xmm7,xmm7

yloop:
        xor     ecx,ecx                 ; ecx = byte offset within the row
        align   16
xloop:
        movdqa  xmm0,[eax+ecx]          ; p2p
        movdqa  xmm2,[ebx+ecx]          ; p1p
        movdqa  xmm1,xmm0
        movdqa  xmm3,xmm2
        pminub  xmm0,[edx+ecx]          ; xmm0 = min(p2p, s1p, ...)
        pmaxub  xmm1,[edx+ecx]          ; xmm1 = max(p2p, s1p, ...)
        pminub  xmm2,[edi+ecx]          ; xmm2 = min(p1p, n1p)
        pmaxub  xmm3,[edi+ecx]          ; xmm3 = max(p1p, n1p)
        pminub  xmm0,[esi+ecx]          ; ... including n2p
        pmaxub  xmm1,[esi+ecx]

        ; Spread of each group minus (thresh - 1); zero means "within threshold".
        movdqa  xmm4,xmm3
        movdqa  xmm5,xmm1
        psubusb xmm4,xmm2
        psubusb xmm5,xmm0
        psubusb xmm4,xmm7
        psubusb xmm5,xmm7

        ; Lower both minima by one (saturating), then test each group's
        ; maximum against the other group's lowered minimum.
        psubusb xmm2,oword ptr onesByte
        psubusb xmm0,oword ptr onesByte
        psubusb xmm1,xmm2
        psubusb xmm3,xmm0

        ; Zero difference -> 0FFh lane, anything else -> 00h.
        pcmpeqb xmm1,xmm6
        pcmpeqb xmm3,xmm6
        pcmpeqb xmm4,xmm6
        pcmpeqb xmm5,xmm6

        mov     eax,dstp                ; eax borrowed as destination pointer
        por     xmm1,xmm3               ; either group entirely below the other
        pand    xmm4,xmm5               ; both spreads within threshold
        pand    xmm1,xmm4
        movdqa  [eax+ecx],xmm1

        add     ecx,16
        mov     eax,p2p                 ; restore eax = p2p for the next column
        cmp     ecx,width_
        jl      xloop

        ; Advance all six row pointers by stride.
        mov     eax,stride
        add     ebx,stride
        add     p2p,eax
        add     edx,stride
        add     edi,stride
        add     dstp,eax
        add     esi,stride
        mov     eax,p2p                 ; eax = advanced p2p
        dec     height
        jnz     yloop

        ret
checkOscillation5_SSE2 endp
Any help or insight into the matter would be greatly appreciated.