I'm working on YUY2 (4:2:2) to ARGB conversion with scaling for 1920x1080i@25 video buffers.
I already have code that works fine for UYVY (4:2:2) to ARGB conversion with scaling.
I'm new to intrinsics and don't know much about them.
/*
 * YUV -> RGB conversion coefficients (floating point). The values match the
 * usual ITU-R BT.601 video-range constants:
 *   R = CYRGB*(Y-16) + CVR*(V-128)
 *   G = CYRGB*(Y-16) - CVG*(V-128) - CUG*(U-128)
 *   B = CYRGB*(Y-16) + CUB*(U-128)
 */
#define CYRGB (1.164)
#define CVR (1.596)
#define CVG (0.813)
#define CUG (0.392)
#define CUB (2.017)
/* Offsets subtracted from the chroma and luma samples before scaling. */
#define CCHROMAOFFSET (128)
#define CLUMAOFFSET (16)
/* Number of fractional bits in the 16-bit fixed-point coefficients. */
#define MMX_SCALEBITS_TO_RGB (12)
/*
 * Convert a floating-point coefficient to signed 16-bit fixed point with
 * MMX_SCALEBITS_TO_RGB fractional bits, rounding to nearest.
 * The rounding offset follows the sign of COEF: a fixed +0.5 followed by the
 * truncating cast would round negative coefficients toward zero instead
 * (e.g. -0.813 -> -3329 rather than -3330).
 * COEF is evaluated twice; pass only side-effect-free constant expressions.
 */
#define MMX_F16_RGB(COEF) ((short)((COEF) * (double)(1L << MMX_SCALEBITS_TO_RGB) + ((COEF) < 0 ? -0.5 : 0.5)))
/*
 * Build an __m64 holding four fixed-point coefficients.
 * Argument order follows _mm_set_pi16: f1 is the highest 16-bit lane, f4 the lowest.
 */
#define MMX_64_RGB(f1,f2,f3,f4) _mm_set_pi16(MMX_F16_RGB(f1),MMX_F16_RGB(f2),MMX_F16_RGB(f3),MMX_F16_RGB(f4))
====================
void ScaleYUV4x2( void *_pSrc, ptrdiff_t _srcPitch, void *_pDst, ptrdiff_t _dstPitch )
{
int width32 = (m_prevWidth * 4) >> 4;
__m64 sumLine[(1920)/sizeof(__m64)];
__m64 *pInLine64 = (__m64*)_pSrc;
__m64 *pWorkBuf64 = (__m64*)&sumLine[0];
__m64 _yuv_r = MMX_64_RGB(CYRGB, CVR, CYRGB, CVR);
__m64 _yuv_b = MMX_64_RGB(CYRGB, CUB, CYRGB, CUB);
__m64 _yuv_g0 = MMX_64_RGB(0 ,-CVG,CYRGB ,-CUG);
__m64 _yuv_g1 = MMX_64_RGB(CYRGB,-CVG, 0 ,-CUG);
__m64 alpha_bits = _mm_set1_pi32(0xFF000000UL);
__m64 zero = _mm_setzero_si64();
__m64 _yuv_rgb_sub = _mm_set_pi16(CLUMAOFFSET,CCHROMAOFFSET,CLUMAOFFSET,CCHROMAOFFSET);
for( DWORD y = 0; y < m_prevHeight*2; ++y )
{
if((y & 1)==0)
{
// even line scaling
for(int i=0;i<width32;++i)
{
__m64 v0 = pInLine64[4*i+0];
__m64 v1 = pInLine64[4*i+1];
__m64 v2 = pInLine64[4*i+2];
__m64 v3 = pInLine64[4*i+3];
__m64 vll = _mm_unpacklo_pi32(v0,v1);
__m64 vhh = _mm_unpackhi_pi32(v0,v1);
__m64 vs1 = _mm_avg_pu8(vll,vhh);
__m64 vs2;
__m64 vsm = _mm_shuffle_pi16(vs1,(0<<0)|(2<<2)|(1<<4)|(3<<6));
vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
vs1 = _mm_and_si64(vs1,_mm_set_pi32(0x000000FF,0x00FF0000));
vs1 = _mm_or_si64(vs1,vsm);
vll = _mm_unpacklo_pi32(v2,v3);
vhh = _mm_unpackhi_pi32(v2,v3);
vs2 = _mm_avg_pu8(vll,vhh);
vsm = _mm_shuffle_pi16(vs2,(0<<0)|(2<<2)|(1<<4)|(3<<6));
vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
vs2 = _mm_and_si64(vs2,_mm_set_pi32(0x000000FF,0x00FF0000));
vs2 = _mm_or_si64(vs2,vsm);
vll = _mm_unpacklo_pi32(vs1,vs2);
vhh = _mm_unpackhi_pi32(vs1,vs2);
pWorkBuf64[i] = _mm_avg_pu8(vll,vhh);
}
}
else
{
__m64 *pPreviewOutput = (__m64*)((BYTE*)_pDst + (y>>1)*_dstPitch );
for(int i=0;i<width32;++i)
{
__m64 v0 = pInLine64[4*i+0];
__m64 v1 = pInLine64[4*i+1];
__m64 v2 = pInLine64[4*i+2];
__m64 v3 = pInLine64[4*i+3];
__m64 vll = _mm_unpacklo_pi32(v0,v1);
__m64 vhh = _mm_unpackhi_pi32(v0,v1);
__m64 vs1 = _mm_avg_pu8(vll,vhh);
__m64 vs2;
__m64 vsm = _mm_shuffle_pi16(vs1,(0<<0)|(2<<2)|(1<<4)|(3<<6));
vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
vs1 = _mm_and_si64(vs1,_mm_set_pi32(0x000000FF,0x00FF0000));
vs1 = _mm_or_si64(vs1,vsm);
vll = _mm_unpacklo_pi32(v2,v3);
vhh = _mm_unpackhi_pi32(v2,v3);
vs2 = _mm_avg_pu8(vll,vhh);
vsm = _mm_shuffle_pi16(vs2,(0<<0)|(2<<2)|(1<<4)|(3<<6));
vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
vs2 = _mm_and_si64(vs2,_mm_set_pi32(0x000000FF,0x00FF0000));
vs2 = _mm_or_si64(vs2,vsm);
vll = _mm_unpacklo_pi32(vs1,vs2);
vhh = _mm_unpackhi_pi32(vs1,vs2);
v0 = _mm_avg_pu8(vll,vhh);
__m64 v = _mm_avg_pu8(v0,pWorkBuf64[i]);
__m64 vl = _mm_subs_pi16(_mm_unpacklo_pi8(v,zero),_yuv_rgb_sub);
__m64 vh = _mm_subs_pi16(_mm_unpackhi_pi8(v,zero),_yuv_rgb_sub);
__m64 rl = _mm_madd_pi16(_mm_shuffle_pi16(vl,(2<<0)|(1<<2)|(2<<4)|(3<<6)),_yuv_r);
__m64 rh = _mm_madd_pi16(_mm_shuffle_pi16(vh,(2<<0)|(1<<2)|(2<<4)|(3<<6)),_yuv_r);
__m64 bl = _mm_madd_pi16(_mm_shuffle_pi16(vl,(0<<0)|(1<<2)|(0<<4)|(3<<6)),_yuv_b);
__m64 bh = _mm_madd_pi16(_mm_shuffle_pi16(vh,(0<<0)|(1<<2)|(0<<4)|(3<<6)),_yuv_b);
__m64 g0l = _mm_madd_pi16(vl,_yuv_g0);
__m64 g1l = _mm_madd_pi16(vl,_yuv_g1);
__m64 g0h = _mm_madd_pi16(vh,_yuv_g0);
__m64 g1h = _mm_madd_pi16(vh,_yuv_g1);
__m64 gl = _mm_add_pi32(_mm_unpacklo_pi32(g0l,g1l),_mm_unpackhi_pi32(g0l,g1l));
__m64 gh = _mm_add_pi32(_mm_unpacklo_pi32(g0h,g1h),_mm_unpackhi_pi32(g0h,g1h));
__m64 r = _mm_packs_pu16(_mm_srai_pi32(rl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(rh,MMX_SCALEBITS_TO_RGB));
__m64 g = _mm_packs_pu16(_mm_srai_pi32(gl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(gh,MMX_SCALEBITS_TO_RGB));
__m64 b = _mm_packs_pu16(_mm_srai_pi32(bl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(bh,MMX_SCALEBITS_TO_RGB));
__m64 rl16 = _mm_unpacklo_pi16(zero,r);
__m64 gl8 = _mm_slli_pi32(_mm_unpacklo_pi16(g,zero),8);
__m64 bl0 = _mm_unpacklo_pi16(b,zero);
__m64 rh16 = _mm_unpackhi_pi16(zero,r);
__m64 gh8 = _mm_slli_pi32(_mm_unpackhi_pi16(g,zero),8);
__m64 bh0 = _mm_unpackhi_pi16(b,zero);
pPreviewOutput[2*i] = _mm_or_si64(_mm_or_si64(rl16,gl8),_mm_or_si64(bl0,alpha_bits));
pPreviewOutput[2*i+1] = _mm_or_si64(_mm_or_si64(rh16,gh8),_mm_or_si64(bh0,alpha_bits));
}
}
pInLine64 += _srcPitch/sizeof(__m64);
}
_mm_empty();
}
AFAIK, the difference between UYVY and YUY2 is just the byte ordering, but I don't know how to handle it using Intel intrinsics.
Any help is greatly appreciated!
Thanks