YUYV 4:2:2 to ARGB conversion using intel intrinsics SSE/MMX

215 views Asked by At

I'm working on a YUY2 (4:2:2) to ARGB conversion with scaling for 1920x1080i@25 video buffers. I already have code (shown below) that works fine for UYVY (4:2:2) to ARGB conversion with scaling.

I'm new to intrinsics and don't know much about them.

/*
 * YUV -> RGB conversion coefficients, as used by ScaleYUV4x2 below.
 * NOTE(review): the values look like the usual BT.601 video-range
 * coefficients -- confirm against the intended colour standard.
 */
#define CYRGB    (1.164)    /* gain applied to (Y - CLUMAOFFSET) for R, G and B */
#define CVR      (1.596)    /* contribution of (V - CCHROMAOFFSET) to red       */
#define CVG      (0.813)    /* contribution of (V - CCHROMAOFFSET) to green (used negated) */
#define CUG      (0.392)    /* contribution of (U - CCHROMAOFFSET) to green (used negated) */
#define CUB      (2.017)    /* contribution of (U - CCHROMAOFFSET) to blue      */

/* Offsets subtracted from the chroma and luma samples before the multiply. */
#define CCHROMAOFFSET   (128)
#define CLUMAOFFSET     (16)

/* Number of fractional bits in the 16-bit fixed-point coefficients. */
#define MMX_SCALEBITS_TO_RGB        (12)

/*
 * Converts a floating-point coefficient to signed 16-bit fixed point with
 * MMX_SCALEBITS_TO_RGB fractional bits, rounding to nearest.
 *
 * The (short) cast truncates toward zero, so the rounding term must carry the
 * sign of the coefficient: adding +0.5 to a negative value would round it
 * toward zero instead of to the nearest integer (e.g. -CVG would become -3329
 * instead of -3330). COEF is evaluated twice, so pass only side-effect-free
 * constant expressions.
 */
#define MMX_F16_RGB(COEF)   ((short)((COEF) * (double)(1L << MMX_SCALEBITS_TO_RGB) + ((COEF) < 0 ? -0.5 : 0.5)))

/*
 * Builds an __m64 holding four fixed-point coefficients. As with
 * _mm_set_pi16, f1 lands in the most significant 16-bit word and f4 in the
 * least significant one.
 */
#define MMX_64_RGB(f1,f2,f3,f4) _mm_set_pi16(MMX_F16_RGB(f1),MMX_F16_RGB(f2),MMX_F16_RGB(f3),MMX_F16_RGB(f4))

====================

void ScaleYUV4x2( void *_pSrc, ptrdiff_t _srcPitch, void *_pDst, ptrdiff_t _dstPitch )
{
    int width32 = (m_prevWidth * 4) >> 4;

    __m64 sumLine[(1920)/sizeof(__m64)];

    __m64 *pInLine64 = (__m64*)_pSrc;
    __m64 *pWorkBuf64 = (__m64*)&sumLine[0];

    __m64 _yuv_r = MMX_64_RGB(CYRGB, CVR, CYRGB, CVR);
    __m64 _yuv_b = MMX_64_RGB(CYRGB, CUB, CYRGB, CUB);
    __m64 _yuv_g0 = MMX_64_RGB(0    ,-CVG,CYRGB ,-CUG);
    __m64 _yuv_g1 = MMX_64_RGB(CYRGB,-CVG,  0   ,-CUG);

    __m64 alpha_bits = _mm_set1_pi32(0xFF000000UL);

    __m64 zero = _mm_setzero_si64();
    __m64 _yuv_rgb_sub = _mm_set_pi16(CLUMAOFFSET,CCHROMAOFFSET,CLUMAOFFSET,CCHROMAOFFSET);

    for( DWORD y = 0; y < m_prevHeight*2; ++y )
    {
        if((y & 1)==0)
        {
            // even line scaling
            for(int i=0;i<width32;++i)
            {
                __m64 v0 = pInLine64[4*i+0];
                __m64 v1 = pInLine64[4*i+1];
                __m64 v2 = pInLine64[4*i+2];
                __m64 v3 = pInLine64[4*i+3];

                __m64 vll = _mm_unpacklo_pi32(v0,v1);
                __m64 vhh = _mm_unpackhi_pi32(v0,v1);
                __m64 vs1 = _mm_avg_pu8(vll,vhh);
                __m64 vs2;

                __m64 vsm = _mm_shuffle_pi16(vs1,(0<<0)|(2<<2)|(1<<4)|(3<<6));
                vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
                vs1 = _mm_and_si64(vs1,_mm_set_pi32(0x000000FF,0x00FF0000));
                vs1 = _mm_or_si64(vs1,vsm);

                vll = _mm_unpacklo_pi32(v2,v3);
                vhh = _mm_unpackhi_pi32(v2,v3);
                vs2 = _mm_avg_pu8(vll,vhh);

                vsm = _mm_shuffle_pi16(vs2,(0<<0)|(2<<2)|(1<<4)|(3<<6));
                vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
                vs2 = _mm_and_si64(vs2,_mm_set_pi32(0x000000FF,0x00FF0000));
                vs2 = _mm_or_si64(vs2,vsm);

                vll = _mm_unpacklo_pi32(vs1,vs2);
                vhh = _mm_unpackhi_pi32(vs1,vs2);

                pWorkBuf64[i] = _mm_avg_pu8(vll,vhh);
            }
        }
        else
        {
            __m64 *pPreviewOutput = (__m64*)((BYTE*)_pDst + (y>>1)*_dstPitch );

            for(int i=0;i<width32;++i)
            {
                __m64 v0 = pInLine64[4*i+0];
                __m64 v1 = pInLine64[4*i+1];
                __m64 v2 = pInLine64[4*i+2];
                __m64 v3 = pInLine64[4*i+3];

                __m64 vll = _mm_unpacklo_pi32(v0,v1);
                __m64 vhh = _mm_unpackhi_pi32(v0,v1);
                __m64 vs1 = _mm_avg_pu8(vll,vhh);
                __m64 vs2;

                __m64 vsm = _mm_shuffle_pi16(vs1,(0<<0)|(2<<2)|(1<<4)|(3<<6));
                vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
                vs1 = _mm_and_si64(vs1,_mm_set_pi32(0x000000FF,0x00FF0000));
                vs1 = _mm_or_si64(vs1,vsm);

                vll = _mm_unpacklo_pi32(v2,v3);
                vhh = _mm_unpackhi_pi32(v2,v3);
                vs2 = _mm_avg_pu8(vll,vhh);

                vsm = _mm_shuffle_pi16(vs2,(0<<0)|(2<<2)|(1<<4)|(3<<6));
                vsm = _mm_and_si64(vsm,_mm_set_pi32(0xFFFFFF00,0xFF00FFFF));
                vs2 = _mm_and_si64(vs2,_mm_set_pi32(0x000000FF,0x00FF0000));
                vs2 = _mm_or_si64(vs2,vsm);

                vll = _mm_unpacklo_pi32(vs1,vs2);
                vhh = _mm_unpackhi_pi32(vs1,vs2);

                v0 = _mm_avg_pu8(vll,vhh);


                __m64 v = _mm_avg_pu8(v0,pWorkBuf64[i]);

                __m64 vl = _mm_subs_pi16(_mm_unpacklo_pi8(v,zero),_yuv_rgb_sub);
                __m64 vh = _mm_subs_pi16(_mm_unpackhi_pi8(v,zero),_yuv_rgb_sub);

                __m64 rl = _mm_madd_pi16(_mm_shuffle_pi16(vl,(2<<0)|(1<<2)|(2<<4)|(3<<6)),_yuv_r);
                __m64 rh = _mm_madd_pi16(_mm_shuffle_pi16(vh,(2<<0)|(1<<2)|(2<<4)|(3<<6)),_yuv_r);

                __m64 bl = _mm_madd_pi16(_mm_shuffle_pi16(vl,(0<<0)|(1<<2)|(0<<4)|(3<<6)),_yuv_b);
                __m64 bh = _mm_madd_pi16(_mm_shuffle_pi16(vh,(0<<0)|(1<<2)|(0<<4)|(3<<6)),_yuv_b);

                __m64 g0l = _mm_madd_pi16(vl,_yuv_g0);
                __m64 g1l = _mm_madd_pi16(vl,_yuv_g1);
                __m64 g0h = _mm_madd_pi16(vh,_yuv_g0);
                __m64 g1h = _mm_madd_pi16(vh,_yuv_g1);

                __m64 gl = _mm_add_pi32(_mm_unpacklo_pi32(g0l,g1l),_mm_unpackhi_pi32(g0l,g1l));
                __m64 gh = _mm_add_pi32(_mm_unpacklo_pi32(g0h,g1h),_mm_unpackhi_pi32(g0h,g1h));

                __m64 r = _mm_packs_pu16(_mm_srai_pi32(rl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(rh,MMX_SCALEBITS_TO_RGB));
                __m64 g = _mm_packs_pu16(_mm_srai_pi32(gl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(gh,MMX_SCALEBITS_TO_RGB));
                __m64 b = _mm_packs_pu16(_mm_srai_pi32(bl,MMX_SCALEBITS_TO_RGB),_mm_srai_pi32(bh,MMX_SCALEBITS_TO_RGB));

                __m64 rl16 = _mm_unpacklo_pi16(zero,r);
                __m64 gl8 = _mm_slli_pi32(_mm_unpacklo_pi16(g,zero),8);
                __m64 bl0 = _mm_unpacklo_pi16(b,zero);

                __m64 rh16 = _mm_unpackhi_pi16(zero,r);
                __m64 gh8 = _mm_slli_pi32(_mm_unpackhi_pi16(g,zero),8);
                __m64 bh0 = _mm_unpackhi_pi16(b,zero);

                pPreviewOutput[2*i] = _mm_or_si64(_mm_or_si64(rl16,gl8),_mm_or_si64(bl0,alpha_bits));
                pPreviewOutput[2*i+1] = _mm_or_si64(_mm_or_si64(rh16,gh8),_mm_or_si64(bh0,alpha_bits));
            }
        }
        pInLine64 += _srcPitch/sizeof(__m64);
    }
    _mm_empty();
}

AFAIK, the only difference between UYVY and YUY2 is the byte ordering within each pixel pair (U Y0 V Y1 versus Y0 U Y1 V), but I don't know how to handle that using Intel intrinsics.

Any help is greatly appreciated!
Thanks

0

There are 0 answers