How to encode Planar 4:2:0 (fourcc P010)

740 views Asked by At

I'm trying to convert fourcc V210 (a packed YUV 4:2:2 format) into P010 (planar YUV 4:2:0). I think I've implemented it according to the spec, but the renderer shows a wrong image, so something is off. ffmpeg has a decent example of decoding V210 (the defines below are modified from their solution), but I can't find a P010 encoder to compare against and see what I'm doing wrong.

(Yes, I've tried ffmpeg and that works but it's too slow for this, it takes ~30ms per frame on an Intel Gen11 i7)

Clarification (after @Frank's question): The frames being processed are 4K (3840 px wide), so each V210 line is already a multiple of 128 bytes (3840 × 8 / 3 = 10240 bytes), and hence there is no code for doing the 128-byte line alignment.

This is running on Intel, so little-endian conversions are applied.

Try1 - all green image:

The following code

// Unpacks one 32-bit V210 word into three 10-bit components.
//
// The macro is not self-contained: it loads the word through a pointer named
// `src` into a variable named `val`, then advances `src` by one word. Both
// names must exist in the scope where the macro is expanded.
//
//   a <- bits  0..9
//   b <- bits 10..19
//   c <- bits 20..29
#define V210_READ_PACK_BLOCK(a, b, c)   \
    do {                                \
        val = *src;                     \
        ++src;                          \
        a = (val >> 0) & 0x3FF;         \
        b = (val >> 10) & 0x3FF;        \
        c = (val >> 20) & 0x3FF;        \
    } while (0)

// Number of luma samples produced per pack: the conversion loop reads four
// 32-bit words per pack and writes six Y values from them.
#define PIXELS_PER_PACK 6
// Size of one pack in bytes (four 4-byte words). Not referenced by the code
// shown in this snippet.
#define BYTES_PER_PACK (4*4)

// First attempt from the post: unpacks a V210 frame (inFrame) and writes a
// Y plane followed by an interleaved UV plane into outBuffer.
//
// inFrame   - packed V210 input, read as consecutive 32-bit words with no
//             per-line padding (the post states the frames are 3840 px wide).
// outBuffer - destination; must hold at least GetOutFrameSize() bytes.
//
// Only the cast of inFrame was repaired here (it had a stray ')' and did not
// compile). The two defects the post goes on to discuss are intentionally
// left in place so the code still matches the described symptoms:
//   1. Samples are stored in the low 10 bits of each 16-bit word; the post
//      reports an all-green image until they are shifted into the upper bits.
//   2. Chroma is kept on every other *pack* (pack % 2) rather than on every
//      other *line* (line % 2).
void MyClass::FormatVideoFrame(
    BYTE* inFrame,
    BYTE* outBuffer)
{
    const uint32_t pixels = m_height * m_width;

    // Input is consumed one 32-bit word at a time by V210_READ_PACK_BLOCK.
    const uint32_t* src = (const uint32_t *)inFrame;

    // Y plane starts at the beginning of the output buffer.
    uint16_t* dstY = (uint16_t *)outBuffer;

    // UV plane follows immediately after one 16-bit Y sample per pixel.
    uint16_t* dstUVStart = (uint16_t*)(outBuffer + ((ptrdiff_t)pixels * sizeof(uint16_t)));
    uint16_t* dstUV = dstUVStart;

    const uint32_t packsPerLine = m_width / PIXELS_PER_PACK;

    for (uint32_t line = 0; line < m_height; line++)
    {
        for (uint32_t pack = 0; pack < packsPerLine; pack++)
        {
            // `val` is the scratch word required by V210_READ_PACK_BLOCK.
            uint32_t val;
            uint16_t u, y1, y2, v;

            // Known defect, kept on purpose (see header): this selects by
            // pack index, not by line index.
            if (pack % 2 == 0)
            {
                // Word 0: U, Y, V
                V210_READ_PACK_BLOCK(u, y1, v);
                *dstUV++ = u;
                *dstY++ = y1;
                *dstUV++ = v;

                // Word 1: Y, U, Y
                V210_READ_PACK_BLOCK(y1, u, y2);
                *dstY++ = y1;
                *dstUV++ = u;
                *dstY++ = y2;

                // Word 2: V, Y, U
                V210_READ_PACK_BLOCK(v, y1, u);
                *dstUV++ = v;
                *dstY++ = y1;
                *dstUV++ = u;

                // Word 3: Y, V, Y
                V210_READ_PACK_BLOCK(y1, v, y2);
                *dstY++ = y1;
                *dstUV++ = v;
                *dstY++ = y2;
            }
            else
            {
                // Same four words, but only the luma samples are kept.
                V210_READ_PACK_BLOCK(u, y1, v);
                *dstY++ = y1;

                V210_READ_PACK_BLOCK(y1, u, y2);
                *dstY++ = y1;
                *dstY++ = y2;

                V210_READ_PACK_BLOCK(v, y1, u);
                *dstY++ = y1;

                V210_READ_PACK_BLOCK(y1, v, y2);
                *dstY++ = y1;
                *dstY++ = y2;
            }
        }
    }

#ifdef _DEBUG

    // The Y write cursor must have reached the start of the UV plane.
    assert(dstY == dstUVStart);

    // The UV write cursor must have reached the end of the output frame.
    const BYTE* expectedCurrentUVPtr = outBuffer + (ptrdiff_t)GetOutFrameSize();
    assert(expectedCurrentUVPtr == (BYTE *)dstUV);

#endif
}

// Reports how many bytes the caller has to allocate for outBuffer:
// a full-resolution 16-bit Y plane followed by the interleaved UV plane.
LONG MyClass::GetOutFrameSize() const
{
    const LONG pixelCount = m_height * m_width;

    // One 16-bit luma sample for every pixel.
    const auto lumaBytes = pixelCount * sizeof(uint16_t);

    // One pair of 16-bit chroma samples (U and V) for every second pixel
    // on every second row.
    const auto chromaBytes = pixelCount / 2 / 2 * (2 * sizeof(uint16_t));

    return lumaBytes + chromaBytes;
}

This leads to an all-green image. The cause turned out to be a missing bit shift: the 10 bits must be placed in the upper bits of the 16-bit value, as per the P010 spec.

Try 2 - Y works, UV doubled?

Updated the code to properly (or so I think) shift the YUV values to the correct position in their 16-bit space.

// Unpacks one 32-bit V210 word into three 10-bit components.
//
// The macro is not self-contained: it loads the word through a pointer named
// `src` into a variable named `val`, then advances `src` by one word. Both
// names must exist in the scope where the macro is expanded.
//
//   a <- bits  0..9
//   b <- bits 10..19
//   c <- bits 20..29
#define V210_READ_PACK_BLOCK(a, b, c)   \
    do {                                \
        val = *src;                     \
        ++src;                          \
        a = (val >> 0) & 0x3FF;         \
        b = (val >> 10) & 0x3FF;        \
        c = (val >> 20) & 0x3FF;        \
    } while (0)


// Writes one 10-bit sample `v` to the 16-bit slot that `d` points at, shifted
// left by 6 so the sample occupies the upper 10 bits, then advances `d`.
// Both arguments are parenthesized so that an expression argument such as
// `a | b` is shifted as a whole instead of being split by operator precedence.
#define P010_WRITE_VALUE(d, v) (*(d)++ = (uint16_t)((v) << 6))

// Number of luma samples produced per pack: the conversion loop reads four
// 32-bit words per pack and writes six Y values from them.
#define PIXELS_PER_PACK 6
// Size of one pack in bytes (four 32-bit words). Not referenced by the code
// shown in this snippet.
#define BYTES_PER_PACK (4 * sizeof(uint32_t))

// Snipped constructor here which guarantees that we're processing
// something which does not violate alignment.

// Second attempt from the post: unpacks a V210 frame (inBuffer) and writes a
// Y plane followed by an interleaved UV plane into outBuffer, with every
// sample shifted into the upper 10 bits of its 16-bit word.
//
// inBuffer  - packed V210 input; each line is read starting at line * stride.
// outBuffer - destination; the debug asserts below expect exactly
//             GetOutFrameSize() bytes to have been written.
//
// NOTE(review): per the fix described in the post, the chroma selection
// below should test `line % 2`, not `pack % 2`. The code is left as posted.
void MyClass::FormatVideoFrame(
    const BYTE* inBuffer,
    BYTE* outBuffer)
{   
    const uint32_t pixels = m_height * m_width;
    // Width rounded up to a multiple of 48 pixels; at 8/3 bytes per pixel
    // that makes the line stride a multiple of 128 bytes.
    const uint32_t aligned_width = ((m_width + 47) / 48) * 48;
    const uint32_t stride = aligned_width * 8 / 3;

    // Y plane starts at the beginning of the output buffer.
    uint16_t* dstY = (uint16_t *)outBuffer;

    // UV plane follows immediately after one 16-bit Y sample per pixel.
    uint16_t* dstUVStart = (uint16_t*)(outBuffer + ((ptrdiff_t)pixels * sizeof(uint16_t)));
    uint16_t* dstUV = dstUVStart;

    const uint32_t packsPerLine = m_width / PIXELS_PER_PACK;

    for (uint32_t line = 0; line < m_height; line++)
    {
        // Lines start at 128 byte alignment
        const uint32_t* src = (const uint32_t*)(inBuffer + (ptrdiff_t)(line * stride));

        for (uint32_t pack = 0; pack < packsPerLine; pack++)
        {
            // `val` is the scratch word required by V210_READ_PACK_BLOCK.
            uint32_t val;
            uint16_t u, y1, y2, v;

            // NOTE(review): selects by pack index; the post's fix is to
            // select by line index instead (line % 2 == 0).
            if (pack % 2 == 0)
            {
                // Word 0: U, Y, V
                V210_READ_PACK_BLOCK(u, y1, v);
                P010_WRITE_VALUE(dstUV, u);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstUV, v);

                // Word 1: Y, U, Y
                V210_READ_PACK_BLOCK(y1, u, y2);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstUV, u);
                P010_WRITE_VALUE(dstY, y2);

                // Word 2: V, Y, U
                V210_READ_PACK_BLOCK(v, y1, u);
                P010_WRITE_VALUE(dstUV, v);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstUV, u);

                // Word 3: Y, V, Y
                V210_READ_PACK_BLOCK(y1, v, y2);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstUV, v);
                P010_WRITE_VALUE(dstY, y2);
            }
            else
            {
                // Same four words, but only the luma samples are kept.
                V210_READ_PACK_BLOCK(u, y1, v);
                P010_WRITE_VALUE(dstY, y1);

                V210_READ_PACK_BLOCK(y1, u, y2);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstY, y2);

                V210_READ_PACK_BLOCK(v, y1, u);
                P010_WRITE_VALUE(dstY, y1);

                V210_READ_PACK_BLOCK(y1, v, y2);
                P010_WRITE_VALUE(dstY, y1);
                P010_WRITE_VALUE(dstY, y2);
            }
        }
    }

#ifdef _DEBUG

    // Fully written Y space: the Y cursor must have reached the UV plane start.
    assert(dstY == dstUVStart);

    // Fully written UV space: the UV cursor must have reached the frame end.
    const BYTE* expectedVurrentUVPtr = outBuffer + (ptrdiff_t)GetOutFrameSize();
    assert(expectedVurrentUVPtr == (BYTE *)dstUV);

#endif
}

This leads to the Y being correct, and the number of lines for U and V as well, but somehow U and V are not overlaid properly. There seem to be two versions of the chroma image, mirrored through the vertical center line. Something similar, but less visible, happens when zeroing out V. So are both of these getting rendered at half the width? Any tips appreciated :)

Fix: Found the bug. I was toggling the UV writes per pack (every other 6-pixel pack within a line) instead of per line (every other line), so

if (pack % 2 == 0)

Should be

if (line % 2 == 0)
1

There are 1 answers

0
DennisFleurbaaij On

There were 2 bugs. The first was that I wasn't shifting the 10-bit value into the upper bits of the 16-bit word, as required by the spec. The second was that I was writing UV on every other pack instead of on every other line.

Leaving it here for the disco effect value and maybe somebody else needs to play with this and travels the same path. I learned that "just follow the spec" works even in totally unknown territory :) Thanks everyone who looked at it.