Wednesday, November 25, 2009

x264 Perforated Loops

Modern video encoders (such as x264) compress raw video by finding and eliminating temporal and spatial redundancy in the input. Motion estimation targets temporal redundancy: it compares regions of pixels in the current frame with regions in previously compressed reference frames to find similar regions. When successful, motion estimation makes it possible to encode regions in the current frame efficiently by referencing the similar regions in the reference frames. The motion estimation computation consumes the vast majority of the x264 execution time, and all five of the final perforated loops are part of this computation.
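
Loop perforation itself is a simple source-to-source transformation: the perforated loop skips some of its iterations, trading result accuracy for execution time. The sketch below is a hypothetical illustration of the transformation (it is not taken from the x264 source); it perforates a summation loop with a perforation factor of 2, so the perforated version does roughly half the work and returns an approximate result.

// Hypothetical illustration of loop perforation (not from the x264 source)
int sum_original( const int *a, int n )
{
    int i;
    int sum = 0;
    for( i = 0; i < n; i++ )
        sum += a[i];
    return sum;
}

int sum_perforated( const int *a, int n )
{
    int i;
    int sum = 0;
    // Perforated loop: skip every other iteration
    for( i = 0; i < n; /*i++*/ i+=2 )
        sum += a[i];
    return sum;
}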

The functions x264_pixel_satd_wxh() and x264_pixel_sad_16x16() (see below) each compute a measure of the similarity of two regions of pixels. Perforating the loops in these functions yields a new similarity measure that is cheaper to compute but slightly less accurate.

// This function is found in common/pixel.c in the x264 source code
int x264_pixel_sad_16x16( uint8_t *pix1, int i_stride_pix1,
                          uint8_t *pix2, int i_stride_pix2 )
{
    int i_sum = 0;
    int x, y;

    for( y = 0; y < 16; y++ )
    {
        // The following loop is perforated,
        // effectively changing x++ to x+=2
        for( x = 0; x < 16; /*x++*/ x+=2 )
        {
            i_sum += abs( pix1[x] - pix2[x] );
        }
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }
    return i_sum;
}
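
To see the effect of the perforation in isolation, the perforated function can be called on a pair of synthetic blocks. The harness below is a hypothetical example, not part of x264; it assumes the function above is compiled into the same program and uses a stride of 16 because the synthetic blocks are stored contiguously.

// Hypothetical test harness (not part of the x264 source) for the
// perforated x264_pixel_sad_16x16() above
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int x264_pixel_sad_16x16( uint8_t *pix1, int i_stride_pix1,
                          uint8_t *pix2, int i_stride_pix2 );

int main( void )
{
    uint8_t block1[16*16], block2[16*16];
    int i;

    // Fill the two blocks with arbitrary pixel values
    for( i = 0; i < 16*16; i++ )
    {
        block1[i] = (uint8_t)( rand() & 0xff );
        block2[i] = (uint8_t)( rand() & 0xff );
    }

    // The perforated SAD sums only the even-numbered columns, so for
    // typical inputs it is roughly half of the exact 16x16 SAD
    printf( "perforated SAD = %d\n",
            x264_pixel_sad_16x16( block1, 16, block2, 16 ) );
    return 0;
}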


// This function is also defined in common/pixel.c in the x264 source code
int x264_pixel_satd_wxh( uint8_t *pix1, int i_pix1,
                         uint8_t *pix2, int i_pix2,
                         int i_width, int i_height )
{
    int16_t tmp[4][4];
    int16_t diff[4][4];
    int x, y;
    int i_satd = 0;

    // This loop is perforated, effectively
    // changing y+=4 to y+=8
    for( y = 0; y < i_height; /*y += 4*/ y+=8 )
    {
        // The following loop is perforated,
        // effectively changing x+=4 to x+=8
        for( x = 0; x < i_width; /*x += 4*/ x+=8 )
        {
            int d;

            pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );

            for( d = 0; d < 4; d++ )
            {
                int s01, s23;
                int d01, d23;

                s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
                d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];

                tmp[d][0] = s01 + s23;
                tmp[d][1] = s01 - s23;
                tmp[d][2] = d01 - d23;
                tmp[d][3] = d01 + d23;
            }
            // The following loop is perforated,
            // effectively changing d++ to d+=2
            for( d = 0; d < 4; /*d++*/ d+=2 )
            {
                int s01, s23;
                int d01, d23;

                s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
                d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];

                i_satd += abs( s01 + s23 ) + abs( s01 - s23 )
                        + abs( d01 - d23 ) + abs( d01 + d23 );
            }
        }
        pix1 += 4 * i_pix1;
        pix2 += 4 * i_pix2;
    }

    return i_satd / 2;
}
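
x264_pixel_satd_wxh() also relies on the helper pixel_sub_wxh(), which is not shown above. Judging from its call site, it writes the element-wise difference of two 4x4 pixel blocks into diff[][]; the following sketch is reconstructed from that call site rather than copied from common/pixel.c.

// Sketch of the pixel_sub_wxh() helper used above, reconstructed from its
// call site (not copied verbatim from the x264 source): it stores the
// element-wise difference of two i_size x i_size pixel blocks in diff
static void pixel_sub_wxh( int16_t *diff, int i_size,
                           uint8_t *pix1, int i_pix1,
                           uint8_t *pix2, int i_pix2 )
{
    int x, y;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}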


The function x264_me_search_ref() controls the motion estimation process for a particular region. Perforating the loop in this function causes x264 to test fewer potential matches in the reference frame. Testing fewer matches reduces the amount of time that x264 spends looking for a good match, but may reduce the similarity of the match that x264 does find.

// This code snippet is taken from the function x264_me_search_ref()
// The function is defined in the file encoder/me.c

me_hex2:
    /* hexagon search, radius 2 */
#if 0
    for( i = 0; i < i_me_range/2; i++ )
    {
        omx = bmx; omy = bmy;
        COST_MV( omx-2, omy );
        COST_MV( omx-1, omy+2 );
        COST_MV( omx+1, omy+2 );
        COST_MV( omx+2, omy );
        COST_MV( omx+1, omy-2 );
        COST_MV( omx-1, omy-2 );
        if( bmx == omx && bmy == omy )
            break;
        if( !CHECK_MVRANGE(bmx, bmy) )
            break;
    }
#else
    /* equivalent to the above, but eliminates duplicate candidates */
    dir = -2;

    /* hexagon */
    COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
    COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 );
    COPY2_IF_LT( bcost, costs[0], dir, 0 );
    COPY2_IF_LT( bcost, costs[1], dir, 1 );
    COPY2_IF_LT( bcost, costs[2], dir, 2 );
    COPY2_IF_LT( bcost, costs[3], dir, 3 );
    COPY2_IF_LT( bcost, costs[4], dir, 4 );
    COPY2_IF_LT( bcost, costs[5], dir, 5 );

    if( dir != -2 )
    {
        bmx += hex2[dir+1][0];
        bmy += hex2[dir+1][1];
        /* half hexagon, not overlapping the previous iteration */
        for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ )
        {
            const int odir = mod6m1[dir+1];
            COST_MV_X3_DIR( hex2[odir+0][0], hex2[odir+0][1],
                            hex2[odir+1][0], hex2[odir+1][1],
                            hex2[odir+2][0], hex2[odir+2][1],
                            costs );
            dir = -2;
            COPY2_IF_LT( bcost, costs[0], dir, odir-1 );
            COPY2_IF_LT( bcost, costs[1], dir, odir );
            COPY2_IF_LT( bcost, costs[2], dir, odir+1 );
            if( dir == -2 )
                break;
            bmx += hex2[dir+1][0];
            bmy += hex2[dir+1][1];
        }
    }
#endif
    /* square refine */
    omx = bmx; omy = bmy;
    COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );
    COST_MV_X4( -1,-1, -1,1, 1,-1, 1,1 );
    break;

case X264_ME_UMH:
    {
        /* Uneven-cross Multi-Hexagon-grid Search
         * as in JM, except with different early termination */

        static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };

        int ucost1, ucost2;
        int cross_start = 1;

        /* refine predictors */
        ucost1 = bcost;
        DIA1_ITER( pmx, pmy );
        if( pmx | pmy )
            DIA1_ITER( 0, 0 );

        if( i_pixel == PIXEL_4x4 )
            goto me_hex2;

        ucost2 = bcost;
        if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
            DIA1_ITER( bmx, bmy );
        if( bcost == ucost2 )
            cross_start = 3;
        omx = bmx; omy = bmy;

        /* early termination */
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
        if( bcost == ucost2 && SAD_THRESH(2000) )
        {
            COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 );
            COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 );
            if( bcost == ucost1 && SAD_THRESH(500) )
                break;
            if( bcost == ucost2 )
            {
                int range = (i_me_range>>1) | 1;
                CROSS( 3, range, range );
                COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 );
                COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 );
                if( bcost == ucost2 )
                    break;
                cross_start = range + 2;
            }
        }

        /* adaptive search range */
        if( i_mvc )
        {
            /* range multipliers based on casual inspection of some statistics of
             * average distance between current predictor and final mv found by ESA.
             * these have not been tuned much by actual encoding. */
            static const int range_mul[4][4] =
            {
                { 3, 3, 4, 4 },
                { 3, 4, 4, 4 },
                { 4, 4, 4, 5 },
                { 4, 4, 5, 6 },
            };
            int mvd;
            int sad_ctx, mvd_ctx;
            int denom = 1;

            if( i_mvc == 1 )
            {
                if( i_pixel == PIXEL_16x16 )
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
                     * but prediction usually isn't too bad, so just use medium range */
                    mvd = 25;
                else
                    mvd = abs( m->mvp[0] - mvc[0][0] )
                        + abs( m->mvp[1] - mvc[0][1] );
            }
            else
            {
                /* calculate the degree of agreement between predictors. */
                /* in 16x16, mvc includes all the neighbors used to make mvp,
                 * so don't count mvp separately. */
                denom = i_mvc - 1;
                mvd = 0;
                if( i_pixel != PIXEL_16x16 )
                {
                    mvd = abs( m->mvp[0] - mvc[0][0] )
                        + abs( m->mvp[1] - mvc[0][1] );
                    denom++;
                }
                for( i = 0; i < i_mvc-1; i++ )
                    mvd += abs( mvc[i][0] - mvc[i+1][0] )
                         + abs( mvc[i][1] - mvc[i+1][1] );
            }

            sad_ctx = SAD_THRESH(1000) ? 0
                    : SAD_THRESH(2000) ? 1
                    : SAD_THRESH(4000) ? 2 : 3;
            mvd_ctx = mvd < 10*denom ? 0
                    : mvd < 20*denom ? 1
                    : mvd < 40*denom ? 2 : 3;

            i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
        }

        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
         * we are still centered on the same place as the DIA2. is this desirable? */
        CROSS( cross_start, i_me_range, i_me_range/2 );

        COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );

        /* hexagon grid */
        omx = bmx; omy = bmy;
        // The following loop is perforated,
        // effectively changing i++ to i+=2
        for( i = 1; i <= i_me_range/4; /*i++*/ i+=2 )
        {
            static const int hex4[16][2] = {
                {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
                { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
                { 2, 3}, { 0, 4}, {-2, 3},
                {-2,-3}, { 0,-4}, { 2,-3},
            };

            if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
                                 mv_y_max-omy, omy-mv_y_min ) )
            {
                for( j = 0; j < 16; j++ )
                {
                    int mx = omx + hex4[j][0]*i;
                    int my = omy + hex4[j][1]*i;
                    if( CHECK_MVRANGE(mx, my) )
                        COST_MV( mx, my );
                }
            }
            else
            {
                COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i );
                COST_MV_X4( -4*i,-2*i, 4*i,-2*i, 4*i,-1*i, 4*i, 0*i );
                COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i );
                COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i );
            }
        }
        if( bmy <= mv_y_max )
            goto me_hex2;
        break;
    }
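
The effect of this perforation is easy to quantify: each iteration of the hexagon-grid loop tests up to 16 candidate motion vectors (the 16 hex4 offsets scaled by i), so stepping i by 2 roughly halves the number of candidates examined. The standalone sketch below is a hypothetical illustration, not part of x264, and the value of i_me_range is chosen arbitrarily.

// Hypothetical illustration (not part of the x264 source) of how perforating
// the hexagon-grid loop reduces the number of candidate motion vectors tested
#include <stdio.h>

static int hex_grid_candidates( int i_me_range, int step )
{
    int i, count = 0;
    for( i = 1; i <= i_me_range/4; i += step )
        count += 16;   // up to 16 hex4 offsets are tested at each grid scale i
    return count;
}

int main( void )
{
    int i_me_range = 24;   // arbitrary search range chosen for illustration
    printf( "original loop (i++):    %d candidates\n",
            hex_grid_candidates( i_me_range, 1 ) );
    printf( "perforated loop (i+=2): %d candidates\n",
            hex_grid_candidates( i_me_range, 2 ) );
    return 0;
}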

The final perforations produce motion estimation algorithms that run substantially faster but may produce less efficiently encoded video.