Wednesday, November 25, 2009

ferret Perforated Loops

This benchmark performs a content-based similarity search based on a series of query images. For each query image, it returns the $N$ images in its database determined to be most similar. It ferret first decomposes the query image into a set of segments, extracts a feature vector from each segment, indexes an image database to find candidate similar images, ranks the candidate images by measuring the distance between the query image and the candidate images, then returns the highest ranked images.

The loop in the function sdist_emd() is inlined from the function emd(). This function measures the distance between the query image and the candidate image. Perforating this loop creates a new, more efficient, but less accurate distance function.

// This function is defined in src/emd.c in the ferret benchmark.
// The function is then inlined into the function sdist_emd()
float emd(signature_t *Signature1, signature_t *Signature2, float (*Dist)(cass_size_t, feature_t, feature_t, void *), cass_size_t dim, void *param, flow_t *Flow, int*FlowSize)
{
struct emd_state_t emd_state, *state = &emd_state;
int itr;
double totalCost;
float w;
emd_node2_t *XP;
flow_t *FlowP;
emd_node1_t U[MAX_SIG_SIZE1], V[MAX_SIG_SIZE1];

if (C == NULL) C = type_matrix_alloc(double, MAX_SIG_SIZE1, MAX_SIG_SIZE1);

bzero(state, sizeof *state);

state->C = C;

w = emdinit(state, Signature1, Signature2, Dist, dim, param);

if (w == EMD_INFINITY) {
// init failed, due to nr_reg too high, ignore this seg.
return EMD_INFINITY;
}

#if DEBUG_LEVEL > 1
print("\nINITIAL SOLUTION:\n");
printSolution(state);
#endif

if (state->n1 > 1 && state->n2 > 1) /* IF _n1 = 1 OR _n2 = 1 THEN WE ARE DONE */
{
// This loop is perforated, changing itr++ to itr+=2
for (itr = 1; itr < MAX_ITERATIONS; /*itr++*/ itr+=2)
{
/* FIND BASIC VARIABLES */
findBasicVariables(state, U, V);

/* CHECK FOR OPTIMALITY */
if (isOptimal(state, U, V))
break;

/* IMPROVE SOLUTION */
newSol(state);

#if DEBUG_LEVEL > 1
print("\nITERATION # %d \n", itr);
printSolution(state);
#endif
}

if (itr == MAX_ITERATIONS)
warn("emd: Maximum number of iterations has been reached (%d)\n",
MAX_ITERATIONS);
}

/* COMPUTE THE TOTAL FLOW */
totalCost = 0;
FlowP = Flow;
for(XP=state->X; XP < state->EndX; XP++)
{
if (XP == state->EnterX) /* _EnterX IS THE EMPTY SLOT */
continue;
if (XP->i == Signature1->n || XP->j == Signature2->n) /* DUMMY FEATURE */
continue;

if (XP->val == 0) /* ZERO FLOW */
continue;

totalCost += (double)XP->val * state->C[XP->i][XP->j];
if (Flow != NULL)
{
FlowP->from = XP->i;
FlowP->to = XP->j;
FlowP->amount = XP->val;
FlowP->cost = state->C[XP->i][XP->j];
state->tot_flow_costs += FlowP->cost;
state->tot_flow ++;
FlowP++;
}
}
if (Flow != NULL)
*FlowSize = FlowP-Flow;

#if DEBUG_LEVEL > 0
print("\n*** OPTIMAL SOLUTION (%d ITERATIONS): %f ***\n", itr, totalCost);
#endif

/* RETURN THE NORMALIZED COST == EMD */
// print("avg_flow_costs: %g/%d = %g\n", state->tot_flow_costs,
// state->tot_flow, state->tot_flow_costs / state->tot_flow);
return (float)(totalCost / w);
}



The loop in the LSH_query_bootstrap() function initializes several data structures required to query the image database; perforating this loop causes ferret to skip some of the initialization steps. Because the application explicitly zeros the memory before the initialization loop executes, the computation always accesses initialized memory and the Valgrind memcheck tool does not report any reads to uninitialized data. The presence of zeroed data structures in image database queries does not materially affect the quality of service of the application.

// This function is defined in src/lsh/LSH_query.c in the ferret benchmark
static void LSH_query_bootstrap (LSH_query_t *query, const float *point)
{
cass_size_t D = query->lsh->D;
cass_size_t K = query->K;
cass_size_t L = query->L;
LSH_t *lsh = query->lsh;
ptb_vec_t **score = query->ptb;
uint32_t **tmp = query->tmp;
uint32_t *tmp2 = query->tmp2;
cass_list_entry_t **_topk = query->_topk;
cass_list_entry_t entry;

int *C = query->C;
int *H = query->H;

int i;

memset(C, 0, L * sizeof(int));
memset(H, 0, L * sizeof(int));
memset(query->S, 0, L * sizeof(float));

bitmap_clear(query->bitmap);

LSH_hash_score(query->lsh, L, point, tmp, score);
LSH_hash2_noperturb(query->lsh, tmp, tmp2, L);

// This loop is perforated, changing i++ to i+=2
for (i = 0; i < L; i++)
{
memset(_topk[i], 0xff, sizeof (*_topk[i]) * K);
TOPK_INIT(_topk[i], dist, K, HUGE);
ARRAY_BEGIN_FOREACH(lsh->hash[i].bucket[tmp2[i]], uint32_t id) {
if (!bitmap_contain(query->bitmap, id))
{
cass_vec_t *vec;
bitmap_insert(query->bitmap, id);
vec = DATASET_VEC(query->ds, id);
entry.id = id;
entry.dist = dist_L2_float(D, vec->u.float_data, point);
C[i]++;
query->CC++;
TOPK_INSERT_MIN_UNIQ_DO(_topk[i], dist, id, K, entry, H[i]++);
}
}
ARRAY_END_FOREACH;

ptb_qsort(score[i], lsh->M * 2);

#ifdef QUERY_DIRECT
ARRAY_TRUNC(query->heap[i]);
HEAP_ENQUEUE(query->heap[i], score[i][0], ptb_vec_ge);
#else
query->ptb_step[i] = 0;
map_perturb_vector(query->ptb_set, query->ptb_vec[i], score[i], lsh->M, query->T);
#endif
}

if (lsh->est != NULL)
{
LSH_query_merge(query);
LSH_query_local(query);
}

}



The two perforated loops account for a relatively small percentage of the execution time, their perforation does not significantly increase the performance.