The loop in the function sdist_emd() is inlined from the function emd(). This function measures the distance between the query image and the candidate image. Perforating this loop creates a new, more efficient, but less accurate distance function.
// This function is defined in src/emd.c in the ferret benchmark.
// The function is then inlined into the function sdist_emd()
float emd(signature_t *Signature1, signature_t *Signature2, float (*Dist)(cass_size_t, feature_t, feature_t, void *), cass_size_t dim, void *param, flow_t *Flow, int*FlowSize)
{
struct emd_state_t emd_state, *state = &emd_state;
int itr;
double totalCost;
float w;
emd_node2_t *XP;
flow_t *FlowP;
emd_node1_t U[MAX_SIG_SIZE1], V[MAX_SIG_SIZE1];
if (C == NULL) C = type_matrix_alloc(double, MAX_SIG_SIZE1, MAX_SIG_SIZE1);
bzero(state, sizeof *state);
state->C = C;
w = emdinit(state, Signature1, Signature2, Dist, dim, param);
if (w == EMD_INFINITY) {
// init failed, due to nr_reg too high, ignore this seg.
return EMD_INFINITY;
}
#if DEBUG_LEVEL > 1
print("\nINITIAL SOLUTION:\n");
printSolution(state);
#endif
if (state->n1 > 1 && state->n2 > 1) /* IF _n1 = 1 OR _n2 = 1 THEN WE ARE DONE */
{
// This loop is perforated, changing itr++ to itr+=2
for (itr = 1; itr < MAX_ITERATIONS; /*itr++*/ itr+=2)
{
/* FIND BASIC VARIABLES */
findBasicVariables(state, U, V);
/* CHECK FOR OPTIMALITY */
if (isOptimal(state, U, V))
break;
/* IMPROVE SOLUTION */
newSol(state);
#if DEBUG_LEVEL > 1
print("\nITERATION # %d \n", itr);
printSolution(state);
#endif
}
if (itr == MAX_ITERATIONS)
warn("emd: Maximum number of iterations has been reached (%d)\n",
MAX_ITERATIONS);
}
/* COMPUTE THE TOTAL FLOW */
totalCost = 0;
FlowP = Flow;
for(XP=state->X; XP < state->EndX; XP++)
{
if (XP == state->EnterX) /* _EnterX IS THE EMPTY SLOT */
continue;
if (XP->i == Signature1->n || XP->j == Signature2->n) /* DUMMY FEATURE */
continue;
if (XP->val == 0) /* ZERO FLOW */
continue;
totalCost += (double)XP->val * state->C[XP->i][XP->j];
if (Flow != NULL)
{
FlowP->from = XP->i;
FlowP->to = XP->j;
FlowP->amount = XP->val;
FlowP->cost = state->C[XP->i][XP->j];
state->tot_flow_costs += FlowP->cost;
state->tot_flow ++;
FlowP++;
}
}
if (Flow != NULL)
*FlowSize = FlowP-Flow;
#if DEBUG_LEVEL > 0
print("\n*** OPTIMAL SOLUTION (%d ITERATIONS): %f ***\n", itr, totalCost);
#endif
/* RETURN THE NORMALIZED COST == EMD */
// print("avg_flow_costs: %g/%d = %g\n", state->tot_flow_costs,
// state->tot_flow, state->tot_flow_costs / state->tot_flow);
return (float)(totalCost / w);
}
The loop in the LSH_query_bootstrap() function initializes several data structures required to query the image database; perforating this loop causes ferret to skip some of the initialization steps. Because the application explicitly zeros the memory before the initialization loop executes, the computation always accesses initialized memory and the Valgrind memcheck tool does not report any reads to uninitialized data. The presence of zeroed data structures in image database queries does not materially affect the quality of service of the application.
// This function is defined in src/lsh/LSH_query.c in the ferret benchmark
static void LSH_query_bootstrap (LSH_query_t *query, const float *point)
{
cass_size_t D = query->lsh->D;
cass_size_t K = query->K;
cass_size_t L = query->L;
LSH_t *lsh = query->lsh;
ptb_vec_t **score = query->ptb;
uint32_t **tmp = query->tmp;
uint32_t *tmp2 = query->tmp2;
cass_list_entry_t **_topk = query->_topk;
cass_list_entry_t entry;
int *C = query->C;
int *H = query->H;
int i;
memset(C, 0, L * sizeof(int));
memset(H, 0, L * sizeof(int));
memset(query->S, 0, L * sizeof(float));
bitmap_clear(query->bitmap);
LSH_hash_score(query->lsh, L, point, tmp, score);
LSH_hash2_noperturb(query->lsh, tmp, tmp2, L);
// This loop is perforated, changing i++ to i+=2
for (i = 0; i < L; i++)
{
memset(_topk[i], 0xff, sizeof (*_topk[i]) * K);
TOPK_INIT(_topk[i], dist, K, HUGE);
ARRAY_BEGIN_FOREACH(lsh->hash[i].bucket[tmp2[i]], uint32_t id) {
if (!bitmap_contain(query->bitmap, id))
{
cass_vec_t *vec;
bitmap_insert(query->bitmap, id);
vec = DATASET_VEC(query->ds, id);
entry.id = id;
entry.dist = dist_L2_float(D, vec->u.float_data, point);
C[i]++;
query->CC++;
TOPK_INSERT_MIN_UNIQ_DO(_topk[i], dist, id, K, entry, H[i]++);
}
}
ARRAY_END_FOREACH;
ptb_qsort(score[i], lsh->M * 2);
#ifdef QUERY_DIRECT
ARRAY_TRUNC(query->heap[i]);
HEAP_ENQUEUE(query->heap[i], score[i][0], ptb_vec_ge);
#else
query->ptb_step[i] = 0;
map_perturb_vector(query->ptb_set, query->ptb_vec[i], score[i], lsh->M, query->T);
#endif
}
if (lsh->est != NULL)
{
LSH_query_merge(query);
LSH_query_local(query);
}
}
The two perforated loops account for a relatively small percentage of the execution time, their perforation does not significantly increase the performance.