Ich versuche, ein Programm namens Streamcluster zu parallelisieren. Genauer gesagt verbringt die Funktion namens pgain die meiste Zeit des Programms mit dem Scalasca-Tool, das ich benutzt habe, also ist dies die Funktion, die ich parallelisieren sollte. Hier können Sie die Funktion und meinen Aufwand sehen, parallel dazu zu arbeiten. Das Problem ist, dass das einzige, was ich erreicht habe, das Programm ist, das noch mehr Zeit für die Ausführung benötigt.parallelisieren streamcluster mit openmp
Die ursprüngliche PGAIN Funktion in streamcluster:
double pgain (long x, Points *points, double z, long int *numcenters)
{
int i;
int number_of_centers_to_close = 0;
static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;
int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof (double);
if (stride % cl != 0) {
stride = cl * (stride/cl + 1);
}
int K = stride - 2 ; // K==*numcenters
//my own cost of opening x
double cost_of_opening_x = 0;
work_mem = (double*) malloc (2 * stride * sizeof (double));
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;
/*
* For each center, we have a *lower* field that indicates
* how much we will save by closing the center.
*/
int count = 0;
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
center_table[i] = count++;
}
}
work_mem[0] = 0;
//now we finish building the table. clear the working memory.
memset (switch_membership, 0, points->num * sizeof (bool));
memset (work_mem, 0, stride*sizeof (double));
memset (work_mem+stride,0,stride*sizeof (double));
//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
for (i = 0; i < points->num; i++) {
float x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight;
float current_cost = points->p[i].cost;
if (x_cost < current_cost) {
// point i would save cost just by switching to x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
int assign = points->p[i].assign;
lower[center_table[assign]] += current_cost - x_cost;
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
double low = z + work_mem[center_table[i]];
gl_lower[center_table[i]] = low;
if (low > 0) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= low;
}
}
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;
gl_number_of_centers_to_close = (int) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];
// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing
if (gl_cost_of_opening_x < 0) {
// we'd save money by opening x; we'll do it
for (int i = 0; i < points->num; i++) {
bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
if (switch_membership[i] || close_center) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight * dist (points->p[i], points->p[x], points->dim);
points->p[i].assign = x;
}
}
for (int i = 0; i < points->num; i++) {
if (is_center[i] && gl_lower[center_table[i]] > 0) {
is_center[i] = false;
}
}
if (x >= 0 && x < points->num) {
is_center[x] = true;
}
*numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
gl_cost_of_opening_x = 0; // the value we'll return
}
free (work_mem);
return -gl_cost_of_opening_x;
}
Und das ist, was ich es parallelisieren getan haben:
double pgain (long x, Points *points, double z, long int *numcenters)
{
int i;
int number_of_centers_to_close = 0;
static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;
int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof (double);
if (stride % cl != 0) {
stride = cl * (stride/cl + 1);
}
int K = stride - 2 ; // K==*numcenters
//my own cost of opening x
double cost_of_opening_x = 0;
work_mem = (double*) malloc (2 * stride * sizeof (double));
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;
/*
* For each center, we have a *lower* field that indicates
* how much we will save by closing the center.
*/
int count = 0;
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
center_table[i] = count++;
}
}
work_mem[0] = 0;
//now we finish building the table. clear the working memory.
memset (switch_membership, 0, points->num * sizeof (bool));
memset (work_mem, 0, stride*sizeof (double));
memset (work_mem+stride,0,stride*sizeof (double));
//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
float x_cost=0.0;
float current_cost=0.0;
#pragma omp parallel for private(current_cost,x_cost)
shared(cost_of_opening_x)
for (i = 0; i < points->num; i++) {
x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight;
current_cost = points->p[i].cost;
if (x_cost < current_cost) {
// point i would save cost just by switching to // x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
{
#pragma omp flush(cost_of_opening_x)
}
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
int assign = points->p[i].assign;
lower[center_table[assign]] += current_cost - x_cost;
{
#pragma omp flush(lower)
}
}
#pragma omp barrier
{
#pragma omp flush(lower,cost_of_opening_x)
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
double low = z + work_mem[center_table[i]];
gl_lower[center_table[i]] = low;
if (low > 0) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= low;
}
}
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;
gl_number_of_centers_to_close = (int) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];
// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing
if (gl_cost_of_opening_x < 0) {
// we'd save money by opening x; we'll do it
#pragma omp parallel for
for (int i = 0; i < points->num; i++) {
bool close_center = gl_lower[center_table[points->p[i].assign]] > 0
;
if (switch_membership[i] || close_center) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight * dist (points->p[i],
points->p[x], points->dim);
points->p[i].assign = x;
}
}
for (int i = 0; i < points->num; i++) {
if (is_center[i] && gl_lower[center_table[i]] > 0) {
is_center[i] = false;
}
}
if (x >= 0 && x < points->num) {
is_center[x] = true;
}
*numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
gl_cost_of_opening_x = 0; // the value we'll return
}
free (work_mem);
return -gl_cost_of_opening_x;
}
Können Sie jede Verbesserung oder Veränderung zu sehen, dass es schneller machen würde? Vielen Dank im Voraus.