[cig-commits] r15483 - mc/3D/CitcomS/trunk/lib
leif at geodynamics.org
leif at geodynamics.org
Mon Jul 27 18:00:36 PDT 2009
Author: leif
Date: 2009-07-27 18:00:36 -0700 (Mon, 27 Jul 2009)
New Revision: 15483
Modified:
mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
Experiments show that using a block size of 1 leads to poor
performance. Using the recommended minimum of 64, gauss_seidel_0 and
gauss_seidel_1 are about 3x to 4x faster.
Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-27 23:44:52 UTC (rev 15482)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-28 01:00:36 UTC (rev 15483)
@@ -230,42 +230,42 @@
/* These are based on the function from General_matrix_functions.c. */
__global__ void gauss_seidel_0(
- struct Some_variables *E,
+ int neq,
double *d0,
double *Ad
)
{
- const double zeroo = 0.0;
- int i;
-
- i = blockIdx.x; /* 0 <= i < E->lmesh.NEQ */
- d0[i] = Ad[i] = zeroo;
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ if (i < neq) {
+ d0[i] = Ad[i] = 0.0;
+ }
}
__global__ void gauss_seidel_1(
struct Some_variables *E,
+ int neq, int nno,
double *F, double *Ad
)
{
const double zeroo = 0.0;
- const int neq = E->lmesh.NEQ;
- int i, doff, eqn;
-
- i = blockIdx.x + 1; /* 1 <= i <= E->lmesh.NNO */
- doff = blockIdx.y + 1; /* 1 <= doff < NSD */
- eqn = E->ID[i].doff[doff];
-
- if (E->NODE[i] & OFFSIDE) {
- E->temp[eqn] = (F[eqn] - Ad[eqn])*E->BI[eqn];
- } else {
- E->temp[eqn] = zeroo;
+ int i = blockIdx.x * blockDim.x + threadIdx.x + 1;
+ if (i <= nno) {
+ /* 1 <= i <= E->lmesh.NNO */
+ int doff = blockIdx.y + 1; /* 1 <= doff < NSD */
+ int eqn = E->ID[i].doff[doff];
+
+ if (E->NODE[i] & OFFSIDE) {
+ E->temp[eqn] = (F[eqn] - Ad[eqn])*E->BI[eqn];
+ } else {
+ E->temp[eqn] = zeroo;
+ }
+
+ if (i == 1 && doff == 1) {
+ E->temp[neq] = zeroo;
+ Ad[neq] = zeroo;
+ }
}
-
- if (i == 1 && doff == 1) {
- E->temp[neq] = zeroo;
- Ad[neq] = zeroo;
- }
}
__global__ void gauss_seidel_2a(
@@ -536,6 +536,11 @@
/* copy input to the device */
cudaMemcpy(d_F, F, neq*sizeof(double), cudaMemcpyHostToDevice);
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+ static float totalElapsedTime = 0.0;
+
if (guess) {
/* copy more input to the device */
d0[E->lmesh.NEQ] = 0.0; /* normally done by n_assemble_del2_u() */
@@ -547,18 +552,42 @@
else host_n_assemble_del2_u(E, d_d0, d_Ad, 1);
} else {
- dim3 block(1, 1, 1);
- dim3 grid(E->lmesh.NEQ, 1, 1);
- if (1) gauss_seidel_0<<< grid, block >>>(d_E, d_d0, d_Ad);
- else host_gauss_seidel_0(E, d_d0, d_Ad);
+ const int blockSize = 64;
+ const int dim = E->lmesh.NEQ;
+ int gridSize = dim / blockSize;
+ if (dim % blockSize)
+ ++gridSize;
+
+ dim3 block(blockSize, 1, 1);
+ dim3 grid(gridSize, 1, 1);
+ gauss_seidel_0<<< grid, block >>>(E->lmesh.NEQ, d_d0, d_Ad);
}
for (count = 0; count < steps; ++count) {
{
- dim3 block(1, 1, 1);
- dim3 grid(E->lmesh.NNO, NSD, 1);
- if (1) gauss_seidel_1<<< grid, block >>>(d_E, d_F, d_Ad);
- else host_gauss_seidel_1(E, &s_E, d_F, d_Ad);
+ const int blockSize = 64;
+ const int dim = E->lmesh.NNO;
+ int gridSize = dim / blockSize;
+ if (dim % blockSize)
+ ++gridSize;
+
+ dim3 block(blockSize, 1, 1);
+ dim3 grid(gridSize, NSD, 1);
+
+ cudaEventRecord(start, 0);
+
+ gauss_seidel_1<<< grid, block >>>(d_E, E->lmesh.NEQ, E->lmesh.NNO, d_F, d_Ad);
+
+ cudaEventRecord(stop, 0);
+ if (0) {
+ cudaEventSynchronize(stop);
+ float elapsedTime;
+ cudaEventElapsedTime(&elapsedTime, start, stop);
+ totalElapsedTime += elapsedTime;
+ if (count == steps - 1) {
+ printf("gauss_seidel_0 %d\t%f\t%f\n", count, elapsedTime, totalElapsedTime);
+ }
+ }
}
if (1) {
@@ -582,6 +611,9 @@
}
}
+ cudaEventDestroy(start);
+ cudaEventDestroy(stop);
+
/* wait for completion */
if (cudaThreadSynchronize() != cudaSuccess) {
assert(0 && "something went wrong");
More information about the CIG-COMMITS
mailing list