[cig-commits] r15460 - mc/3D/CitcomS/trunk/lib
leif at geodynamics.org
leif at geodynamics.org
Tue Jul 14 17:14:53 PDT 2009
Author: leif
Date: 2009-07-14 17:14:53 -0700 (Tue, 14 Jul 2009)
New Revision: 15460
Modified:
mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
Parallel n_assemble_del2_u() takes shape.
Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-13 14:14:44 UTC (rev 15459)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-15 00:14:53 UTC (rev 15460)
@@ -39,29 +39,12 @@
double *temp;
unsigned int *NODE;
+
+ int2 **term;
};
/*------------------------------------------------------------------------*/
-/* from BC_util.c */
-
-__device__ void strip_bcs_from_residual(
- struct Some_variables *E,
- double *Res
- )
-{
- int i;
-
- /* XXX: see get_bcs_id_for_residual() */
- if (E->num_zero_resid)
- for (i=1;i<=E->num_zero_resid;i++)
- Res[E->zero_resid[i]] = 0.0;
-
- return;
-}
-
-
-/*------------------------------------------------------------------------*/
/* from Element_calculations.c */
__global__ void n_assemble_del2_u(
@@ -70,52 +53,94 @@
int strip_bcs
)
{
- const int neq = E->lmesh.NEQ;
+ int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
+ int doff = blockIdx.y + 1; /* 1 <= doff < NSD */
+ unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
- int e, doff, eqn;
+ /* Each block writes one element of Au in global memory: Au[eqn]. */
+ int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
- e = blockIdx.x + 1; /* 1 <= e <= E->lmesh.NNO */
- doff = blockIdx.y + 1; /* 1 <= doff < NSD */
- eqn = E->ID[e].doff[doff];
+ if (strip_bcs) {
+ /* See get_bcs_id_for_residual(). */
+ unsigned int flags = E->NODE[n];
+ unsigned int vb = 0x1 << doff; /* VBX, VBY, or VBZ */
+ if (flags & vb) {
+ /* no-op: Au[eqn] is zero */
+ if (tid == 0) {
+ Au[eqn] = 0.0;
+ }
+ /* XXX: Hundreds of blocks exit here (E->num_zero_resid).
+ Does it matter? */
+ return;
+ }
+ }
- double Au_eqn;
+ /* The partial sum computed by this thread. */
+ double acc;
- Au_eqn = 0.0;
- if (e == 1 && doff == 1) {
- Au[neq] = 0.0;
- u[neq] = 0.0;
+ /* Part I: The terms here are easily derived from the block and
+ thread indices. */
+ {
+ int e = n; /* 1 <= e <= E->lmesh.NNO */
+ int i = (int)tid; /* 0 <= i < MAX_EQN */
+
+ if (i < 3) {
+ acc = 0.0;
+ } else {
+ int *C = E->Node_map + (e-1)*MAX_EQN;
+ higher_precision *B = E->Eqn_k[doff]+(e-1)*MAX_EQN;
+ double UU = u[C[i]];
+ acc = B[i]*UU;
+ }
}
- int *C;
- higher_precision *B;
- double UU;
- int i;
+ /* Part II: These terms are more complicated. */
+ {
+ int2 pair = E->term[eqn][tid];
+ int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
+ int i = pair.y; /* 0 <= i < MAX_EQN */
+
+ if (i != -1) {
+ /* XXX: Compute these values? */
+ int eqn1 = E->ID[e].doff[1];
+ int eqn2 = E->ID[e].doff[2];
+ int eqn3 = E->ID[e].doff[3];
+
+ double U1 = u[eqn1];
+ double U2 = u[eqn2];
+ double U3 = u[eqn3];
+
+ higher_precision *B1, *B2, *B3;
+ B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
+ B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
+ B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
+
+ acc += B1[i]*U1 +
+ B2[i]*U2 +
+ B3[i]*U3;
+ } else {
+ /* XXX: A considerable number of threads idle here. */
+ }
+ }
- C = E->Node_map + (e-1)*MAX_EQN;
- B = E->Eqn_k[doff]+(e-1)*MAX_EQN;
-
- for (i=3;i<MAX_EQN;i++) {
- UU = u[C[i]];
- Au_eqn += B[i]*UU;
+ /* Reduce the partial sums for this block.
+ Based on reduce2() in the CUDA SDK. */
+ __shared__ double sum[MAX_EQN];
+ sum[tid] = acc;
+ __syncthreads();
+ for (unsigned int s = MAX_EQN/2; s > 0; s >>= 1) {
+ if (tid < s) {
+ sum[tid] += sum[tid + s];
+ }
+ /* XXX: not always necessary */
+ __syncthreads();
}
- /* store to global memory */
- Au[eqn] = Au_eqn;
-
-#if 0
- /* XXX: What now? */
- double U1,U2,U3;
- U1 = u[eqn1];
- U2 = u[eqn2];
- U3 = u[eqn3];
- for (i=0;i<MAX_EQN;i++) {
- Au[C[i]] += B1[i]*U1+B2[i]*U2+B3[i]*U3;
+ /* Each block writes one element of Au in global memory. */
+ if (tid == 0) {
+ Au[eqn] = sum[0];
}
-#endif
- if (strip_bcs)
- strip_bcs_from_residual(E,Au);
-
return;
}
@@ -231,7 +256,13 @@
double *d_d0 = 0, *d_F = 0, *d_Ad = 0;
if (guess) {
- n_assemble_del2_u<<< nnoGrid, nnoBlock >>>(d_E, d_d0, d_Ad, 1);
+ /* XXX */
+ d_Ad[E->lmesh.NEQ] = 0.0; /* Au -- unnecessary? */
+ d_d0[E->lmesh.NEQ] = 0.0; /* u */
+
+ dim3 block(MAX_EQN, 1, 1);
+ dim3 grid(E->lmesh.NNO, NSD, 1);
+ n_assemble_del2_u<<< grid, block >>>(d_E, d_d0, d_Ad, 1);
} else {
gauss_seidel_0<<< neqGrid, neqBlock >>>(d_E, d_d0, d_Ad);
}
@@ -308,21 +339,30 @@
int maxAcc, total;
int *tally;
int **threadMap, **threadTally;
+ int2 **terms;
int f;
tally = (int *)malloc((neq+1) * sizeof(int));
threadMap = (int **)malloc((neq+1)* sizeof(int*));
threadTally = (int **)malloc((neq+1)* sizeof(int*));
+ terms = (int2 **)malloc((neq+1)* sizeof(int2 *));
for(e=0;e<=neq;e++) {
//Au[e]=0.0;
tally[e] = 0;
threadMap[e] = (int *)malloc((neq+1) * sizeof(int));
threadTally[e] = (int *)malloc((neq+1) * sizeof(int));
+ terms[e] = (int2 *)malloc((MAX_EQN+1) * sizeof(int2));
for(f=0;f<=neq;f++) {
threadMap[e][f] = -1;
threadTally[e][f] = 0;
}
+ for (f = 0; f < MAX_EQN; ++f) {
+ terms[e][f].x = -1;
+ terms[e][f].y = -1;
+ }
+ terms[e][MAX_EQN].x = 0;
+ terms[e][MAX_EQN].y = 0;
}
#if 0
@@ -443,6 +483,15 @@
break;
}
}
+ ++terms[C[i]][MAX_EQN].y;
+ for (f = 0; f < MAX_EQN; ++f) {
+ if (terms[C[i]][f].y == -1) {
+ terms[C[i]][f].x = e;
+ terms[C[i]][f].y = i;
+ break;
+ }
+ }
+ assert(C[i] == neq || f < MAX_EQN);
}
} /* end for e */
@@ -467,8 +516,19 @@
//fprintf(stderr, "Au[%d] == %f\n", e - 1, Au[e]);
fprintf(stderr, "max accesses %d\n", maxAcc);
fprintf(stderr, "total accesses %d\n", total);
- assert(0);
+ fprintf(stderr, "\nterms:\n");
+ for(e=0;e<=neq;e++) {
+ fprintf(stderr, "Au[%d]: %d terms %s", e, terms[e][MAX_EQN].y,
+ terms[e][MAX_EQN].y > MAX_EQN ? "XXXTO" : "");
+ for (f = 0; f < MAX_EQN; ++f) {
+ if (terms[e][f].y == -1)
+ break;
+ fprintf(stderr, " %d(%d)", terms[e][f].y, terms[e][f].x);
+ }
+ fprintf(stderr, "\n");
+ }
+
#if 0
if (strip_bcs)
strip_bcs_from_residual(E,Au);
@@ -531,5 +591,11 @@
fprintf(stderr, "\n0 - NEQ %d\n", kE.lmesh.NEQ);
} while (0);
tally_n_assemble_del2_u(&kE);
+ do {
+ int i;
+ fprintf(stderr, "E->num_zero_resid == %d\n", kE.num_zero_resid);
+ for (i=1;i<=kE.num_zero_resid;i++)
+ fprintf(stderr, " Au[%d] = 0.0\n", kE.zero_resid[i]);
+ } while (0);
assert(0);
}
More information about the CIG-COMMITS
mailing list