[cig-commits] r15461 - mc/3D/CitcomS/trunk/lib

leif at geodynamics.org
Tue Jul 14 18:58:53 PDT 2009


Author: leif
Date: 2009-07-14 18:58:53 -0700 (Tue, 14 Jul 2009)
New Revision: 15461

Modified:
   mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
Parallelized the last fragment of the gauss_seidel() main loop as
gauss_seidel_3().  It turns out to be very similar to the second half
of n_assemble_del2_u().  Tore down the exploratory scaffolding:
tally_n_assemble_del2_u() and the debug dumps are gone, replaced by a
minimal collect_terms().
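
For context, here is a rough sketch of the serial fragment that
gauss_seidel_3() replaces, reconstructed from the lines removed below;
the per-node setup (C, B1-B3, eqn1-eqn3) is inferred from the
surrounding code rather than copied verbatim:

    for (int i = 1; i <= E->lmesh.NNO; i++) {
        int eqn1 = E->ID[i].doff[1];
        int eqn2 = E->ID[i].doff[2];
        int eqn3 = E->ID[i].doff[3];
        int *C = E->Node_map + (i-1)*MAX_EQN;
        higher_precision *B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
        higher_precision *B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
        higher_precision *B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;

        /* Ad on boundaries differs after the following operation */
        for (int j = 0; j < MAX_EQN; j++)
            Ad[C[j]] += B1[j]*E->temp[eqn1]
                      + B2[j]*E->temp[eqn2]
                      + B3[j]*E->temp[eqn3];

        d0[eqn1] += E->temp[eqn1];
        d0[eqn2] += E->temp[eqn2];
        d0[eqn3] += E->temp[eqn3];
    }

The scatter "Ad[C[j]] += ..." is what blocks a direct per-node
parallelization; gauss_seidel_3() inverts it into a per-equation gather
driven by the E->term table that collect_terms() builds.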


Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-15 00:14:53 UTC (rev 15460)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-15 01:58:53 UTC (rev 15461)
@@ -40,7 +40,7 @@
     double *temp;
     unsigned int *NODE;
     
-    int2 **term;
+    int2 *term;
 };
 
 
@@ -96,7 +96,8 @@
     
     /* Part II: These terms are more complicated. */
     {
-        int2 pair = E->term[eqn][tid];
+        int2 *term = E->term + eqn*MAX_EQN;
+        int2 pair = term[tid];
         int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
         int i = pair.y; /* 0 <= i < MAX_EQN */
         
@@ -227,6 +228,65 @@
 
 }
 
+__global__ void gauss_seidel_3(
+    struct Some_variables *E,
+    double *d0,
+    double *Ad
+    )
+{
+    int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
+    int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
+    unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
+    
+    /* Each block writes one element of Ad and d0 in global memory:
+       Ad[eqn], d0[eqn]. */
+    int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
+    
+    __shared__ double sum[MAX_EQN];
+    
+    int2 *term = E->term + eqn*MAX_EQN;
+    int2 pair = term[tid];
+    int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
+    int i = pair.y; /* 0 <= i < MAX_EQN */
+        
+    if (i != -1) {
+        /* XXX: Compute these values? */
+        int eqn1 = E->ID[e].doff[1];
+        int eqn2 = E->ID[e].doff[2];
+        int eqn3 = E->ID[e].doff[3];
+            
+        higher_precision *B1, *B2, *B3;
+        B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
+        B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
+        B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
+        
+        sum[tid] = B1[i]*E->temp[eqn1] +
+                   B2[i]*E->temp[eqn2] +
+                   B3[i]*E->temp[eqn3];
+    } else {
+        /* XXX: A considerable number of threads idle here. */
+        sum[tid] = 0.0;
+    }
+    __syncthreads();
+    
+    /* Reduce the partial sums for this block.
+       Based on reduce2() in the CUDA SDK.
+       Note: the halving strides assume MAX_EQN is a power of two. */
+    for (unsigned int s = MAX_EQN/2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        /* XXX: not always necessary */
+        __syncthreads();
+    }
+    
+    if (tid == 0) {
+        /* Each block writes one element of Ad... */
+        Ad[eqn] = sum[0];
+        /* ...and one element of d0. */
+        d0[eqn] += E->temp[eqn];
+    }
+}
+
 void do_gauss_seidel(
     struct Some_variables *E,
     double *d0,
@@ -237,20 +297,10 @@
     )
 {
 
-    int count,i,j,steps;
-    int *C;
-    int eqn1,eqn2,eqn3;
+    int count, steps;
 
-    higher_precision *B1,*B2,*B3;
-
     steps=*cycles;
 
-    dim3 neqBlock(1, 1, 1);
-    dim3 neqGrid(E->lmesh.NEQ, 1, 1);
-    
-    dim3 nnoBlock(1, 1, 1);
-    dim3 nnoGrid(E->lmesh.NNO, NSD, 1);
-    
     /* XXX: allocate & init device memory */
     struct Some_variables *d_E = 0;
     double *d_d0 = 0, *d_F = 0, *d_Ad = 0;
@@ -264,28 +314,24 @@
         dim3 grid(E->lmesh.NNO, NSD, 1);
         n_assemble_del2_u<<< grid, block >>>(d_E, d_d0, d_Ad, 1);
     } else {
-        gauss_seidel_0<<< neqGrid, neqBlock >>>(d_E, d_d0, d_Ad);
+        dim3 block(1, 1, 1);
+        dim3 grid(E->lmesh.NEQ, 1, 1);
+        gauss_seidel_0<<< grid, block >>>(d_E, d_d0, d_Ad);
     }
     
     for (count = 0; count < steps; ++count) {
+        {
+            dim3 block(1, 1, 1);
+            dim3 grid(E->lmesh.NNO, NSD, 1);
+            gauss_seidel_1<<< grid, block >>>(d_E, d_F, d_Ad);
+            gauss_seidel_2<<< grid, block >>>(d_E, d_F, d_Ad);
+        }
         
-        gauss_seidel_1<<< nnoGrid, nnoBlock >>>(d_E, d_F, d_Ad);
-        gauss_seidel_2<<< nnoGrid, nnoBlock >>>(d_E, d_F, d_Ad);
-        
-        
-        /* XXX: How to parallelize this? */
-        for (i=1;i<=E->lmesh.NNO;i++) {
-
-            /* Ad on boundaries differs after the following operation */
-            for (j=0;j<MAX_EQN;j++) {
-                Ad[C[j]]  += B1[j]*E->temp[eqn1]
-                             +  B2[j]*E->temp[eqn2]
-                             +  B3[j]*E->temp[eqn3];
-            }
-
-            d0[eqn1] += E->temp[eqn1];
-            d0[eqn2] += E->temp[eqn2];
-            d0[eqn3] += E->temp[eqn3];
+        /* Ad on boundaries differs after the following operation */
+        {
+            dim3 block(MAX_EQN, 1, 1);
+            dim3 grid(E->lmesh.NNO, NSD, 1);
+            gauss_seidel_3<<< grid, block >>>(d_E, d_d0, d_Ad);
         }
     }
     
@@ -310,230 +356,41 @@
     assert(E->parallel.nproc == 1);
 }
 
-static void tally_n_assemble_del2_u(
-    struct Some_variables *E //,
-    //double *u, double *Au,
-    //int strip_bcs
+static void collect_terms(
+    struct Some_variables *E
     )
 {
-    int e,i;
-    int eqn1,eqn2,eqn3;
+    /* Map out how to parallelize "Au[C[i]] += ..." and "Ad[C[j]] += ...". */
     
-#if 0
-    double UU,U1,U2,U3;
-#endif
-
-    int *C;
-#if 0
-    higher_precision *B1,*B2,*B3;
-#endif
-
-    const int neq=E->lmesh.NEQ;
-    const int nno=E->lmesh.NNO;
+    const int neq = E->lmesh.NEQ;
+    const int nno = E->lmesh.NNO;
     
-    /*
-     * Au = E->Eqn_k? * u
-     *  where E->Eqn_k? is the sparse stiffness matrix
-     */
+    E->term = (int2 *)malloc((neq+1) * MAX_EQN * sizeof(int2));
     
-    int maxAcc, total;
-    int *tally;
-    int **threadMap, **threadTally;
-    int2 **terms;
-    int f;
-    
-    tally = (int *)malloc((neq+1) * sizeof(int));
-    threadMap = (int **)malloc((neq+1)* sizeof(int*));
-    threadTally = (int **)malloc((neq+1)* sizeof(int*));
-    terms = (int2 **)malloc((neq+1)* sizeof(int2 *));
-    
-    for(e=0;e<=neq;e++) {
-        //Au[e]=0.0;
-        tally[e] = 0;
-        threadMap[e] = (int *)malloc((neq+1) * sizeof(int));
-        threadTally[e] = (int *)malloc((neq+1) * sizeof(int));
-        terms[e] = (int2 *)malloc((MAX_EQN+1) * sizeof(int2));
-        for(f=0;f<=neq;f++) {
-            threadMap[e][f] = -1;
-            threadTally[e][f] = 0;
+    for (int e = 0; e <= neq; e++) {
+        int2 *term = E->term + e*MAX_EQN;
+        for (int j = 0; j < MAX_EQN; j++) {
+            term[j].x = -1;
+            term[j].y = -1;
         }
-        for (f = 0; f < MAX_EQN; ++f) {
-            terms[e][f].x = -1;
-            terms[e][f].y = -1;
-        }
-        terms[e][MAX_EQN].x = 0;
-        terms[e][MAX_EQN].y = 0;
     }
-
-#if 0
-    u[neq] = 0.0;
-#endif
-
-    for(e=1;e<=nno;e++)     {
-
-        eqn1=E->ID[e].doff[1];
-        eqn2=E->ID[e].doff[2];
-        eqn3=E->ID[e].doff[3];
-        
-        /* could compute, but 'Node_map' is more complicated */
-        assert(eqn1 == 3*(e-1));
-        assert(eqn2 == eqn1+1);
-        assert(eqn3 == eqn1+2);
-        
-        /* could put maps in constant memory */
-        
-        /*
-         * Key observation: after parallelizing on 'e' (either one):
-         *
-         *   ID[e].doff[1,2,3]
-         *   C
-         *
-         * are fixed for each thread.  => Not worth obsessing over?
-         */
-        
-        /*
-         * Put Au[eqnX] into shared memory; it is accessed almost MAX_EQN=42 times.
-         *
-         * "Au[e]=0.0" should be unnecessary -- single write at end of fn
-         * from 'AuX' local var.. so actually, Au[eqnX] sits in register
-         * 
-         * But what about "Au[C[i]]"???????????
-         */
-        
-        /*
-         * neq vs. nno
-         *
-         * neq == 3*nno
-         * warp=32; only 2 threads wasted
-         * better: 3 warps 32*3
-         *
-         * use y for "dimension index"? (block size 32x3; nno % 32)
-         */
-        
-#if 0
-        U1 = u[eqn1];
-        U2 = u[eqn2];
-        U3 = u[eqn3];
-#endif
-
-        C=E->Node_map + (e-1)*MAX_EQN;
-#if 0
-        B1=E->Eqn_k[1]+(e-1)*MAX_EQN;
-        B2=E->Eqn_k[2]+(e-1)*MAX_EQN;
-        B3=E->Eqn_k[3]+(e-1)*MAX_EQN;
-#endif
-
-        for(i=3;i<MAX_EQN;i++)  {
-#if 0
-            UU = u[C[i]];
-            Au[eqn1] += B1[i]*UU;
-            Au[eqn2] += B2[i]*UU;
-            Au[eqn3] += B3[i]*UU;
-#endif
-            ++tally[eqn1];
-            ++tally[eqn2];
-            ++tally[eqn3];
-            for(f=0;f<=neq;f++) {
-                if (threadMap[eqn1][f] == e) {
-                    ++threadTally[eqn1][f];
+    
+    for (int e = 1; e <= nno; e++) {
+        int *C = E->Node_map + (e-1)*MAX_EQN;
+        for (int i = 0; i < MAX_EQN; i++) {
+            int2 *term = E->term + C[i]*MAX_EQN;
+            int j;
+            for (j = 0; j < MAX_EQN; j++) {
+                if (term[j].x == -1) {
+                    term[j].x = e;
+                    term[j].y = i;
                     break;
                 }
-                if (threadMap[eqn1][f] == -1) {
-                    threadMap[eqn1][f] = e;
-                    ++threadTally[eqn1][f];
-                    break;
-                }
             }
-            for(f=0;f<=neq;f++) {
-                if (threadMap[eqn2][f] == e) {
-                    ++threadTally[eqn2][f];
-                    break;
-                }
-                if (threadMap[eqn2][f] == -1) {
-                    threadMap[eqn2][f] = e;
-                    ++threadTally[eqn2][f];
-                    break;
-                }
-            }
-            for(f=0;f<=neq;f++) {
-                if (threadMap[eqn3][f] == e) {
-                    ++threadTally[eqn3][f];
-                    break;
-                }
-                if (threadMap[eqn3][f] == -1) {
-                    threadMap[eqn3][f] = e;
-                    ++threadTally[eqn3][f];
-                    break;
-                }
-            }
+            assert(C[i] == neq || j < MAX_EQN);
         }
-        for(i=0;i<MAX_EQN;i++) {
-#if 0
-            Au[C[i]] += B1[i]*U1+B2[i]*U2+B3[i]*U3;
-#endif
-            ++tally[C[i]];
-            for(f=0;f<=neq;f++) {
-                if (threadMap[C[i]][f] == e) {
-                    ++threadTally[C[i]][f];
-                    break;
-                }
-                if (threadMap[C[i]][f] == -1) {
-                    threadMap[C[i]][f] = e;
-                    ++threadTally[C[i]][f];
-                    break;
-                }
-            }
-            ++terms[C[i]][MAX_EQN].y;
-            for (f = 0; f < MAX_EQN; ++f) {
-                if (terms[C[i]][f].y == -1) {
-                    terms[C[i]][f].x = e;
-                    terms[C[i]][f].y = i;
-                    break;
-                }
-            }
-            assert(C[i] == neq || f < MAX_EQN);
-        }
-
-    }     /* end for e */
-    
-    maxAcc = 0;
-    total = 0;
-    for(e=0;e<=neq;e++) {
-        int myTally;
-        fprintf(stderr, "Au[%d]: %d times", e, tally[e]);
-        if (e < neq)
-            maxAcc = max(maxAcc, tally[e]);
-        total += tally[e];
-        myTally = 0;
-        for(f=0;f<=neq;f++) {
-            if (threadMap[e][f] == -1)
-                break;
-            fprintf(stderr, " %d(%d)", threadMap[e][f], threadTally[e][f]);
-            myTally += threadTally[e][f];
-        }
-        fprintf(stderr, " (%d times)\n", myTally);
     }
-    //fprintf(stderr, "Au[%d] == %f\n", e - 1, Au[e]);
-    fprintf(stderr, "max accesses %d\n", maxAcc);
-    fprintf(stderr, "total accesses %d\n", total);
     
-    fprintf(stderr, "\nterms:\n");
-    for(e=0;e<=neq;e++) {
-        fprintf(stderr, "Au[%d]: %d terms %s", e, terms[e][MAX_EQN].y,
-                terms[e][MAX_EQN].y > MAX_EQN ? "XXXTO" : "");
-        for (f = 0; f < MAX_EQN; ++f) {
-            if (terms[e][f].y == -1)
-                break;
-            fprintf(stderr, " %d(%d)", terms[e][f].y, terms[e][f].x);
-        }
-        fprintf(stderr, "\n");
-    }
-    
-#if 0
-    if (strip_bcs)
-        strip_bcs_from_residual(E,Au);
-#endif
-
     return;
 }
 
@@ -573,29 +430,16 @@
     
     kE.NODE = E->NODE[level][M];
     
-                                       /* XXX */
-    do {
-        int i, doff, print;
-        for (i=1;i<=kE.lmesh.NNO;i++) {
-            print = (i < 10 || i > kE.lmesh.NNO - 10);
-            if (print)
-                fprintf(stderr, "%04d:", i);
-            for (doff = 1; doff <= 3; ++doff) {
-                assert(kE.ID[i].doff[doff] == /*NSD*/ 3 * (i - 1) + doff - 1);
-                if (print)
-                    fprintf(stderr, " %d", kE.ID[i].doff[doff]);
-            }
-            if (print)
-                fprintf(stderr, "\n");
-        }
-        fprintf(stderr, "\n0 - NEQ %d\n", kE.lmesh.NEQ);
-    } while (0);
-    tally_n_assemble_del2_u(&kE);
-    do {
-        int i;
-        fprintf(stderr, "E->num_zero_resid == %d\n", kE.num_zero_resid);
-        for (i=1;i<=kE.num_zero_resid;i++)
-            fprintf(stderr, "    Au[%d] = 0.0\n", kE.zero_resid[i]);
-    } while (0);
+    collect_terms(&kE);
+    
     assert(0);
+    
+    do_gauss_seidel(
+        &kE,
+        d0[M],
+        F[M], Ad[M],
+        acc,
+        cycles,
+        guess
+        );
 }


