[cig-commits] r15072 - in mc/3D/CitcomS/trunk: . lib

leif at geodynamics.org
Wed May 27 18:50:40 PDT 2009


Author: leif
Date: 2009-05-27 18:50:40 -0700 (Wed, 27 May 2009)
New Revision: 15072

Modified:
   mc/3D/CitcomS/trunk/configure.ac
   mc/3D/CitcomS/trunk/lib/General_matrix_functions.c
   mc/3D/CitcomS/trunk/lib/Makefile.am
   mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu
Log:
CUDA-ized e_assemble_del2_u(), and ran it successfully -- and slowly!
-- under the device emulator.  Added a new 'configure' option:
"--with-cuda".


Modified: mc/3D/CitcomS/trunk/configure.ac
===================================================================
--- mc/3D/CitcomS/trunk/configure.ac	2009-05-27 22:01:24 UTC (rev 15071)
+++ mc/3D/CitcomS/trunk/configure.ac	2009-05-28 01:50:40 UTC (rev 15072)
@@ -53,6 +53,12 @@
         [use Exchanger @<:@default=auto@:>@])],
     [want_exchanger="$withval"],
     [want_exchanger=auto])
+AC_ARG_WITH([cuda],
+    [AC_HELP_STRING([--with-cuda],
+        [use CUDA @<:@default=no@:>@])],
+    [want_cuda="$withval"],
+    [want_cuda=no])
+AM_CONDITIONAL([COND_CUDA], [test "$want_cuda" = yes])
 
 # Checks for programs.
 AC_PROG_CC([mpicc hcc mpcc mpcc_r mpxlc cmpicc gcc cc cl icc ecc pgcc xlc xlc_r])
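
Note: CUDA support stays off by default here; assuming the usual
AC_ARG_WITH semantics, it is enabled explicitly at configure time with
./configure --with-cuda, which sets the COND_CUDA automake conditional
consumed by lib/Makefile.am below.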

Modified: mc/3D/CitcomS/trunk/lib/General_matrix_functions.c
===================================================================
--- mc/3D/CitcomS/trunk/lib/General_matrix_functions.c	2009-05-27 22:01:24 UTC (rev 15071)
+++ mc/3D/CitcomS/trunk/lib/General_matrix_functions.c	2009-05-28 01:50:40 UTC (rev 15072)
@@ -45,6 +45,7 @@
     Iterative solver also using multigrid  ........
     ===========================================================  */
 
+#ifndef USE_CUDA
 int solve_del2_u(E,d0,F,acc,high_lev)
      struct All_variables *E;
      double **d0;
@@ -145,6 +146,7 @@
 
   return(valid);
 }
+#endif /* !USE_CUDA */
 
 /* =================================
    recursive multigrid function ....
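
Note: the new guard compiles the C implementation of solve_del2_u()
out of General_matrix_functions.c whenever USE_CUDA is defined, so the
extern "C" replacement in cgrad_kernel.cu (below) becomes the only
definition of that symbol the linker sees.  A minimal sketch of the
pattern, with hypothetical names:

    /* host.c -- always built */
    #ifndef USE_CUDA
    int solve(double *x)            /* original C implementation */
    {
        /* ... iterative solve on the CPU ... */
        return 1;
    }
    #endif /* !USE_CUDA */

    /* kernel.cu -- built only under COND_CUDA; C linkage keeps
       every existing call site unchanged */
    extern "C" int solve(double *x)
    {
        /* ... CUDA-backed solve ... */
        return 1;
    }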

Modified: mc/3D/CitcomS/trunk/lib/Makefile.am
===================================================================
--- mc/3D/CitcomS/trunk/lib/Makefile.am	2009-05-27 22:01:24 UTC (rev 15071)
+++ mc/3D/CitcomS/trunk/lib/Makefile.am	2009-05-28 01:50:40 UTC (rev 15072)
@@ -47,6 +47,17 @@
     AM_CPPFLAGS += -DUSE_HDF5
 endif
 
+CUDA_SOURCES =
+
+if COND_CUDA
+AM_CPPFLAGS += -DUSE_CUDA
+
+.cu.o:
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libCitcomS_a_CFLAGS) $(CFLAGS) -c -o $@ $<
+
+CUDA_SOURCES += cgrad_kernel.cu
+endif
+
 # static library
 libCitcomS_a_CFLAGS = $(AM_CFLAGS) # hack for automake
 libCitcomS_a_SOURCES = $(sources)
@@ -136,7 +147,8 @@
 	Regional_solver.c \
 	Regional_sphere_related.c \
 	Regional_tracer_advection.c \
-	Regional_version_dependent.c
+	Regional_version_dependent.c \
+	$(CUDA_SOURCES)
 
 EXTRA_DIST = \
 	Obsolete.c \

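Note on the .cu.o suffix rule: automake has no native CUDA support, so
the rule above hands .cu files to $(CC) with the ordinary C flags --
which presumably only works when the tree is configured with CC set to
nvcc (under which the device emulation mentioned in the log would be
selected with nvcc's -deviceemu flag).
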
Modified: mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu	2009-05-27 22:01:24 UTC (rev 15071)
+++ mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu	2009-05-28 01:50:40 UTC (rev 15072)
@@ -5,16 +5,6 @@
 #include "global_defs.h"
 
 
-#ifdef __CUDACC__
-#undef assert
-#define assert(c)
-#else
-#define __device__ static inline
-#define __global__ static
-#define __host__
-#endif
-
-
 enum {
     LEVEL = 0,
     CAPS_PER_PROC = 1,
@@ -36,7 +26,7 @@
 
 struct matrix_mult {
     int n; /* number of octoterms: 1, 2, 4, or 8 */
-    struct octoterm *ot;
+    int ot_i; /* index of octoterm struct in 'ot' array */
     int zero_res; /* boolean */
 };
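
Note: replacing the 'struct octoterm *' with an integer index is what
makes matrix_mult safe to copy to the GPU: a host pointer embedded in
a struct is meaningless once that struct lands in device memory,
whereas an index remains valid relative to whichever copy of the 'ot'
array -- host or device -- it is rebased against.  A minimal sketch of
the idea, with hypothetical names:

    struct record { double v; };

    struct ref_bad  { struct record *r; }; /* host pointer: dangles on the GPU */
    struct ref_good { int r_i; };          /* index: valid in either address space */

    __global__ void use_ref(struct record *d_rec, struct ref_good *d_ref)
    {
        /* rebase the index against the *device* copy of the array */
        d_rec[d_ref[blockIdx.x].r_i].v += 1.0;
    }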
 
@@ -73,6 +63,8 @@
     int memoryDim;
     
     struct matrix_mult *mm; /* dim is 'neq' */
+    int n_octoterms;
+    struct octoterm *ot;
     
     /* outputs */
     
@@ -87,7 +79,7 @@
 /*------------------------------------------------------------------------*/
 /* from BC_util.c */
 
-__device__ void strip_bcs_from_residual(
+void strip_bcs_from_residual(
     struct Some_variables *E,
     double *Res
     )
@@ -102,7 +94,6 @@
 
 
 /*------------------------------------------------------------------------*/
-/* from Element_calculations.c */
 
 static int e_tally(
     struct Some_variables *E
@@ -146,17 +137,18 @@
 }
 
 static void e_map(
-    struct Some_variables *E,
-    struct octoterm *ot
+    struct Some_variables *E
     )
 {
     int  e,i,a,a1,a2,a3,o1,o2,o3,ii;
+    struct octoterm *ot;
 
     const int nel=E->lmesh.NEL;
     const int neq=E->lmesh.NEQ;
-
+    
+    ot = E->ot;
     for(i=0;i<neq;i++) {
-        E->mm[i].ot = ot;
+        E->mm[i].ot_i = ot - E->ot;
         ot += E->mm[i].n;
         E->mm[i].n = 0;
     }
@@ -174,17 +166,20 @@
             o2 = E->mm[a2].n++;
             o3 = E->mm[a3].n++;
             
-            E->mm[a1].ot[o1].e = e;
-            E->mm[a1].ot[o1].a = a;
-            E->mm[a1].ot[o1].offset = 0;
+            ot = E->ot + E->mm[a1].ot_i + o1;
+            ot->e = e;
+            ot->a = a;
+            ot->offset = 0;
             
-            E->mm[a2].ot[o2].e = e;
-            E->mm[a2].ot[o2].a = a;
-            E->mm[a2].ot[o2].offset = LOC_MAT_SIZE;
+            ot = E->ot + E->mm[a2].ot_i + o2;
+            ot->e = e;
+            ot->a = a;
+            ot->offset = LOC_MAT_SIZE;
             
-            E->mm[a3].ot[o3].e = e;
-            E->mm[a3].ot[o3].a = a;
-            E->mm[a3].ot[o3].offset = LOC_MAT_SIZE + LOC_MAT_SIZE;
+            ot = E->ot + E->mm[a3].ot_i + o3;
+            ot->e = e;
+            ot->a = a;
+            ot->offset = LOC_MAT_SIZE + LOC_MAT_SIZE;
             
         }             /* end for loop a */
 
@@ -193,45 +188,63 @@
     return;
 }
 
-__device__ void dp_e_assemble_del2_u(
+/* based on the function from Element_calculations.c */
+
+__global__ void e_assemble_del2_u(
     struct Some_variables *E,
     double *u, double *Au,
     int strip_bcs
     )
 {
     int  e,i,a,b,o,ii,nodeb,offset;
-    double sum;
-
-    const int neq=E->lmesh.NEQ;
+    struct octoterm *ot;
     
-    for (i = 0; i < neq; i++) {
+    /* ENODES*ENODES = 8*8 = 64 threads (2 warps) per block */
+    __shared__ double sum[ENODES*ENODES];
+    unsigned int tid = ENODES*threadIdx.y + threadIdx.x; /* 0 <= tid < 64 */
+    
+    do {
+        i = blockIdx.x; /* 0 <= i < E->lmesh.NEQ */
         
-        /* ENODES*ENODES = 8*8 = 64 threads per block */
-        /* XXX: 8*(8-n) wasted threads */
+        /***
+         * This is block 'i'.
+         */
         
-        sum = 0.0;
-        
         if (strip_bcs && E->mm[i].zero_res) {
             
             /* no-op: Au[i] is zero */
-        
+            sum[tid] = 0.0; /* but only sum[0] is used */
+            
+            /* XXX: How many blocks are wasted here? */
+            
         } else {
             
-            for (o = 0; o < E->mm[i].n; o++) {
+            o = threadIdx.x; /* 0 <= o < ENODES */
+            
+            if (o < E->mm[i].n) {
                 
-                e      = E->mm[i].ot[o].e;
-                a      = E->mm[i].ot[o].a;
-                offset = E->mm[i].ot[o].offset;
+                /****
+                 * This is thread group 'o' in block 'i'.
+                 * Each group of 8 threads handles one 'octoterm'.
+                 */
                 
-                for (b = 1; b <= ENODES; b++) {
+                ot = E->ot + E->mm[i].ot_i + o;
+                e      = ot->e;
+                a      = ot->a;
+                offset = ot->offset;
+                
+                do {
+                    b = threadIdx.y + 1; /* 1 <= b <= ENODES */
                     
-                    /* each thread computes three terms */
+                    /****
+                     * This is thread '(o, b)' in block 'i'.
+                     * Each thread computes three terms.
+                     */
                     
                     nodeb = E->IEN[e].node[b];
                     ii = (a*LOC_MAT_SIZE+b)*NSD-(NSD*LOC_MAT_SIZE+NSD);
                     
-                    /* XXX: must reduce here */
-                    sum +=
+                    sum[tid] =
                         E->elt_k[e].k[ii+offset] *
                         u[E->ID[nodeb].doff[1]]
                         + E->elt_k[e].k[ii+offset+1] *
@@ -239,160 +252,46 @@
                         + E->elt_k[e].k[ii+offset+2] *
                         u[E->ID[nodeb].doff[3]];
                     
-                }
+                } while (0);
                 
+            } else {
+                /* XXX: 8-n wasted threads per block go through here */
+                sum[tid] = 0.0;
             }
             
+            __syncthreads();
+            
+            /* Reduce the partial sums for this block.
+               Based on reduce2() in the CUDA SDK. */
+            for (unsigned int s = ENODES*ENODES / 2; s > 0; s >>= 1) 
+            {
+                if (tid < s) {
+                    sum[tid] += sum[tid + s];
+                }
+                /* XXX: This is unnecessary, since only a single
+                   "warp" is involved. */
+                __syncthreads();
+            }
         }
         
-        /* each block writes one element, Au[i] */
-        Au[i] = sum;
+        /* XXX: ??? */
+        __syncthreads();
 
-    }
-
-    return;
-}
-
-__device__ void e_assemble_del2_u(
-    struct Some_variables *E,
-    double *u, double *Au,
-    int strip_bcs
-    )
-{
-    int  e,i,a,b,a1,a2,a3,ii,nodeb;
-
-    const int nel=E->lmesh.NEL;
-    const int neq=E->lmesh.NEQ;
-
-    for(i=0;i<neq;i++)
-        Au[i] = 0.0;
-
-    for(e=1;e<=nel;e++)   {
-        for(a=1;a<=ENODES;a++) {
-            ii = E->IEN[e].node[a];
-            a1 = E->ID[ii].doff[1];
-            a2 = E->ID[ii].doff[2];
-            a3 = E->ID[ii].doff[3];
-            for(b=1;b<=ENODES;b++) {
-                nodeb = E->IEN[e].node[b];
-                ii = (a*LOC_MAT_SIZE+b)*NSD-(NSD*LOC_MAT_SIZE+NSD);
-                /* i=1, j=1,2 */
-                /* i=1, j=1,2,3 */
-                Au[a1] +=
-                    E->elt_k[e].k[ii] *
-                    u[E->ID[nodeb].doff[1]]
-                    + E->elt_k[e].k[ii+1] *
-                    u[E->ID[nodeb].doff[2]]
-                    + E->elt_k[e].k[ii+2] *
-                    u[E->ID[nodeb].doff[3]];
-                /* i=2, j=1,2,3 */
-                Au[a2] +=
-                    E->elt_k[e].k[ii+LOC_MAT_SIZE] *
-                    u[E->ID[nodeb].doff[1]]
-                    + E->elt_k[e].k[ii+LOC_MAT_SIZE+1] *
-                    u[E->ID[nodeb].doff[2]]
-                    + E->elt_k[e].k[ii+LOC_MAT_SIZE+2] *
-                    u[E->ID[nodeb].doff[3]];
-                /* i=3, j=1,2,3 */
-                Au[a3] +=
-                    E->elt_k[e].k[ii+LOC_MAT_SIZE+LOC_MAT_SIZE] *
-                    u[E->ID[nodeb].doff[1]]
-                    + E->elt_k[e].k[ii+LOC_MAT_SIZE+LOC_MAT_SIZE+1] *
-                    u[E->ID[nodeb].doff[2]]
-                    + E->elt_k[e].k[ii+LOC_MAT_SIZE+LOC_MAT_SIZE+2] *
-                    u[E->ID[nodeb].doff[3]];
-
-            }         /* end for loop b */
-        }             /* end for loop a */
-
-    }          /* end for e */
-    
-    if(strip_bcs)
-        strip_bcs_from_residual(E,Au);
-
-    return;
-}
-
-__device__ void n_assemble_del2_u(
-    struct Some_variables *E,
-    double *u, double *Au,
-    int strip_bcs
-    )
-{
-    int e,i;
-    int eqn1,eqn2,eqn3;
-
-    double UU,U1,U2,U3;
-
-    int *C;
-    higher_precision *B1,*B2,*B3;
-
-    const int neq=E->lmesh.NEQ;
-    const int nno=E->lmesh.NNO;
-    
-    /*
-     * Au = E->Eqn_k? * u
-     *  where E->Eqn_k? is the sparse stiffness matrix
-     */
-
-    for(e=0;e<=neq;e++)
-        Au[e]=0.0;
-
-    u[neq] = 0.0;
-
-    for(e=1;e<=nno;e++)     {
-
-        eqn1=E->ID[e].doff[1];
-        eqn2=E->ID[e].doff[2];
-        eqn3=E->ID[e].doff[3];
-
-        U1 = u[eqn1];
-        U2 = u[eqn2];
-        U3 = u[eqn3];
-
-        C=E->Node_map + (e-1)*MAX_EQN;
-        B1=E->Eqn_k1+(e-1)*MAX_EQN;
-        B2=E->Eqn_k2+(e-1)*MAX_EQN;
-        B3=E->Eqn_k3+(e-1)*MAX_EQN;
-
-        for(i=3;i<MAX_EQN;i++)  {
-            UU = u[C[i]];
-            Au[eqn1] += B1[i]*UU;
-            Au[eqn2] += B2[i]*UU;
-            Au[eqn3] += B3[i]*UU;
+        /* each block writes one element, Au[i], in global memory */
+        if (tid == 0) {
+            Au[i] = sum[0];
         }
-        for(i=0;i<MAX_EQN;i++)
-            Au[C[i]] += B1[i]*U1+B2[i]*U2+B3[i]*U3;
 
-    }     /* end for e */
-    
-    if (strip_bcs)
-        strip_bcs_from_residual(E,Au);
+    } while (0);
 
     return;
 }
 
-__device__ void assemble_del2_u(
-    struct Some_variables *E,
-    double *u, double *Au,
-    int strip_bcs
-    )
-{
-    if (E->control.NASSEMBLE)
-        n_assemble_del2_u(E,u,Au,strip_bcs);
-    else if (1)
-        dp_e_assemble_del2_u(E,u,Au,strip_bcs);
-    else
-        e_assemble_del2_u(E,u,Au,strip_bcs);
 
-    return;
-}
-
-
 /*------------------------------------------------------------------------*/
 /* from Global_operations.c */
 
-__device__ double global_vdot(
+double global_vdot(
     struct Some_variables *E,
     double *A, double *B
     )
@@ -411,9 +310,64 @@
 
 
 /*------------------------------------------------------------------------*/
+
+static void construct_E(
+    struct Some_variables **d_E,
+    struct Some_variables *s_E, /* host's shadow copy of d_E */
+    struct Some_variables *E
+    )
+{
+    /* construct a copy of 'E' in device memory */
+    
+    int neq = E->lmesh.NEQ;
+    int nno = E->lmesh.NNO;
+    int nel = E->lmesh.NEL;
+    
+    memset(s_E, 0, sizeof(struct Some_variables));
+    
+    /* mm, ot */
+    cudaMalloc((void**)&s_E->mm, neq * sizeof(struct matrix_mult));
+    cudaMalloc((void**)&s_E->ot, E->n_octoterms * sizeof(struct octoterm));
+    cudaMemcpy(s_E->mm, E->mm, neq * sizeof(struct matrix_mult), cudaMemcpyHostToDevice);
+    cudaMemcpy(s_E->ot, E->ot, E->n_octoterms * sizeof(struct octoterm), cudaMemcpyHostToDevice);
+    
+    /* IEN -- cf. allocate_common_vars() */
+    cudaMalloc((void**)&s_E->IEN, (nel+2)*sizeof(struct IEN));
+    cudaMemcpy(s_E->IEN, E->IEN, (nel+2)*sizeof(struct IEN), cudaMemcpyHostToDevice);
+    
+    /* ID -- cf. allocate_common_vars()*/
+    cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
+    cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);
+    
+    /* elt_k -- cf. general_stokes_solver_setup() */
+    cudaMalloc((void **)&s_E->elt_k, (nel+1)*sizeof(struct EK));
+    cudaMemcpy(s_E->elt_k, E->elt_k, (nel+1)*sizeof(struct EK), cudaMemcpyHostToDevice);
+    
+    /* E */
+    cudaMalloc((void**)d_E, sizeof(Some_variables));
+    cudaMemcpy(*d_E, s_E, sizeof(Some_variables), cudaMemcpyHostToDevice);
+    
+    return;
+}
+
+static void destroy_E(
+    struct Some_variables *d_E,
+    struct Some_variables *s_E
+    )
+{
+    cudaFree(s_E->mm);
+    cudaFree(s_E->ot);
+    cudaFree(s_E->IEN);
+    cudaFree(s_E->ID);
+    cudaFree(s_E->elt_k);
+    cudaFree(d_E);
+}
+
+
+/*------------------------------------------------------------------------*/
 /* from General_matrix_functions.c */
 
-__device__ double conj_grad(
+double conj_grad(
     struct Some_variables *E,
     double *d0,
     double *F,
@@ -456,7 +410,19 @@
 
     assert(residual != 0.0  /* initial residual for CG = 0.0 */);
     count = 0;
-
+    
+    /* pointers to device memory */
+    struct Some_variables *d_E = 0;
+    double *d_p2 = 0, *d_Ap = 0;
+    
+    /* construct 'E' on the device */
+    struct Some_variables s_E;
+    construct_E(&d_E, &s_E, E);
+    
+    /* allocate memory on the device */
+    cudaMalloc((void**)&d_p2, (1+neq)*sizeof(double));
+    cudaMalloc((void**)&d_Ap, (1+neq)*sizeof(double));
+    
     while (((residual > acc) && (count < steps)) || count == 0)  {
 
         for(i=0;i<neq;i++)
@@ -475,9 +441,26 @@
         }
 
         dotr0z0 = dotr1z1;
-
-        assemble_del2_u(E,p2,Ap,1);
-
+        
+        /********************************************/
+        /* launch e_assemble_del2_u() on the device */
+        
+        /* copy input to the device */
+        cudaMemcpy(d_p2, p2, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
+        
+        /* launch */
+        dim3 dimBlock(ENODES, ENODES, 1);
+        dim3 dimGrid(neq, 1, 1);
+        e_assemble_del2_u<<< dimGrid, dimBlock >>>(d_E, d_p2, d_Ap, 1);
+        
+        /* wait for completion */
+        cudaThreadSynchronize();
+        
+        /* copy output from device */
+        cudaMemcpy(Ap, d_Ap, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
+        
+        /********************************************/
+        
         dotprod=global_vdot(E,p2,Ap);
 
         if(0.0==dotprod)
@@ -500,6 +483,12 @@
         /* end of while-loop */
 
     }
+    
+    /* free device memory */
+    cudaFree(d_p2);
+    cudaFree(d_Ap);
+    
+    destroy_E(d_E, &s_E);
 
     *cycles=count;
 
@@ -509,7 +498,7 @@
 }
 
 
-__global__ void solve_del2_u(
+void do_solve_del2_u(
     struct Some_variables *E,
     double *d0,
     double *F,
@@ -550,6 +539,7 @@
 static void assert_assumptions(struct All_variables *E, int high_lev) {
     
     assert(!E->control.NMULTIGRID);
+    assert(!E->control.NASSEMBLE);
     
     assert(E->sphere.caps_per_proc == CAPS_PER_PROC);
     
@@ -562,7 +552,7 @@
     assert(E->parallel.Skip_neq[LEVEL][M] == 0);
 }
 
-__host__ int launch_solve_del2_u(
+extern "C" int solve_del2_u(
     struct All_variables *E,
     double **d0,
     double **F,
@@ -571,8 +561,6 @@
     )
 {
     struct Some_variables kE;
-    struct octoterm *ot;
-    int n_octoterms;
     
     assert_assumptions(E, high_lev);
     
@@ -609,16 +597,16 @@
     kE.memory = (double *)malloc(kE.memoryDim*sizeof(double));
     
     kE.mm = (struct matrix_mult *)malloc(kE.lmesh.NEQ * sizeof(struct matrix_mult));
-    n_octoterms = e_tally(&kE);
-    ot = (struct octoterm *)malloc(n_octoterms * sizeof(struct octoterm));
-    e_map(&kE, ot);
+    kE.n_octoterms = e_tally(&kE);
+    kE.ot = (struct octoterm *)malloc(kE.n_octoterms * sizeof(struct octoterm));
+    e_map(&kE);
     
     /* zero outputs */
     kE.monitor.momentum_residual = 0.0;
     kE.valid = 0;
     
     /* solve */
-    solve_del2_u(&kE, d0[M], F[M], acc);
+    do_solve_del2_u(&kE, d0[M], F[M], acc);
     
     /* get outputs */
     E->control.total_iteration_cycles = kE.control.total_iteration_cycles;
@@ -627,7 +615,7 @@
     
     free(kE.memory);
     free(kE.mm);
-    free(ot);
+    free(kE.ot);
     
     return kE.valid;
 }
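
Note: the rewritten e_assemble_del2_u() maps one CUDA block to each
equation i, and its 8x8 threads (one octoterm group o per threadIdx.x,
one element node b per threadIdx.y) each drop one three-term partial
product into shared memory; the loop that follows is the classic tree
reduction from reduce2() in the CUDA SDK.  A stripped-down sketch of
that reduction, independent of the CitcomS data structures:

    #define THREADS 64  /* ENODES*ENODES */

    __global__ void block_sum(const double *in, double *out)
    {
        __shared__ double sum[THREADS];
        unsigned int tid = threadIdx.x;

        /* each thread contributes one partial term */
        sum[tid] = in[blockIdx.x * THREADS + tid];
        __syncthreads();

        /* halve the active stride until sum[0] holds the block total */
        for (unsigned int s = THREADS / 2; s > 0; s >>= 1) {
            if (tid < s)
                sum[tid] += sum[tid + s];
            __syncthreads();
        }

        /* one thread per block writes the result to global memory */
        if (tid == 0)
            out[blockIdx.x] = sum[0];
    }

As the XXX comment in the kernel notes, once s drops below the warp
size the remaining adds run in lock step within a single warp, so the
trailing __syncthreads() calls could be dropped on the hardware of the
day.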

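Note: construct_E()/destroy_E() follow the usual deep-copy idiom for
nested structures: allocate each pointed-to array on the device, patch
the device pointers into a host-side shadow of the struct, and only
then copy the shadow itself across, so the struct the kernel
dereferences contains device addresses throughout.  A self-contained
sketch of the same idiom (hypothetical names; error checking omitted,
as in the commit):

    struct table {
        int n;
        double *data;  /* a device pointer once the copy is made */
    };

    static void construct_table(struct table **d_t, struct table *s_t,
                                const struct table *h_t)
    {
        *s_t = *h_t;  /* shadow copy: same scalars, pointers to be patched */
        cudaMalloc((void **)&s_t->data, h_t->n * sizeof(double));
        cudaMemcpy(s_t->data, h_t->data, h_t->n * sizeof(double),
                   cudaMemcpyHostToDevice);

        /* now the patched shadow can be copied to the device whole */
        cudaMalloc((void **)d_t, sizeof(struct table));
        cudaMemcpy(*d_t, s_t, sizeof(struct table), cudaMemcpyHostToDevice);
    }

    static void destroy_table(struct table *d_t, struct table *s_t)
    {
        cudaFree(s_t->data);  /* the shadow remembers the device pointers */
        cudaFree(d_t);
    }

The launch in conj_grad() then hands the device struct straight to the
kernel: e_assemble_del2_u<<< dimGrid, dimBlock >>>(d_E, d_p2, d_Ap, 1).
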

