[cig-commits] r15067 - mc/3D/CitcomS/trunk/lib

leif at geodynamics.org leif at geodynamics.org
Tue May 26 18:28:55 PDT 2009


Author: leif
Date: 2009-05-26 18:28:55 -0700 (Tue, 26 May 2009)
New Revision: 15067

Modified:
   mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu
Log:
Sketched data-parallel version of e_assemble_del2_u().


Modified: mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu	2009-05-27 00:47:21 UTC (rev 15066)
+++ mc/3D/CitcomS/trunk/lib/cgrad_kernel.cu	2009-05-27 01:28:55 UTC (rev 15067)
@@ -26,6 +26,20 @@
 };
 
 
+struct octoterm {
+    /* an octoterm is 8 (ENODES) * 3 terms */
+    int e;
+    int a;
+    int offset;
+};
+
+
+struct matrix_mult {
+    int n; /* number of octoterms: 1, 2, 4, or 8 */
+    struct octoterm *ot;
+};
+
+
 struct Some_variables {
     int num_zero_resid;
     int *zero_resid;
@@ -57,6 +71,8 @@
     double *memory;
     int memoryDim;
     
+    struct matrix_mult *mm; /* dim is 'neq' */
+    
     /* outputs */
     
     struct /*MONITOR*/ {
@@ -87,6 +103,143 @@
 /*------------------------------------------------------------------------*/
 /* from Element_calculations.c */
 
+static int e_tally(
+    struct Some_variables *E
+    )
+{
+    int  e,i,a,a1,a2,a3,ii,nTotal;
+
+    const int nel=E->lmesh.NEL;
+    const int neq=E->lmesh.NEQ;
+    
+    nTotal = 0;
+    for(i=0;i<neq;i++) {
+        E->mm[i].n = 0;
+    }
+
+    for(e=1;e<=nel;e++)   {
+        
+        for(a=1;a<=ENODES;a++) {
+            
+            ii = E->IEN[e].node[a];
+            a1 = E->ID[ii].doff[1];
+            a2 = E->ID[ii].doff[2];
+            a3 = E->ID[ii].doff[3];
+            
+            ++E->mm[a1].n;
+            ++E->mm[a2].n;
+            ++E->mm[a3].n;
+            nTotal += 3;
+            
+        }             /* end for loop a */
+
+    }          /* end for e */
+    
+    /* return the total number of octoterms */
+    return nTotal;
+}
+
+static void e_map(
+    struct Some_variables *E,
+    struct octoterm *ot
+    )
+{
+    int  e,i,a,a1,a2,a3,o1,o2,o3,ii;
+
+    const int nel=E->lmesh.NEL;
+    const int neq=E->lmesh.NEQ;
+
+    for(i=0;i<neq;i++) {
+        E->mm[i].ot = ot;
+        ot += E->mm[i].n;
+        E->mm[i].n = 0;
+    }
+
+    for(e=1;e<=nel;e++)   {
+        
+        for(a=1;a<=ENODES;a++) {
+            
+            ii = E->IEN[e].node[a];
+            a1 = E->ID[ii].doff[1];
+            a2 = E->ID[ii].doff[2];
+            a3 = E->ID[ii].doff[3];
+            
+            o1 = E->mm[a1].n++;
+            o2 = E->mm[a2].n++;
+            o3 = E->mm[a3].n++;
+            
+            E->mm[a1].ot[o1].e = e;
+            E->mm[a1].ot[o1].a = a;
+            E->mm[a1].ot[o1].offset = 0;
+            
+            E->mm[a2].ot[o2].e = e;
+            E->mm[a2].ot[o2].a = a;
+            E->mm[a2].ot[o2].offset = LOC_MAT_SIZE;
+            
+            E->mm[a3].ot[o3].e = e;
+            E->mm[a3].ot[o3].a = a;
+            E->mm[a3].ot[o3].offset = LOC_MAT_SIZE + LOC_MAT_SIZE;
+            
+        }             /* end for loop a */
+
+    }          /* end for e */
+    
+    return;
+}
+
+__device__ void dp_e_assemble_del2_u(
+    struct Some_variables *E,
+    double *u, double *Au,
+    int strip_bcs
+    )
+{
+    int  e,i,a,b,o,ii,nodeb,offset;
+    double sum;
+
+    const int neq=E->lmesh.NEQ;
+    
+    for (i = 0; i < neq; i++) {
+        
+        sum = 0.0;
+        
+        /* ENODES*ENODES = 8*8 = 64 threads per block */
+        /* XXX: 8*(8-n) wasted threads */
+        
+        for (o = 0; o < E->mm[i].n; o++) {
+            
+            e      = E->mm[i].ot[o].e;
+            a      = E->mm[i].ot[o].a;
+            offset = E->mm[i].ot[o].offset;
+            
+            for (b = 1; b <= ENODES; b++) {
+                
+                /* each thread computes three terms */
+
+                nodeb = E->IEN[e].node[b];
+                ii = (a*LOC_MAT_SIZE+b)*NSD-(NSD*LOC_MAT_SIZE+NSD);
+                
+                /* XXX: must reduce here */
+                sum +=
+                    E->elt_k[e].k[ii+offset] *
+                    u[E->ID[nodeb].doff[1]]
+                    + E->elt_k[e].k[ii+offset+1] *
+                    u[E->ID[nodeb].doff[2]]
+                    + E->elt_k[e].k[ii+offset+2] *
+                    u[E->ID[nodeb].doff[3]];
+                
+            }
+        }
+        
+        /* each block writes one element, Au[i] */
+        Au[i] = sum;
+    }
+
+    if (strip_bcs)
+        strip_bcs_from_residual(E,Au);
+
+    return;
+}
+
 __device__ void e_assemble_del2_u(
     struct Some_variables *E,
     double *u, double *Au,
@@ -214,6 +367,8 @@
 {
     if (E->control.NASSEMBLE)
         n_assemble_del2_u(E,u,Au,strip_bcs);
+    else if (1)
+        dp_e_assemble_del2_u(E,u,Au,strip_bcs);
     else
         e_assemble_del2_u(E,u,Au,strip_bcs);
 
@@ -403,6 +558,8 @@
     )
 {
     struct Some_variables kE;
+    struct octoterm *ot;
+    int n_octoterms;
     
     assert_assumptions(E, high_lev);
     
@@ -438,6 +595,11 @@
                  ;
     kE.memory = (double *)malloc(kE.memoryDim*sizeof(double));
     
+    kE.mm = (struct matrix_mult *)malloc(kE.lmesh.NEQ * sizeof(struct matrix_mult));
+    n_octoterms = e_tally(&kE);
+    ot = (struct octoterm *)malloc(n_octoterms * sizeof(struct octoterm));
+    e_map(&kE, ot);
+    
     /* zero outputs */
     kE.monitor.momentum_residual = 0.0;
     kE.valid = 0;
@@ -451,6 +613,8 @@
     E->monitor.momentum_residual = kE.monitor.momentum_residual;
     
     free(kE.memory);
+    free(kE.mm);
+    free(ot);
     
     return kE.valid;
 }



More information about the CIG-COMMITS mailing list