[cig-commits] r15456 - mc/3D/CitcomS/trunk/lib
leif at geodynamics.org
Fri Jul 10 15:15:02 PDT 2009
Author: leif
Date: 2009-07-10 15:15:02 -0700 (Fri, 10 Jul 2009)
New Revision: 15456
Modified:
mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
Pardon my dust... tally_n_assemble_del2_u() is instrumentation and
printfs I wrote in order to understand n_assemble_del2_u(). As with
e_assemble_del2_u(), it appears I need one block for each Au[i]
element, with a reduction in shared memory.
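For reference, a minimal sketch of that pattern (not committed code): one
block computes one Au[i], each thread accumulates a strided slice of the
row's terms, and the block then reduces in shared memory. It assumes the
coefficients and column indices have first been gathered from Eqn_k /
Node_map into padded row-major arrays; the kernel name, BLOCK, and MAX_ROW
are illustrative, and the "max accesses" figure printed by the
instrumentation below is exactly what would bound MAX_ROW.

    #define BLOCK   64    /* threads per block; power of two for the reduction */
    #define MAX_ROW 128   /* padded row width; bounded by the tally's maxAcc   */

    __global__ void assemble_row_reduce(double *Au, const double *u,
                                        const double *rowK, /* coefficients, padded row-major   */
                                        const int *rowC,    /* column indices, padded row-major */
                                        const int *rowLen)  /* nonzeros per row                 */
    {
        __shared__ double partial[BLOCK];
        const int row = blockIdx.x;            /* block b owns Au[b] */
        double sum = 0.0;

        /* each thread sums a strided slice of this row's terms */
        for (int j = threadIdx.x; j < rowLen[row]; j += BLOCK)
            sum += rowK[row * MAX_ROW + j] * u[rowC[row * MAX_ROW + j]];
        partial[threadIdx.x] = sum;
        __syncthreads();

        /* tree reduction in shared memory */
        for (int s = BLOCK / 2; s > 0; s >>= 1) {
            if (threadIdx.x < s)
                partial[threadIdx.x] += partial[threadIdx.x + s];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            Au[row] = partial[0];
    }

(Launched with one block per equation, e.g.
assemble_row_reduce<<<neq, BLOCK>>>(Au, u, rowK, rowC, rowLen).)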
Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-10 21:08:14 UTC (rev 15455)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu 2009-07-10 22:15:02 UTC (rev 15456)
@@ -279,6 +279,204 @@
assert(E->parallel.nproc == 1);
}
+static void tally_n_assemble_del2_u(
+ struct Some_variables *E //,
+ //double *u, double *Au,
+ //int strip_bcs
+ )
+{
+ int e,i;
+ int eqn1,eqn2,eqn3;
+
+#if 0
+ double UU,U1,U2,U3;
+#endif
+
+ int *C;
+#if 0
+ higher_precision *B1,*B2,*B3;
+#endif
+
+ const int neq=E->lmesh.NEQ;
+ const int nno=E->lmesh.NNO;
+
+ /*
+ * Au = E->Eqn_k? * u
+ * where E->Eqn_k? is the sparse stiffness matrix
+ */
+
+ int maxAcc, total;
+ int *tally;
+ int **threadMap, **threadTally;
+ int f;
+
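+    /* tally[i]       = total number of accesses made to Au[i];
+     * threadMap[i]   = the node indices e that touch Au[i], -1-terminated;
+     * threadTally[i] = per-node access counts, parallel to threadMap[i]. */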
+ tally = (int *)malloc((neq+1) * sizeof(int));
+ threadMap = (int **)malloc((neq+1)* sizeof(int*));
+ threadTally = (int **)malloc((neq+1)* sizeof(int*));
+
+ for(e=0;e<=neq;e++) {
+ //Au[e]=0.0;
+ tally[e] = 0;
+ threadMap[e] = (int *)malloc((neq+1) * sizeof(int));
+ threadTally[e] = (int *)malloc((neq+1) * sizeof(int));
+ for(f=0;f<=neq;f++) {
+ threadMap[e][f] = -1;
+ threadTally[e][f] = 0;
+ }
+ }
+
+#if 0
+ u[neq] = 0.0;
+#endif
+
+ for(e=1;e<=nno;e++) {
+
+ eqn1=E->ID[e].doff[1];
+ eqn2=E->ID[e].doff[2];
+ eqn3=E->ID[e].doff[3];
+
+ /* could compute, but 'Node_map' is more complicated */
+ assert(eqn1 == 3*(e-1));
+ assert(eqn2 == eqn1+1);
+ assert(eqn3 == eqn1+2);
+
+ /* could put maps in constant memory */
+
+ /*
+ * Key observation: after parallelizing on 'e' (either one):
+ *
+ * ID[e].doff[1,2,3]
+ * C
+ *
+ * are fixed for each thread. => Not worth obsessing over?
+ */
+
+ /*
+ * Put Au[eqnX] into shared memory; it is accessed almost MAX_EQN=42 times.
+ *
+ * "Au[e]=0.0" should be unnecessary -- single write at end of fn
+ * from 'AuX' local var.. so actually, Au[eqnX] sits in register
+ *
+ * But what about "Au[C[i]]"???????????
+ */
+
+ /*
+ * neq vs. nno
+ *
+ * neq == 3*nno
+ * warp=32; only 2 threads wasted
+ * better: 3 warps 32*3
+ *
+ * use y for "dimension index"? (block size 32x3; nno % 32)
+ */
+
+#if 0
+ U1 = u[eqn1];
+ U2 = u[eqn2];
+ U3 = u[eqn3];
+#endif
+
+ C=E->Node_map + (e-1)*MAX_EQN;
+#if 0
+ B1=E->Eqn_k[1]+(e-1)*MAX_EQN;
+ B2=E->Eqn_k[2]+(e-1)*MAX_EQN;
+ B3=E->Eqn_k[3]+(e-1)*MAX_EQN;
+#endif
+
+ for(i=3;i<MAX_EQN;i++) {
+#if 0
+ UU = u[C[i]];
+ Au[eqn1] += B1[i]*UU;
+ Au[eqn2] += B2[i]*UU;
+ Au[eqn3] += B3[i]*UU;
+#endif
+ ++tally[eqn1];
+ ++tally[eqn2];
+ ++tally[eqn3];
+ for(f=0;f<=neq;f++) {
+ if (threadMap[eqn1][f] == e) {
+ ++threadTally[eqn1][f];
+ break;
+ }
+ if (threadMap[eqn1][f] == -1) {
+ threadMap[eqn1][f] = e;
+ ++threadTally[eqn1][f];
+ break;
+ }
+ }
+ for(f=0;f<=neq;f++) {
+ if (threadMap[eqn2][f] == e) {
+ ++threadTally[eqn2][f];
+ break;
+ }
+ if (threadMap[eqn2][f] == -1) {
+ threadMap[eqn2][f] = e;
+ ++threadTally[eqn2][f];
+ break;
+ }
+ }
+ for(f=0;f<=neq;f++) {
+ if (threadMap[eqn3][f] == e) {
+ ++threadTally[eqn3][f];
+ break;
+ }
+ if (threadMap[eqn3][f] == -1) {
+ threadMap[eqn3][f] = e;
+ ++threadTally[eqn3][f];
+ break;
+ }
+ }
+ }
+ for(i=0;i<MAX_EQN;i++) {
+#if 0
+ Au[C[i]] += B1[i]*U1+B2[i]*U2+B3[i]*U3;
+#endif
+ ++tally[C[i]];
+ for(f=0;f<=neq;f++) {
+ if (threadMap[C[i]][f] == e) {
+ ++threadTally[C[i]][f];
+ break;
+ }
+ if (threadMap[C[i]][f] == -1) {
+ threadMap[C[i]][f] = e;
+ ++threadTally[C[i]][f];
+ break;
+ }
+ }
+ }
+
+ } /* end for e */
+
+ maxAcc = 0;
+ total = 0;
+ for(e=0;e<=neq;e++) {
+ int myTally;
+ fprintf(stderr, "Au[%d]: %d times", e, tally[e]);
+ if (e < neq)
+ maxAcc = max(maxAcc, tally[e]);
+ total += tally[e];
+ myTally = 0;
+ for(f=0;f<=neq;f++) {
+ if (threadMap[e][f] == -1)
+ break;
+ fprintf(stderr, " %d(%d)", threadMap[e][f], threadTally[e][f]);
+ myTally += threadTally[e][f];
+ }
+ fprintf(stderr, " (%d times)\n", myTally);
+ }
+ //fprintf(stderr, "Au[%d] == %f\n", e - 1, Au[e]);
+ fprintf(stderr, "max accesses %d\n", maxAcc);
+ fprintf(stderr, "total accesses %d\n", total);
+ assert(0);
+
+#if 0
+ if (strip_bcs)
+ strip_bcs_from_residual(E,Au);
+#endif
+
+ return;
+}
+
extern "C" void gauss_seidel(
struct All_variables *E,
double **d0,
@@ -332,5 +530,6 @@
}
fprintf(stderr, "\n0 - NEQ %d\n", kE.lmesh.NEQ);
} while (0);
+ tally_n_assemble_del2_u(&kE);
assert(0);
}
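For reference, a sketch of the 32x3 launch geometry floated in the comments
of tally_n_assemble_del2_u() above (threadIdx.x picks the node within a
warp, threadIdx.y picks the degree of freedom; the kernel and variable names
are illustrative only):

    __global__ void eqn_index_demo(int *eqn_of_thread, int nno)
    {
        int e = blockIdx.x * blockDim.x + threadIdx.x + 1; /* 1-based node id   */
        if (e > nno) return;                               /* guard final block */
        int eqn = 3 * (e - 1) + threadIdx.y;               /* eqn1, eqn2, eqn3  */
        eqn_of_thread[eqn] = eqn;
    }

    /* launch:
     *   dim3 block(32, 3);              // one warp of nodes x 3 dofs
     *   dim3 grid((nno + 31) / 32);
     *   eqn_index_demo<<<grid, block>>>(d_eqn, nno);
     */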