[cig-commits] r15472 - mc/3D/CitcomS/trunk/lib

leif at geodynamics.org leif at geodynamics.org
Wed Jul 22 20:50:26 PDT 2009


Author: leif
Date: 2009-07-22 20:50:26 -0700 (Wed, 22 Jul 2009)
New Revision: 15472

Modified:
   mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
This junk is my effort to understand gauss_seidel_2().  Because of the
way E->temp[] is used, it is quite difficult (impossible?) to
parallelize.  I even wrote code to generate a "dot" graph of the
expression tree.  Of course, this tree is too big to plot in its
entirety.  With neq == 2187, the tree has 87479 nodes and is 355
levels deep; each level can have anywhere from a handful to several
hundred nodes, with over a thousand nodes in each of the bottommost
layers.


Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================
--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-22 20:17:11 UTC (rev 15471)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-23 03:50:26 UTC (rev 15472)
@@ -538,6 +538,357 @@
 }
 
 
+void oy_gauss_seidel_2(
+    struct Some_variables *E
+    )
+{
+    const int neq = E->lmesh.NEQ;
+    const int nno = E->lmesh.NNO;
+    
+    int *ti = (int *)malloc(3* nno * sizeof(int));
+    int *ti_tally = (int *)malloc(3 * nno * sizeof(int));
+    for (int i = 1; i <= nno; i++) {
+        ti[i - 1] = 0;
+        ti_tally[i - 1] = 0;
+    }
+    
+    int *back = ti;
+    
+    int *temp = (int *)malloc((neq+1)*sizeof(int));
+    int *temp_pre = (int *)malloc((neq+1)*sizeof(int));
+    int *temp_post = (int *)malloc((neq+1)*sizeof(int));
+    for (int i = 0; i <= neq; ++i) {
+        temp[i] = 0;                   /* uninit */
+        temp_pre[i] = 0;
+        temp_post[i] = 0;
+    }
+    
+    FILE *dot = fopen("gs2.dot", "w");
+    
+    fprintf(dot,
+            "digraph G {\n"
+            "    node [shape=box];\n\n");
+    
+    // job1[label="Job #1"];
+    const int DOT_MIN = 270;
+    const int DOT_MAX = 300;
+    
+    for (int i = 0; i <= neq && DOT_MIN <= i && i < DOT_MAX; ++i) {
+        fprintf(dot, "    Ad_%d_0[label=\"Ad[%d]\"];\n", i, i);
+    }
+    int *Ad = (int *)malloc((neq+1)*sizeof(int));
+    for (int i = 0; i <= neq; ++i) {
+        Ad[i] = 0;
+    }
+    
+    for (int i = 0; i <= neq && DOT_MIN <= i && i < DOT_MAX; ++i) {
+        fprintf(dot, "    temp_%d_0[label=\"temp[%d]\"];\n", i, i);
+    }
+    for (int i = 1; i <= nno; i++) {
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+        if (eqn3 < DOT_MIN || DOT_MAX <= eqn3) {
+            continue;
+        }
+        if (!(E->NODE[i] & OFFSIDE)) {
+            fprintf(dot, "    temp_%d_offside[label=\"temp[%d] OFFSIDE\"];\n", eqn1, eqn1);
+            fprintf(dot, "    temp_%d_offside[label=\"temp[%d] OFFSIDE\"];\n", eqn2, eqn2);
+            fprintf(dot, "    temp_%d_offside[label=\"temp[%d] OFFSIDE\"];\n", eqn3, eqn3);
+        }
+    }
+    for (int j = 3; j < MAX_EQN; j++) {
+        fprintf(dot, "    B1_%d_UU[label=\"B1[%d]*UU\"];\n", j, j);
+        fprintf(dot, "    B2_%d_UU[label=\"B2[%d]*UU\"];\n", j, j);
+        fprintf(dot, "    B3_%d_UU[label=\"B3[%d]*UU\"];\n", j, j);
+    }
+    fprintf(dot, "\n");
+    
+    for (int i = 1; i <= nno; i++) {
+            
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+            
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+            
+        //fprintf(stderr, "@@@ read: ");
+        for (int j = 3; j < MAX_EQN; j++) {
+            //double UU = E->temp[C[j]];
+            //fprintf(stderr, "%d ", C[j]);
+            
+            /***
+             *  This may read 'temp' elements which are not written by
+             *  the OFFSIDE code below.  BUT, if a temp element IS
+             *  written by the OFFSIDE code, the read here follows the
+             *  write.
+             */
+            
+            if (C[j] < DOT_MIN || DOT_MAX <= C[j]) {
+            } else if (temp_pre[C[j]] || temp_post[C[j]]) {
+                /* edge already written */
+            } else if (temp[C[j]]) {
+                /* offside value */
+                fprintf(dot, "    temp_%d_offside->B1_%d_UU;\n", C[j], j);
+            } else {
+                /* original value */
+                fprintf(dot, "    temp_%d_0->B1_%d_UU;\n", C[j], j);
+            }
+            if (temp[C[j]]) {
+                /* already written */
+                ++temp_post[C[j]];
+            } else {
+                ++temp_pre[C[j]];
+            }
+            for (int *x = ti; x < back; ++x) {
+                if (C[j] == *x) {
+                    int tallyho = x - ti;
+                    ++ti_tally[tallyho];
+                }
+            }
+            /*
+              double UU = E->temp[C[j]];
+              Ad[eqn1] += B1[j]*UU;
+              Ad[eqn2] += B2[j]*UU;
+              Ad[eqn3] += B3[j]*UU;
+            */
+            if (DOT_MIN <= eqn3 && eqn3 < DOT_MAX) {
+                fprintf(dot, "    Ad_%d_%d[label=\"Ad[%d]\"];\n", eqn1, Ad[eqn1] + 1, eqn1);
+                fprintf(dot, "    Ad_%d_%d[label=\"Ad[%d]\"];\n", eqn2, Ad[eqn2] + 1, eqn2);
+                fprintf(dot, "    Ad_%d_%d[label=\"Ad[%d]\"];\n", eqn3, Ad[eqn3] + 1, eqn3);
+                fprintf(dot, "    Ad_%d_%d->Ad_%d_%d;\n", eqn1, Ad[eqn1], eqn1, Ad[eqn1] + 1);
+                fprintf(dot, "    Ad_%d_%d->Ad_%d_%d;\n", eqn2, Ad[eqn2], eqn2, Ad[eqn2] + 1);
+                fprintf(dot, "    Ad_%d_%d->Ad_%d_%d;\n", eqn3, Ad[eqn3], eqn3, Ad[eqn3] + 1);
+                fprintf(dot, "    B1_%d_UU->Ad_%d_%d;\n", j, eqn1, Ad[eqn1] + 1);
+                fprintf(dot, "    B2_%d_UU->Ad_%d_%d;\n", j, eqn2, Ad[eqn2] + 1);
+                fprintf(dot, "    B3_%d_UU->Ad_%d_%d;\n", j, eqn3, Ad[eqn3] + 1);
+            }
+            ++Ad[eqn1];
+            ++Ad[eqn2];
+            ++Ad[eqn3];
+        }
+        //fprintf(stderr, "\n");
+            
+        if (!(E->NODE[i] & OFFSIDE)) {
+            fprintf(stderr, "@@@@@@ %d write: %d %d %d\n", i, eqn1, eqn2, eqn3);
+            //E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
+            //E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
+            //E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
+            ++temp[eqn1];
+            ++temp[eqn2];
+            ++temp[eqn3];
+            if (i < nno) {
+                int next_i = i + 1;
+                int *next_C = E->Node_map + (next_i-1)*MAX_EQN;
+                int flag1, flag2, flag3;
+                flag1 = flag2 = flag3 = 0;
+                for (int j = 3; j < MAX_EQN; j++) {
+                    if (next_C[j] == eqn1) {
+                        flag1 = 1;
+                    } else if (next_C[j] == eqn2) {
+                        flag2 = 1;
+                    } else if (next_C[j] == eqn3) {
+                        flag3 = 1;
+                    }
+                }
+                assert(flag1 && flag2 && flag3);
+            } else {
+                assert(0);
+            }
+            *back++ = eqn1;
+            *back++ = eqn2;
+            *back++ = eqn3;
+            if (DOT_MIN <= eqn3 && eqn3 < DOT_MAX) {
+                fprintf(dot, "    Ad_%d_%d->temp_%d_offside;\n", eqn1, Ad[eqn1], eqn1);
+                fprintf(dot, "    Ad_%d_%d->temp_%d_offside;\n", eqn2, Ad[eqn2], eqn2);
+                fprintf(dot, "    Ad_%d_%d->temp_%d_offside;\n", eqn3, Ad[eqn3], eqn3);
+            }
+        }
+            
+    }
+    
+    for (int i = 0; i <= neq; ++i) {
+        assert(temp[i] <= 1);
+        if (temp[i]) {
+            fprintf(stderr, "@@@ temp[%d] was written %d times\n", i, temp[i]);
+        }
+    }
+    for (int *x = ti; x < back; ++x) {
+        int tallyho = x - ti;
+        int eqn = *x;
+        assert(temp_pre[eqn] == 0);
+        assert(temp_pre[eqn] + temp_post[eqn] == ti_tally[tallyho]);
+        fprintf(stderr, "@@@ read tally %d for %d: %d + %d = %d\n", tallyho, eqn,
+                temp_pre[eqn], temp_post[eqn], ti_tally[tallyho]);
+    }
+    
+    fprintf(dot, "}\n");
+    fclose(dot);
+    
+}
+
+
+void piano_roll_gauss_seidel_2(
+    struct Some_variables *E
+    )
+{
+    const int neq = E->lmesh.NEQ;
+    const int nno = E->lmesh.NNO;
+    
+    int *roll = (int *)malloc(nno*MAX_EQN*sizeof(int));
+    for (int i = 0; i < nno*MAX_EQN; ++i) {
+        roll[i] = 0;
+    }
+    
+    int offsideTally = 0;
+    
+    int *Ad = (int *)malloc((neq+1)*sizeof(int));
+    for (int i = 0; i <= neq; ++i) {
+        Ad[i] = 0;
+    }
+    int *temp = (int *)malloc((neq+1)*sizeof(int));
+    for (int i = 0; i <= neq; ++i) {
+        temp[i] = 0;
+    }
+    
+    for (int i = 1; i <= nno; i++) {
+            
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+            
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+            
+        higher_precision *B1,*B2,*B3;
+        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
+        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
+        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
+            
+        for (int j = 3; j < MAX_EQN; j++) {
+            /*
+              double UU = E->temp[C[j]];
+              Ad[eqn1] += B1[j]*UU;
+              Ad[eqn2] += B2[j]*UU;
+              Ad[eqn3] += B3[j]*UU;
+            */
+            
+            int *row = roll + j*nno;
+            row[i-1] = eqn1;
+            
+            Ad[eqn1] = max(Ad[eqn1], temp[C[j]]) + 1;
+            Ad[eqn2] = max(Ad[eqn2], temp[C[j]]) + 1;
+            Ad[eqn3] = max(Ad[eqn3], temp[C[j]]) + 1;
+        }
+            
+        if (!(E->NODE[i] & OFFSIDE)) {
+            ++offsideTally;
+            /*
+              E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
+              E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
+              E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
+            */
+            temp[eqn1] = Ad[eqn1] + 1;
+            temp[eqn2] = Ad[eqn2] + 1;
+            temp[eqn3] = Ad[eqn3] + 1;
+        }
+            
+    }
+    
+    fprintf(stderr, "!offside: %d of %d\n", offsideTally, nno);
+    int maxAd = 0, maxTemp = 0;
+    for (int i = 0; i <= neq; ++i) {
+        maxAd = max(maxAd, Ad[i]);
+        maxTemp = max(maxTemp, temp[i]);
+        fprintf(stderr, "depth[%d]: Ad %d temp %d\n", i, Ad[i], temp[i]);
+    }
+    fprintf(stderr, "max Ad depth %d\n", maxAd);
+    fprintf(stderr, "max temp depth %d\n", maxTemp);
+    
+    //------------------------------------------------------------------------
+    //------------------------------------------------------------------------
+    //------------------------------------------------------------------------
+    
+    
+    int *nodes = (int *)malloc(maxAd*sizeof(int));
+    for (int i = 0; i < maxAd; ++i) {
+        nodes[i] = 0;
+    }
+    for (int i = 0; i <= neq; ++i) {
+        Ad[i] = 0;
+        temp[i] = 0;
+    }
+    
+    for (int i = 1; i <= nno; i++) {
+            
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+            
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+            
+        higher_precision *B1,*B2,*B3;
+        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
+        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
+        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
+            
+        for (int j = 3; j < MAX_EQN; j++) {
+            /*
+              double UU = E->temp[C[j]];
+              Ad[eqn1] += B1[j]*UU;
+              Ad[eqn2] += B2[j]*UU;
+              Ad[eqn3] += B3[j]*UU;
+            */
+            
+            int *row = roll + j*nno;
+            row[i-1] = eqn1;
+            
+            Ad[eqn1] = max(Ad[eqn1], temp[C[j]]) + 1;
+            Ad[eqn2] = max(Ad[eqn2], temp[C[j]]) + 1;
+            Ad[eqn3] = max(Ad[eqn3], temp[C[j]]) + 1;
+            ++nodes[Ad[eqn1]];
+            ++nodes[Ad[eqn2]];
+            ++nodes[Ad[eqn3]];
+        }
+            
+        if (!(E->NODE[i] & OFFSIDE)) {
+            ++offsideTally;
+            /*
+              E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
+              E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
+              E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
+            */
+            temp[eqn1] = Ad[eqn1] + 1;
+            temp[eqn2] = Ad[eqn2] + 1;
+            temp[eqn3] = Ad[eqn3] + 1;
+            ++nodes[temp[eqn1]];
+            ++nodes[temp[eqn2]];
+            ++nodes[temp[eqn3]];
+        }
+            
+    }
+    for (int i = 0; i <= neq; ++i) {
+        if (!Ad[i]) ++nodes[0];
+        if (!temp[i]) ++nodes[0];
+    }
+    
+    int nodeTally = 0;
+    for (int i = 0; i < maxAd; ++i) {
+        fprintf(stderr, "nodes at depth %03d %d\n", i, nodes[i]);
+        nodeTally += nodes[i];
+    }
+    fprintf(stderr, "total nodes: %d\n", nodeTally);
+        
+    if (0) for (int j = 3; j < MAX_EQN; j++) {
+        int *row = roll + j*nno;
+        fprintf(stderr, "row %d: ", j);
+        for (int i = 1; i <= nno; i++) {
+            fprintf(stderr, "%d ", row[i-1]);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+
 extern "C" void gauss_seidel(
     struct All_variables *E,
     double **d0,
@@ -575,6 +926,9 @@
     kE.NODE = E->NODE[level][M];
     
     collect_terms(&kE);
+    if (0) oy_gauss_seidel_2(&kE);
+    piano_roll_gauss_seidel_2(&kE);
+    assert(0);
     
     do_gauss_seidel(
         &kE,



More information about the CIG-COMMITS mailing list