[cig-commits] r15478 - mc/3D/CitcomS/trunk/lib

Fri Jul 24 19:48:38 PDT 2009

Author: leif
Date: 2009-07-24 19:48:38 -0700 (Fri, 24 Jul 2009)
New Revision: 15478

Modified:
   mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
Log:
Wrote two new versions of gauss_seidel_2() -- neither of which is very
parallel.  Both produce similar wrong answers.


Modified: mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu
===================================================================

--- mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-24 14:03:04 UTC (rev 15477)
+++ mc/3D/CitcomS/trunk/lib/multigrid_kernel.cu	2009-07-25 02:48:38 UTC (rev 15478)
@@ -304,6 +304,136 @@
 
 }
 
+__global__ void gauss_seidel_2a(
+    struct Some_variables *E,
+    double *F, double *Ad
+    )
+{
+    const int nno = E->lmesh.NNO;
+    
+    int j = threadIdx.x + 3; /* 3 <= j < MAX_EQN */
+    int doff = threadIdx.y + 1; /* 1 <= doff < NSD */ 
+    
+    for (int i = 1; i <= nno; i++) {
+        int eqn = E->ID[i].doff[doff];
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+        higher_precision *B = E->Eqn_k[doff] + (i-1)*MAX_EQN;
+        
+        /* Ad on boundaries differs after the following operation, but
+           no communications are needed yet, because boundary Ad will
+           not be used for the G-S iterations for interior nodes */
+        
+        __shared__ double UU[MAX_EQN-3];
+        if (threadIdx.y == 0) {
+            UU[threadIdx.x] = E->temp[C[j]];
+        }
+        __syncthreads();
+        __shared__ double Ad_eqn[NSD];
+        if (threadIdx.x == 0) {
+            Ad_eqn[threadIdx.y] = Ad[eqn];
+        }
+        __shared__ double sum[MAX_EQN-3][3];
+        /*for (int j = 3; j < MAX_EQN; j++)*/ {
+            sum[threadIdx.x][threadIdx.y] = B[j]*UU[threadIdx.x];
+        }
+        __syncthreads();
+        /* Reduce. */
+        for (unsigned int s = (MAX_EQN-3)/2; s > 0; s >>= 1) {
+            if (threadIdx.x < s) {
+                sum[threadIdx.x][threadIdx.y] += sum[threadIdx.x + s][threadIdx.y];
+            }
+            /* XXX: not always necessary */
+            __syncthreads();
+        }
+        
+        if (threadIdx.x == 0) {
+            Ad_eqn[threadIdx.y] += sum[0][threadIdx.y];
+            Ad[eqn] = Ad_eqn[threadIdx.y];
+            if (!(E->NODE[i] & OFFSIDE)) {
+                E->temp[eqn] = (F[eqn] - Ad_eqn[threadIdx.y])*E->BI[eqn];
+            }
+        }
+        __syncthreads();
+    }
+}
+
+
+__global__ void gauss_seidel_2b(
+    struct Some_variables *E,
+    double *F, double *Ad
+    )
+{
+    const int nno = E->lmesh.NNO;
+    unsigned int tid = threadIdx.x;
+    
+    for (int i = 1; i <= nno; i++) {
+        
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = eqn1 + 1;
+        int eqn3 = eqn2 + 1;
+        
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+        
+        higher_precision *B1,*B2,*B3;
+        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
+        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
+        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
+        
+        /* Ad on boundaries differs after the following operation, but
+           no communications are needed yet, because boundary Ad will
+           not be used for the G-S iterations for interior nodes */
+        
+        double Ad_eqn1, Ad_eqn2, Ad_eqn3;
+        if (tid == 0) {
+            Ad_eqn1 = Ad[eqn1];
+            Ad_eqn2 = Ad[eqn2];
+            Ad_eqn3 = Ad[eqn3];
+        }
+        
+        __shared__ double sum1[MAX_EQN-3];
+        __shared__ double sum2[MAX_EQN-3];
+        __shared__ double sum3[MAX_EQN-3];
+        
+        int j = tid + 3; /* 3 <= j < MAX_EQN */
+        /*for (int j = 3; j < MAX_EQN; j++)*/ {
+            double UU = E->temp[C[j]];
+            sum1[tid] = B1[j]*UU;
+            sum2[tid] = B2[j]*UU;
+            sum3[tid] = B3[j]*UU;
+        }
+        __syncthreads();
+        
+        /* Reduce. */
+        for (unsigned int s = (MAX_EQN-3)/2; s > 0; s >>= 1) {
+            if (tid < s) {
+                sum1[tid] += sum1[tid + s];
+                sum2[tid] += sum2[tid + s];
+                sum3[tid] += sum3[tid + s];
+            }
+            /* XXX: not always necessary */
+            __syncthreads();
+        }
+        
+        if (tid == 0) {
+            Ad_eqn1 += sum1[0];
+            Ad_eqn2 += sum2[0];
+            Ad_eqn3 += sum3[0];
+            Ad[eqn1] = Ad_eqn1;
+            Ad[eqn2] = Ad_eqn2;
+            Ad[eqn3] = Ad_eqn3;
+            if (!(E->NODE[i] & OFFSIDE)) {
+                E->temp[eqn1] = (F[eqn1] - Ad_eqn1)*E->BI[eqn1];
+                E->temp[eqn2] = (F[eqn2] - Ad_eqn2)*E->BI[eqn2];
+                E->temp[eqn3] = (F[eqn3] - Ad_eqn3)*E->BI[eqn3];
+            }
+        }
+        __syncthreads();
+        
+    }
+    
+}
+
+
 __global__ void gauss_seidel_3(
     struct Some_variables *E,
     double *d0,
@@ -451,10 +581,25 @@
             dim3 grid(E->lmesh.NNO, NSD, 1);
             if (1) gauss_seidel_1<<< grid, block >>>(d_E, d_F, d_Ad);
             else host_gauss_seidel_1(E, &s_E, d_F, d_Ad);
-            if (0) gauss_seidel_2<<< grid, block >>>(d_E, d_F, d_Ad);
-            else host_gauss_seidel_2(E, &s_E, d_F, d_Ad);
         }
         
+        if (0) {
+            dim3 block(MAX_EQN - 3, NSD, 1);
+            dim3 grid(1, 1, 1);
+            gauss_seidel_2a<<< grid, block >>>(d_E, d_F, d_Ad);
+        }
+        if (1) {
+            //dim3 block(MAX_EQN - 3, NSD, 1);
+            dim3 block(MAX_EQN - 3, 1, 1);
+            dim3 grid(1, 1, 1);
+            gauss_seidel_2b<<< grid, block >>>(d_E, d_F, d_Ad);
+        }
+        if (0) host_gauss_seidel_2(E, &s_E, d_F, d_Ad);
+        
+        if (cudaThreadSynchronize() != cudaSuccess) {
+            assert(0 && "something went wrong");
+        }
+        
         /* Ad on boundaries differs after the following operation */
         {
             dim3 block(MAX_EQN, 1, 1);
@@ -889,6 +1034,208 @@
 }
 
 
+typedef struct OpcodeStream {
+    int alloc;
+    int count;
+    int *opcodes;
+    int *operands;
+} OS;
+
+
+static void pushBackOpcode(OS *os, int opcode, int operand, OS *sub) {
+    int subCount = sub->count ? 2 + sub->count : 1;
+    if (os->count + 1 + subCount > os->alloc) {
+        do {
+            os->alloc *= 2;
+        } while (os->count + 1 + subCount > os->alloc);
+        os->opcodes = (int *)realloc(os->opcodes, os->alloc*sizeof(int));
+        os->operands = (int *)realloc(os->operands, os->alloc*sizeof(int));
+    }
+    if (sub->count) {
+        os->opcodes[os->count] = 10;
+        os->operands[os->count] = 0;
+        ++os->count;
+        for (int op = 0; op < sub->count; ++op) {
+            os->opcodes[os->count] = sub->opcodes[op];
+            os->operands[os->count] = sub->operands[op];
+            ++os->count;
+        }
+        os->opcodes[os->count] = 11;
+        os->operands[os->count] = 0;
+        ++os->count;
+    } else {
+        os->opcodes[os->count] = 0;
+        os->operands[os->count] = 0;
+        ++os->count;
+    }
+    os->opcodes[os->count] = opcode;
+    os->operands[os->count] = operand;
+    ++os->count;
+}
+
+void opcode_gauss_seidel_2(
+    struct Some_variables *E
+    )
+{
+    const int neq = E->lmesh.NEQ;
+    const int nno = E->lmesh.NNO;
+    
+    OS *Ad = (OS *)malloc((neq+1)*sizeof(OS));
+    for (int i = 0; i <= neq; ++i) {
+        Ad[i].alloc = 2;
+        Ad[i].count = 0;
+        Ad[i].opcodes = (int *)malloc(2*sizeof(int));
+        Ad[i].operands = (int *)malloc(2*sizeof(int));
+    }
+    OS *temp = (OS *)malloc((neq+1)*sizeof(OS));
+    for (int i = 0; i <= neq; ++i) {
+        temp[i].alloc = 2;
+        temp[i].count = 0;
+        temp[i].opcodes = (int *)malloc(2*sizeof(int));
+        temp[i].operands = (int *)malloc(2*sizeof(int));
+    }
+    
+    for (int i = 1; i <= nno; i++) {
+            
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+            
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+            
+        higher_precision *B1,*B2,*B3;
+        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
+        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
+        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
+            
+        /* Ad on boundaries differs after the following operation, but
+           no communications are needed yet, because boundary Ad will
+           not be used for the G-S iterations for interior nodes */
+            
+        for (int j = 3; j < MAX_EQN; j++) {
+            /*
+              double UU = E->temp[C[j]];
+              Ad[eqn1] += B1[j]*UU;
+              Ad[eqn2] += B2[j]*UU;
+              Ad[eqn3] += B3[j]*UU;
+            */
+            pushBackOpcode(Ad + eqn1, 1, j, temp + C[j]);
+            pushBackOpcode(Ad + eqn2, 1, j, temp + C[j]);
+            pushBackOpcode(Ad + eqn3, 1, j, temp + C[j]);
+        }
+            
+        if (!(E->NODE[i] & OFFSIDE)) {
+            /*
+              E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
+              E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
+              E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
+            */
+            pushBackOpcode(temp + eqn1, 2, eqn1, Ad + eqn1);
+            pushBackOpcode(temp + eqn2, 2, eqn2, Ad + eqn2);
+            pushBackOpcode(temp + eqn3, 2, eqn3, Ad + eqn3);
+        }
+            
+    }
+    
+    for (int i = 0; i <= neq; ++i) {
+        fprintf(stderr, "Ad[%d]: ", i);
+        OS *os = Ad + i;
+        for (int op = 0; op < os->count; ++op) {
+            if (os->opcodes[op] == 10) {
+                fprintf(stderr, "(");
+            } else if (os->opcodes[op] == 11) {
+                fprintf(stderr, ")");
+            } else if (os->opcodes[op] == 0) {
+                fprintf(stderr, "@ ");
+            } else {
+                fprintf(stderr, "%d,%d ", os->opcodes[op], os->operands[op]);
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+    
+}
+
+
+void launch_tally_gauss_seidel_2(
+    struct Some_variables *E
+    )
+{
+    const int neq = E->lmesh.NEQ;
+    const int nno = E->lmesh.NNO;
+    
+    int *Ad = (int *)malloc((neq+1)*sizeof(int));
+    int *temp = (int *)malloc((neq+1)*sizeof(int));
+    for (int i = 0; i <= neq; ++i) {
+        Ad[i] = 0;
+        temp[i] = 0;
+    }
+    int launchTally = 0;
+    int maxSpread = 0, minSpread = 0;
+    
+    for (int i = 1; i <= nno; i++) {
+            
+        int eqn1 = E->ID[i].doff[1];
+        int eqn2 = E->ID[i].doff[2];
+        int eqn3 = E->ID[i].doff[3];
+            
+        int *C = E->Node_map + (i-1)*MAX_EQN;
+            
+        higher_precision *B1,*B2,*B3;
+        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
+        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
+        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
+            
+        for (int j = 3; j < MAX_EQN; j++) {
+            /*
+              double UU = E->temp[C[j]];
+              Ad[eqn1] += B1[j]*UU;
+              Ad[eqn2] += B2[j]*UU;
+              Ad[eqn3] += B3[j]*UU;
+            */
+            
+            maxSpread = max(maxSpread, eqn1 - C[j]);
+            minSpread = min(minSpread, eqn1 - C[j]);
+            if (temp[C[j]]) {
+                ++launchTally;
+                for (int i = 0; i <= neq; ++i) {
+                    Ad[i] = 0;
+                    temp[i] = 0;
+                }
+            }
+            Ad[eqn1] = 1;
+            Ad[eqn2] = 1;
+            Ad[eqn3] = 1;
+        }
+            
+        if (!(E->NODE[i] & OFFSIDE)) {
+            /*
+              E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
+              E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
+              E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
+            */
+            if (Ad[eqn1] || Ad[eqn2] || Ad[eqn3]) {
+                ++launchTally;
+                for (int i = 0; i <= neq; ++i) {
+                    Ad[i] = 0;
+                    temp[i] = 0;
+                }
+            }
+            temp[eqn1] = 1;
+            temp[eqn2] = 1;
+            temp[eqn3] = 1;
+        }
+            
+    }
+    
+    fprintf(stderr, "launch tally: %d\n", launchTally);
+    fprintf(stderr, "maxSpread: %d\n", maxSpread);
+    fprintf(stderr, "minSpread: %d\n", minSpread);
+
+    
+}
+
+
 extern "C" void gauss_seidel(
     struct All_variables *E,
     double **d0,
@@ -927,8 +1274,10 @@
     
     collect_terms(&kE);
     if (0) oy_gauss_seidel_2(&kE);
-    piano_roll_gauss_seidel_2(&kE);
-    assert(0);
+    if (0) piano_roll_gauss_seidel_2(&kE);
+    if (0) opcode_gauss_seidel_2(&kE);
+    if (0) launch_tally_gauss_seidel_2(&kE);
+    if (0) assert(0);
     
     do_gauss_seidel(
         &kE,