[cig-commits] [commit] master: Adding the memory requirement before initializing the GPU (5f28ffa)

Mon Aug 11 03:41:38 PDT 2014

Repository : https://github.com/geodynamics/relax

On branch  : master
Link       : https://github.com/geodynamics/relax/compare/afc5fb16f867dce2b0d364b8b96d28b885a47c09...2f330a8481a5df32477d6505a981a34b69ce9d38

>---------------------------------------------------------------

commit 5f28ffacc081f042130e11da4bd8484154cd8554
Author: sagar masuti <sagar.masuti at gmail.com>
Date:   Mon Aug 11 18:26:45 2014 +0800

    Adding the memory requirement before initializing the GPU


>---------------------------------------------------------------

5f28ffacc081f042130e11da4bd8484154cd8554
 INSTALL           | 22 ++++++++++--------
 cuda.py           | 48 +++++++++++++++++++--------------------
 src/cu_elastic.cu | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 wscript           | 14 ++++++------
 4 files changed, 108 insertions(+), 44 deletions(-)

diff --git a/INSTALL b/INSTALL
index 91a31ec..6007eb3 100644
--- a/INSTALL
+++ b/INSTALL
@@ -45,16 +45,6 @@ configured Relax with the command
 
   CPPFLAGS="-I/usr/include/netcdf-4" LDFLAGS="-L/usr/lib64" ./waf configure  --proj-dir=/home/walter/src/relax/relax-bin --gmt-dir=/home/walter/src/relax/relax-bin/ --mkl-incdir=/opt/intel/composerxe-2011.1.107/mkl/include/ --mkl-libdir=/opt/intel/composerxe-2011.1.107/mkl/lib/intel64/ --check-c-compiler=icc --check-fortran-compiler=ifort
 
-To build Relax for NVIDIA GPUs on Linux, we need to provide certain command-line options
-
-  ./waf configure --cuda-dir=/usr/local/cuda --use-fftw
-  ./waf
-
-If you get an error 'CUDA Runtime API error 38: no CUDA-capable device is detected', you might need to explicitly enable the GPUs with the following command
-  
-  sudo nvidia-xconfig --enable-all-gpus 
-
-
 Depending on how your libraries were built, you may have to set
 LD_LIBRARY_PATH (Linux) or DYLD_LIBRARY_PATH (Mac) so that Relax can
 find them.  For example, on Mac OS X using Fink, you may have to run the command
@@ -70,6 +60,18 @@ CPPFLAGS="-I/opt/GMT-4.5.7/netcdf/include" LDFLAGS="-L/usr/lib64 -lgdal" ./waf c
 
 ./waf build
 
+---------
+GPU
+---------
+To build Relax for NVIDIA GPUs on Linux, we need to provide certain command-line options
+
+  ./waf configure --use-cuda --cuda-dir=/usr/local/cuda
+  ./waf
+
+If you get an error 'CUDA Runtime API error 38: no CUDA-capable device is detected', you might need to explicitly enable the GPUs with the following command
+
+  sudo nvidia-xconfig --enable-all-gpus
+
 -------
 Doxygen
 -------
diff --git a/cuda.py b/cuda.py
index edf6cc3..d0b694e 100644
--- a/cuda.py
+++ b/cuda.py
@@ -11,40 +11,40 @@ from waflib.Tools import ccroot, c_preproc
 from waflib.Configure import conf
 
 class cuda(Task.Task):
-	run_str = '${NVCC} ${CUDAFLAGS} ${CXXFLAGS} ${FRAMEWORKPATH_ST:FRAMEWORKPATH} ${CPPPATH_ST:INCPATHS} ${DEFINES_ST:DEFINES} ${CXX_SRC_F}${SRC} ${CXX_TGT_F}${TGT}'
-	color   = 'GREEN'
-	ext_in  = ['.h']
-	vars    = ['CCDEPS']
-	scan    = c_preproc.scan
-	shell   = False
+    run_str = '${NVCC} ${CUDAFLAGS} ${CXXFLAGS} ${FRAMEWORKPATH_ST:FRAMEWORKPATH} ${CPPPATH_ST:INCPATHS} ${DEFINES_ST:DEFINES} ${CXX_SRC_F}${SRC} ${CXX_TGT_F}${TGT}'
+    color   = 'GREEN'
+    ext_in  = ['.h']
+    vars    = ['CCDEPS']
+    scan    = c_preproc.scan
+    shell   = False
 
 @extension('.cu', '.cuda')
 def c_hook(self, node):
-	return self.create_compiled_task('cuda', node)
+    return self.create_compiled_task('cuda', node)
 
 def configure(conf):
-        conf.find_program('nvcc', var='NVCC')
-	conf.options.cuda_dir=conf.root.find_node(conf.env.NVCC).parent.parent.abspath()
-	conf.find_cuda_libs()
+    conf.find_program('nvcc', var='NVCC')
+    conf.options.cuda_dir=conf.root.find_node(conf.env.NVCC).parent.parent.abspath()
+    conf.find_cuda_libs()
 
 @conf
 def find_cuda_libs(self):
-	if not self.env.NVCC:
-		self.fatal('check for nvcc first')
+    if not self.env.NVCC:
+        self.fatal('check for nvcc first')
 
-	d = self.root.find_node(self.env.NVCC).parent.parent
+    d = self.root.find_node(self.env.NVCC).parent.parent
 
-	node = d.find_node('include')
-	_includes = node and node.abspath() or ''
+    node = d.find_node('include')
+    _includes = node and node.abspath() or ''
 
-	_libpath=[]
-        for x in ('lib64', 'lib'):
-		try:
-			_libpath.append(d.find_node(x).abspath())
-		except:
-			pass
-	# this should not raise any error
+    _libpath=[]
+    for x in ('lib64', 'lib'):
+        try:
+            _libpath.append(d.find_node(x).abspath())
+        except:
+            pass
+    # this should not raise any error
         # self.check_cxx(header='cuda.h', lib='cuda', libpath=_libpath, includes=_includes)
-        self.check_cxx(header='cuda.h', lib='cudart', libpath=_libpath, includes=_includes)
-        self.check_cxx(header='cufft.h', lib='cufft', libpath=_libpath, includes=_includes)
+    self.check_cxx(header='cuda.h', lib='cudart', libpath=_libpath, includes=_includes)
+    self.check_cxx(header='cufft.h', lib='cufft', libpath=_libpath, includes=_includes)
 
diff --git a/src/cu_elastic.cu b/src/cu_elastic.cu
index cc1b0fe..469b1b2 100644
--- a/src/cu_elastic.cu
+++ b/src/cu_elastic.cu
@@ -465,6 +465,9 @@ __host__ __device__ void cuMulSub (double       dTaun,
 
 __host__ __device__ double mycuSinh (double dX) ;
 
+int checkMemRequirement(int iSx1,
+                        int iSx2,
+                        int iSx3) ;
 
 /* --------------------------------------------------------------------------------------------- */
 
@@ -819,6 +822,15 @@ extern "C" void cuinit_ (int    iSx1,
         ihSx2 = iSx2 ;
         ihSx3 = iSx3 ;
 	
+	if (-1 == checkMemRequirement(iSx1,iSx2,iSx3))
+	{
+		printf ("********************** ERROR ******************\n") ;
+		printf ("Memory required to run on GPU is insufficient\n");
+		printf ("Either try reducing the grid size or run on CPU only\n") ;
+		printf ("********************** ERROR ******************\n\n") ;
+		return ;
+	}
+
         iSize = sizeof (float) * (iSx1 + 2) * iSx2 * iSx3 ;
 
         cuError = cudaMalloc((void**)&gpV1, iSize) ;
@@ -906,8 +918,7 @@ extern "C" void cuinit_ (int    iSx1,
                 printf ("cuinit : Failed to allocate memory 11\n") ;
                 goto CUINIT_FAILURE ;
         }
-        iSize = sizeof (ST_TENSOR) * iSx1 * iSx2 * (iSx3/2) ;
-        cuError = cudaMalloc ((void **)&pstMoment, iSize) ;
+        cuError = cudaMalloc ((void **)&pstMoment, iSize2) ;
         if (cudaSuccess != cuError)
         {
                 printf ("cuinit : Failed to allocate memory 12\n") ;
@@ -943,7 +954,8 @@ extern "C" void cuinit_ (int    iSx1,
 
 #ifdef PRINT_DEBUG_INFO
         cudaMemGetInfo(&iFreeMem, &iTotalMem);
-        printf ("cuinit: Memory available after allocation is : %lu\n", iFreeMem);
+        printf ("cuinit: Memory available after allocation is : %lu MB\n", iFreeMem/(1024*1024));
+	printf ("cuinit: Total memory available is : %lu MB\n",iTotalMem/(1024*1024));
         printf ("cuinit: exited with no errors\n") ;
 #endif
 
@@ -1000,6 +1012,10 @@ extern "C" void custressupdatewrapper_ (E_TYPE 	 	eType,
                                          pstHostSig, fData1, fData2, fData3, gpV1, gpV2, gpV3) ;
                 }
                 break ;
+                case E_INVALID_TYPE:
+                {
+                        printf ("custressupdatewrapper_: Invalid input\n") ;
+                }
         }
 
 }
@@ -1598,6 +1614,11 @@ extern "C" void cutensormemset_ (E_TENSOR_TYPE eType)
                         cuError = cudaMemset (pstTau, 0, sizeof (ST_TENSOR) * ihSx1 * ihSx2 * ihSx3/2) ;
                 }
                 break ;
+				case E_INVALID_TENSOR_TYPE:
+				{	
+						printf ("Invalid input\n") ;
+				}
+				break ;
         }
         if (cudaSuccess != cuError)
         {
@@ -2000,6 +2021,47 @@ void cuFreeCudaMemory()
 
 }
 
+int checkMemRequirement(int iSx1,	/* estimates GPU memory (in MB) needed for an iSx1 x iSx2 x iSx3 grid; returns 0 if it fits, -1 otherwise */
+			int iSx2,
+			int iSx3)
+{
+	int 		liReq = 0 ;	/* running estimate of required memory, in whole MB */
+	long int 	iTemp = 0 ;	/* size of the term currently being computed, in whole MB */
+	size_t 		iTotalMem = 0 ;	/* total device memory from cudaMemGetInfo; converted to MB below */
+	size_t 		iFreeMem = 0 ;	/* free device memory; queried but not used in the check below */
+
+	/* Ui's, Vi's and fft's: 8 float arrays of (iSx1+2)*iSx2*iSx3 elements; the integer division truncates to whole MB */
+	iTemp=((iSx1+2)*iSx2*iSx3*sizeof(float)*8)/(1024*1024) ;	/* NOTE(review): the three int factors are multiplied before sizeof widens the expression - may overflow for large grids; confirm */
+	liReq+=iTemp ;
+
+	/* sig, moment and tau: 3 ST_TENSOR arrays of iSx1*iSx2*iSx3/2 elements each */
+	iTemp=((iSx1*iSx2*iSx3/2)*sizeof(ST_TENSOR)*3)/(1024*1024) ;
+	liReq+=iTemp ;	
+
+	/* Ti's: 3 float planes of (iSx1+2)*iSx2 elements each */
+	iTemp=((iSx1+2)*iSx2*sizeof(float)*3)/(1024*1024) ;
+	liReq+=iTemp ;
+	
+	iTemp=(iSx3/2)*sizeof(ST_LAYER)/(1024*1024) ;	/* iSx3/2 ST_LAYER entries */
+	liReq+=iTemp ;
+
+	/* dMinArray: one float array of (iSx1+2)*iSx2*iSx3 elements; left in iTemp and added to liReq only inside the comparison below */	
+	iTemp=((iSx1+2)*iSx2*iSx3*sizeof(float))/(1024*1024) ;
+
+        cudaMemGetInfo(&iFreeMem, &iTotalMem);	/* return status is not checked; both values stay 0 if nothing is written */
+	iTotalMem/=(1024*1024) ;	/* convert to MB so it is comparable with liReq and iTemp */
+	
+	if ((liReq+iTemp) > iTotalMem)	/* NOTE(review): compares against total device memory, not iFreeMem - confirm this is intended */
+	{
+		printf ("\nTotal memory required is : %d MB\n", (int)(liReq+iTemp)) ;
+		printf ("Total available is is : %lu MB \n", iTotalMem) ;
+		return -1 ;	/* estimate exceeds the device total */
+	}
+	
+	return 0;	 /* estimate fits within the device total */
+}
+
+
 /* ------------------------------------------- utility end -------------------------------------- */
 
 
diff --git a/wscript b/wscript
index 5d8c83d..551f13a 100644
--- a/wscript
+++ b/wscript
@@ -124,13 +124,13 @@ def configure(cnf):
     if cnf.options.use_cuda:
         cnf.env.CUDA=cnf.options.use_cuda
         cnf.load('cuda',tooldir='.')   
-	if not cnf.env.CUDAFLAGS:
-            cnf.env.CUDAFLAGS = ['-gencode','arch=compute_35,code=sm_35']
-#           cnf.env.CUDAFLAGS += ['-Xptxas', '-dlcm=cg']
-#            cnf.env.CUDAFLAGS += ['--maxrregcount=32']
-#            cnf.env.CUDAFLAGS = ['-gencode','arch=compute_30,code=sm_30']
-#            cnf.env.CUDAFLAGS = ['-gencode','arch=compute_20,code=sm_20']
-            cnf.env.CXXFLAGS=['-m64']
+    if not cnf.env.CUDAFLAGS:
+        cnf.env.CUDAFLAGS = ['-gencode','arch=compute_35,code=sm_35']
+#       cnf.env.CUDAFLAGS += ['-Xptxas', '-dlcm=cg']
+#       cnf.env.CUDAFLAGS += ['--maxrregcount=32']
+#       cnf.env.CUDAFLAGS = ['-gencode','arch=compute_30,code=sm_30']
+#       cnf.env.CUDAFLAGS = ['-gencode','arch=compute_20,code=sm_20']
+        cnf.env.CXXFLAGS=['-m64']
         if cnf.options.cuda_dir:
             if not cnf.options.cuda_incdir:
                 cnf.options.cuda_incdir=cnf.options.cuda_dir + "/include"