[CIG-SEISMO] Problem with Adjoint Simulation on IBM BlueGene
Carlos Alberto Moreno Chaves
calbertochaves at gmail.com
Sat Apr 25 18:36:52 PDT 2015
Dear Dr. Dimitri,
I hope this email finds you well! My name is Carlos Chaves and I’m
performing adjoint simulations for finite-frequency kernels on a
BlueGene cluster with 24576 PowerPC 450 compute cores, each of which is
32-bit and runs at 850 MHz. The system has 4 GB of RAM per node.
I was able to compile SPECFEM3D Globe on the BlueGene in the following way:
#@ job_name = script_conf
#@ comment = "Nothing"
#@ error = $(job_name).$(jobid).out
#@ output = $(job_name).$(jobid).out
#@ environment = COPY_ALL
#@ wall_clock_limit = 01:20:00
#@ notification = error
#@ job_type = bluegene
#@ class = compute
#@ bg_size = 1
#@ queue
make clean
./configure --prefix=/bgpscratch/nu3/KERNEL/specfem3d_globe-master
--build=ppc64 --host=powerpc-bgp-linux
FC=/bgsys/drivers/ppcfloor/comm/xl/bin/mpixlf90_r
MPIFC=/bgsys/drivers/ppcfloor/comm/xl/bin/mpixlf90_r
CC=/opt/ibmcmp/vac/bg/9.0/bin/bgxlc_r CXX=/opt/ibmcmp/vac/bg/9.0/bin/bgcc_r
MPI_INC=/bgsys/drivers/ppcfloor/comm/xl/include MPILIBS="-lmpich.cnk
-ldcmf.cnk -ldcmfcoll.cnk -lSPI.cna -lrt -lpthread"
LDFLAGS="-L/bgsys/drivers/ppcfloor/comm/xl/lib
-L/bgsys/drivers/ppcfloor/runtime/SPI"
make create_header_file
/bgsys/drivers/ppcfloor/bin/mpirun -np 1 -mode VN -exe
/bgpscratch/nu3/KERNEL/specfem3d_globe-master/bin/xcreate_header_file
make meshfem3D
make specfem3D
using the following FLAGS_CHECK:
FLAGS_CHECK="-O4 -qsave -qstrict -qtune=auto -qarch=450d -qcache=auto
-qhalt=w -qfree=f90 -qsuffix=f=f90 -qlanglvl=95pure -Q -Q+rank,swap_all
-Wl,-relax"
The create_header_file, meshfem3D and specfem3D programs were all built
successfully.
After this, I ran the forward simulation, following the instructions in the
SPECFEM manual (change_simulation_type -F), and everything worked very well!
#!/bin/sh
#@ job_name = script_run
#@ comment = "FWD"
#@ error = $(job_name).$(jobid).out
#@ output = $(job_name).$(jobid).out
#@ environment = COPY_ALL
#@ wall_clock_limit = 24:00:00
#@ notification = error
#@ job_type = bluegene
#@ class = compute
#@ group = rcsg
#@ bg_size = 512
#@ queue
/bgsys/drivers/ppcfloor/bin/mpirun -np 1944 -mode VN -exe
/bgpscratch/nu3/KERNEL/specfem3d_globe-master/bin/xmeshfem3D
/bgsys/drivers/ppcfloor/bin/mpirun -np 1944 -mode VN -exe
/bgpscratch/nu3/KERNEL/specfem3d_globe-master/bin/xspecfem3D
After preparing the adjoint sources, I ran the kernel simulation with the
following script:
#!/bin/sh
#@ job_name = script_run_adj
#@ comment = "ADJ_KERNEL"
#@ error = $(job_name).$(jobid).out
#@ output = $(job_name).$(jobid).out
#@ environment = COPY_ALL
#@ wall_clock_limit = 24:00:00
#@ notification = error
#@ job_type = bluegene
#@ class = compute
#@ group = rcsg
#@ bg_size = 512
#@ queue
/bgsys/drivers/ppcfloor/bin/mpirun -np 1944 -mode VN -exe
/bgpscratch/nu3/KERNEL/specfem3d_globe-master/bin/xspecfem3D
However, I got the following error:
Adjoint thread: associated buffer length is invalid<Apr 25 19:32:47.438179>
BE_MPI (ERROR): The error message in the job record is as follows:
<Apr 25 19:32:47.438262> BE_MPI (ERROR): "killed with signal 15"
I’ve tried other options, but without success. Do you have any clue what
might be happening?
Below I’m sending the core file generated after this error.
+++PARALLEL TOOLS CONSORTIUM LIGHTWEIGHT COREFILE FORMAT version 1.0
+++LCB 1.0
Program: /bgpscratch/nu3/KERNEL/specfem3d_globe-master/bin/xspecfem3D
Job ID : 694355
Personality:
XYZT coordinates : 2,5,5,0
MPI Rank : 1448
DDR Size (MB) : 4096
Mode : VN
+++ID TGID 1548, Core 0, Thread 1 State 40000000, Sched: 48000000
General Purpose Registers:
r00=00000078 r01=898feec0 r02=89906780 r03=00000000 r04=898feec0
r05=898ff368 r06=89906780 r07=898ff368
r08=89906780 r09=898ff368 r10=00000000 r11=00000000 r12=24002422
r13=0157f158 r14=00000000 r15=00000000
r16=01a9f7b8 r17=00000000 r18=00000000 r19=01570000 r20=00000001
r21=01570000 r22=00000000 r23=89500000
r24=00000007 r25=0143c238 r26=007d0f00 r27=898feed0 r28=007d0f00
r29=898feec0 r30=012c1fe4 r31=898ff320
Special Purpose Registers:
lr=012c2394 cr=04002422 xer=00000000 ctr=00000000
msr=0002f200 dear=00000000 esr=00000000 fpscr=0010fb70
sprg0=00100000 sprg1=00000000 sprg2=00000000 sprg3=00000000 sprg4=c0001023
sprg5=bea81900 sprg6=0108000b sprg7=0154050b usprg0=00000000
srr0=0001c3f0 srr1=00002000 csrr0=00000000 csrr1=00000000
mcsrr0=00000000 mcsrr1=00000000
dbcr0=400f0000 dbcr2=cca00000 dac1=89500000 dac2=89500fff
Floating Point Registers
f0=00000000 00000000 00000000 00000000 f1=00000000 00000000 00000000
00000000
f2=00000000 00000000 00000000 00000000 f3=00000000 00000000 00000000
00000000
f4=00000000 00000000 00000000 00000000 f5=00000000 00000000 00000000
00000000
f6=00000000 00000000 00000000 00000000 f7=00000000 00000000 00000000
00000000
f8=00000000 00000000 00000000 00000000 f9=00000000 00000000 00000000
00000000
f10=00000000 00000000 00000000 00000000 f11=00000000 00000000 00000000
00000000
f12=00000000 00000000 00000000 00000000 f13=00000000 00000000 00000000
00000000
f14=00000000 00000000 00000000 00000000 f15=00000000 00000000 00000000
00000000
f16=00000000 00000000 00000000 00000000 f17=00000000 00000000 00000000
00000000
f18=00000000 00000000 00000000 00000000 f19=00000000 00000000 00000000
00000000
f20=00000000 00000000 00000000 00000000 f21=00000000 00000000 00000000
00000000
f22=00000000 00000000 00000000 00000000 f23=00000000 00000000 00000000
00000000
f24=00000000 00000000 00000000 00000000 f25=00000000 00000000 00000000
00000000
f26=00000000 00000000 00000000 00000000 f27=00000000 00000000 00000000
00000000
f28=00000000 00000000 00000000 00000000 f29=00000000 00000000 00000000
00000000
f30=00000000 00000000 00000000 00000000 f31=00000000 00000000 00000000
00000000
Memory:
Stack top : 0x00000000
Stack bottom : 0x00000000
Stack pointer : 0x898feec0
Heap top : 0x89000000
Heap bottom : 0x894fffff
Heap break : 0x89401000
TLB[ 0] 16K V=0x00814000-0x00817fff TID=00 P=0x7_ffff0000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW____ DevBus.LOCKBOX_SUP.
TLB[ 1] 16K V=0x00818000-0x0081bfff TID=00 P=0x7_ffff4000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW_rw_ DevBus.LOCKBOX_USR.
TLB[ 2] 4K V=0x00808000-0x00808fff TID=00 P=0x7_30000000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW____ DevBus.BIC.
TLB[ 3] 4K V=0x00804000-0x00804fff TID=00 P=0x7_10000000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW_rw_ DevBus.UPC.
TLB[ 4] 256K V=0xfff40000-0xfff7ffff TID=02 P=0x0_00300000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWX___ DDR.
TLB[ 5] 1K V=0x00800000-0x008003ff TID=00 P=0x6_10000000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW_rw_ NetBus.TREE0.
TLB[ 6] 1K V=0x00800400-0x008007ff TID=00 P=0x6_11000000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW____ NetBus.TREE1.
TLB[ 7] 16K V=0x00810000-0x00813fff TID=00 P=0x6_00000000
___|__|____|_____ _|___|IL1I|IL1D|IL2I|IL2D _I_G_RW_rw_ NetBus.DMA0.
TLB[ 8] 1M V=0x00000000-0x000fffff TID=00 P=0x0_00000000
___|__|____|L2PFO _|___|____|____|____|____ _____R_X___ DDR.
TLB[ 9] 1M V=0x00100000-0x001fffff TID=00 P=0x0_00100000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RW____ DDR.
TLB[10] 1M V=0x00200000-0x002fffff TID=00 P=0x0_00200000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RW____ DDR.
TLB[11] 16M V=0xf0000000-0xf0ffffff TID=01 P=0x0_c0000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[12] 1M V=0x01000000-0x010fffff TID=01 P=0x0_80000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXr_x DDR.
TLB[13] 1M V=0x01100000-0x011fffff TID=01 P=0x0_80100000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXr_x DDR.
TLB[14] 1M V=0x01200000-0x012fffff TID=01 P=0x0_80200000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXr_x DDR.
TLB[15] 1M V=0x01300000-0x013fffff TID=01 P=0x0_80300000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXr_x DDR.
TLB[16] 1M V=0x01400000-0x014fffff TID=01 P=0x0_80400000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXr_x DDR.
TLB[17] 1M V=0x01500000-0x015fffff TID=01 P=0x0_40500000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[18] 1M V=0x01600000-0x016fffff TID=01 P=0x0_40600000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[19] 1M V=0x01700000-0x017fffff TID=01 P=0x0_40700000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[20] 1M V=0x01800000-0x018fffff TID=01 P=0x0_40800000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[21] 1M V=0x01900000-0x019fffff TID=01 P=0x0_40900000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[22] 1M V=0x01a00000-0x01afffff TID=01 P=0x0_40a00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[23] 1M V=0x01b00000-0x01bfffff TID=01 P=0x0_40b00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[24] 1M V=0x01c00000-0x01cfffff TID=01 P=0x0_40c00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[25] 1M V=0x01d00000-0x01dfffff TID=01 P=0x0_40d00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[26] 1M V=0x01e00000-0x01efffff TID=01 P=0x0_40e00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[27] 1M V=0x01f00000-0x01ffffff TID=01 P=0x0_40f00000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[28] 16M V=0x02000000-0x02ffffff TID=01 P=0x0_41000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[29] 16M V=0x03000000-0x03ffffff TID=01 P=0x0_42000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[30] 16M V=0x04000000-0x04ffffff TID=01 P=0x0_43000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[31] 16M V=0x05000000-0x05ffffff TID=01 P=0x0_44000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[32] 16M V=0x06000000-0x06ffffff TID=01 P=0x0_45000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[33] 16M V=0x07000000-0x07ffffff TID=01 P=0x0_46000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[34] 16M V=0x08000000-0x08ffffff TID=01 P=0x0_47000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[35] 256M V=0xb0000000-0xbfffffff TID=01 P=0x0_30000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[36] 256M V=0xa0000000-0xafffffff TID=01 P=0x0_20000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[37] 256M V=0x90000000-0x9fffffff TID=01 P=0x0_10000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[38] 16M V=0x8f000000-0x8fffffff TID=01 P=0x0_0f000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[39] 16M V=0x8e000000-0x8effffff TID=01 P=0x0_0e000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[40] 16M V=0x8d000000-0x8dffffff TID=01 P=0x0_0d000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[41] 16M V=0x8c000000-0x8cffffff TID=01 P=0x0_0c000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[42] 16M V=0x8b000000-0x8bffffff TID=01 P=0x0_0b000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[43] 16M V=0x8a000000-0x8affffff TID=01 P=0x0_0a000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
TLB[44] 16M V=0x89000000-0x89ffffff TID=01 P=0x0_09000000
___|__|SWOA|L2PFO _|WL1|____|____|____|____ __M__RWXrwx DDR.
Interrupt Summary:
Core[0]: External Interrupts = 1.
Core[0]: System Calls = 269.
+++STACK
0x0137f45c
---STACK
---ID
+++ID TGID 1548, Core 0, Thread 5 State 00000000, Sched: 48000000 Running
***FAULT Encountered unhandled signal 0x00000006 (6) (???)
Generated by interrupt..................0x0000000C (IPI signal received)
While executing instruction at..........0x0138E700
Dereferencing memory at.................0x00000000
Debugger attached.......................N
General Purpose Registers:
r00=000000fa r01=bfff5b40 r02=89007460 r03=00000000 r04=0000060c
r05=00000006 r06=00000000 r07=7f7f7f7f
r08=00000010 r09=89000000 r10=bfff5be0 r11=ffffffff r12=28000882
r13=0157f158 r14=000005dc r15=fffff2d0
r16=0000012c r17=000005dc r18=000003e8 r19=00000000 r20=bfff5f80
r21=00000005 r22=00000000 r23=00000001
r24=0151d078 r25=013c2b70 r26=000003e8 r27=01aa0000 r28=01570000
r29=00000006 r30=01aa0000 r31=0000060c
Special Purpose Registers:
lr=01346ce4 cr=08000884 xer=00000000 ctr=00000000
msr=0002f200 dear=00000000 esr=00000000 fpscr=0010fb70
sprg0=00100000 sprg1=00000000 sprg2=00000000 sprg3=00000000 sprg4=c0001023
sprg5=bea81900 sprg6=0108000b sprg7=0154050b usprg0=00000000
srr0=0001c3f0 srr1=00002000 csrr0=00000000 csrr1=00000000
mcsrr0=00000000 mcsrr1=00000000
dbcr0=400f0000 dbcr2=cca00000 dac1=8a5a0000 dac2=8a5a0fff
Floating Point Registers
f0=00000000 00000000 00000000 00000000 f1=00000000 00000000 00000000
00000000
f2=00000000 00000000 00000000 00000000 f3=00000000 00000000 00000000
00000000
f4=00000000 00000000 00000000 00000000 f5=00000000 00000000 00000000
00000000
f6=00000000 00000000 00000000 00000000 f7=00000000 00000000 00000000
00000000
f8=00000000 00000000 00000000 00000000 f9=00000000 00000000 00000000
00000000
f10=3bf5fe65 a09f5564 3fef6721 28bd7b17 f11=3fd74783 b0f72a1f 3fef7121
61e353ef
f12=b8a6bda0 5d671e20 3fef7b21 9b092cc7 f13=3fd74783 b0f72a2b 3fef8521
d42f059f
f14=00000000 00000000 00000000 00000000 f15=00000000 00000000 00000000
00000000
f16=00000000 00000000 00000000 00000000 f17=00000000 00000000 00000000
00000000
f18=00000000 00000000 00000000 00000000 f19=00000000 00000000 00000000
00000000
f20=00000000 00000000 00000000 00000000 f21=00000000 00000000 00000000
00000000
f22=00000000 00000000 00000000 00000000 f23=00000000 00000000 00000000
00000000
f24=00000000 00000000 00000000 00000000 f25=00000000 00000000 00000000
00000000
f26=00000000 00000000 00000000 00000000 f27=00000000 00000000 00000000
00000000
f28=00000000 00000000 00000000 00000000 f29=00000000 00000000 00000000
00000000
f30=00000000 00000000 00000000 00000000 f31=00000000 00000000 00000000
00000000
Memory:
Stack top : 0x89000000
Stack bottom : 0xbfffffef
Stack pointer : 0xbfff5b40
+++STACK
0x0138e700
0x01346e2c
0x011fe39c
0x0103b3b8
0x0110de44
0x0133e6f0
0x0133e964
0xfffffffc
---STACK
---ID
---LCB
Thank you!
Best Regards,
Carlos Chaves
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.geodynamics.org/pipermail/cig-seismo/attachments/20150425/f21674a5/attachment-0001.html>
More information about the CIG-SEISMO
mailing list