"To pull a bigger wagon, it is easier to add more oxen than to grow a gigantic ox." Speedup of a parallel program on $p$ processors for problem size $n$: $S(n,p) = T_s(n) / T_p(n,p)$, where $T_s$ is the serial time and $T_p$ the parallel time; ideally $S(n,p) = p$. Efficiency: $E(n,p) = S(n,p)/p \le 1$. If $E(n,p) < 1/p$, then $T_s(n) < T_p(n,p)$ — the parallel run is slower than the serial one.
Vectorization example: evaluate f(x) = sin(cos(x)) for x1, x2, ..., x100. Instead of computing sin(cos(x1)), then sin(cos(x2)), ... serially, proceed in two parallel stages — stage 0: v1 = cos(x1), v2 = cos(x2), ..., v99 = cos(x99), v100 = cos(x100); stage 1: f(x1) = sin(v1), f(x2) = sin(v2), ..., f(x100) = sin(v100). Data decomposition for the Jacobi method on Ax = b: 1. start from an initial guess x0; 2. distribute the components of the iterate across the p processes (the extracted table pairs components as x1/xp+1, x2/xp+2, ..., xp/x2p, suggesting each process holds components x_i, x_{p+i}, x_{2p+i}, ... — TODO confirm the exact distribution against the original figure); 3. at each iteration k every process updates its own components x1 ... x3p and exchanges the values needed by its neighbors.
Processes are ranked 0, 1, 2, ..., p-2, p-1. OpenMP vs. MPI: MPI (Message Passing Interface) is a message-passing library callable from C, C++, and Fortran; common implementations include MPICH and LAM/MPI. Example — Hello!

program hello
  implicit none
  include 'mpif.h'
  integer myid, nproc, ierr
  call MPI_INIT(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
  write(*,*) "hello!", "total Proces is:", nproc, "my process is:", myid
  call MPI_FINALIZE(ierr)
end program hello

Notes: `implicit none` forces explicit declarations; a Fortran 90 MPI program includes the header `mpif.h` (Fortran 77 style) to obtain the MPI constants; `call MPI_INIT(ierr)` must be the first MPI call and `call MPI_FINALIZE(ierr)` the last, with all other MPI calls (MPI_COMM_SIZE, MPI_COMM_RANK, ...) in between.
MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr) returns the number of processes in the communicator (MPI_COMM_WORLD); MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) returns this process's rank within the communicator; MPI_FINALIZE(ierr) shuts MPI down. General template of an MPI program:

program programname
  [declarations]
  call MPI_INIT(ierr)                              ! start MPI
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
  [parallel computation and MPI communication]
  call MPI_FINALIZE(ierr)                          ! stop MPI
  [serial work only — no MPI calls after this]
end program

Point-to-point example:

program main
  include 'mpif.h'
  integer myid, nproc, ierr, data
  integer status(MPI_STATUS_SIZE)
  call MPI_INIT(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
  data = 1234
  if (myid == 0) then
    data = 4321
    call MPI_SEND(data, 1, MPI_INTEGER, 1, 0, MPI_COMM_WORLD, ierr)
  else if (myid == 1) then
    write(*,*) "before recv: Myid=", myid, "DATA=", data
    call MPI_RECV(data, 1, MPI_INTEGER, 0, 0, MPI_COMM_WORLD, &
                  status, ierr)
    write(*,*) "after recv: Myid=", myid, "DATA=", data
  end if
  call MPI_FINALIZE(ierr)
end
Collective communication operates over all processes of a communicator. Example:

program main
  implicit none
  include 'mpif.h'
  integer myid, nprocs, ierr
  integer data
  call MPI_INIT(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
  call MPI_ALLREDUCE(myid, data, 1, MPI_INTEGER, &
                     MPI_SUM, MPI_COMM_WORLD, ierr)
  write(*,*) "myid=", myid, "data=", data
  call MPI_FINALIZE(ierr)
end

MPI_ALLREDUCE(myid, data, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierr) combines a value from every process in the communicator and returns the result to all of them; the reduction operation can be MPI_SUM, MPI_MIN, MPI_MAX, among others.
13 14 15 16 13 14 15 16 1 2 3 4 1 2 3 4 Process 0 Process 1 Commnode(i, j) j i-1 Commnode(2, 4) dt PEFEP90 Commnode Node Force Process 0 Process 1 Node Force(t) Node Force(t) Node Force(t+dt) Node Force(t+dt) F1 MPI_ALLREDUCE F2 Process 0 Process 1 Process 0 Process 1 F=F1+F2 F=F1+F2 Update Node Force F Update Node Force F Next Next
Time-step synchronization across 2 processes: process 0 computes its local stable step dt1 and process 1 computes dt2; MPI_ALLREDUCE with MPI_MIN yields dt = min(dt1, dt2), which both processes use for the next step, then continue (Next). Example problem: cylinder with R = 100 mm, H = 300 mm, loaded at 200 mm/s. Hardware: 2 CPUs x 4 cores = 8 processors. OpenMP is a shared-memory parallel programming API for Fortran, C, and C++ (http://www.openmp.org/): several processors share one memory. Fork/join parallelism: the master thread forks a team of worker threads at the start of a parallel region and joins them at its end, possibly repeatedly over time. C/C++ OpenMP skeleton:

#include <omp.h>
main () {
  int var1, var2, var3;
  #pragma omp parallel private(var1, var2) shared(var3)
  {
    /* parallel region */
  }
}

The thread count can be set with the OMP_NUM_THREADS environment variable or at run time:

int omp_get_num_procs(void);
void omp_set_num_threads(int t);

int t;
t = omp_get_num_procs();
omp_set_num_threads(t);   /* note: "Omp_set_num_threads" on the slide is a typo — C is case-sensitive */
The parallel for pragma:

#pragma omp parallel for
for (i = first; i < size; i += prime)
  marked[i] = 1;

Variables in a parallel region are either private or shared; the private clause:

#pragma omp parallel for private(j)
for (i = 0; i < n; i++)
  for (j = 0; j < n; j++)
    a[i][j] = min(a[i][j], a[i][k] + tmp[j]);

The critical pragma protects a block of code that should be executed by a single thread at a time:

double area, pi, x;
int i, n;
area = 0.0;
#pragma omp parallel for private(x)
for (i = 0; i <= n; i++) {
  x = (i + 0.5) / n;
  #pragma omp critical
  area += 4.0 / (1 + x*x);
}
pi = area / n;

Without protection, two threads can both read the same value of area (e.g. 11.667), add their contributions separately (thread A +3.765 giving 15.432, thread B +3.563 giving 15.230), and one update is lost. The reduction clause is the faster alternative — the critical pragma is removed and reduction(+:area) is added:

double area, pi, x;
int i, n;
area = 0.0;
#pragma omp parallel for private(x) reduction(+:area)
for (i = 0; i <= n; i++) {
  x = (i + 0.5) / n;
  area += 4.0 / (1 + x*x);
}
pi = area / n;

Execution time (sec) for n = 100,000 — threads: critical pragma / reduction clause — 1: 0.0780 / 0.0273; 2: 0.1510 / 0.0146; 3: 0.3400 / 0.0105; 4: 0.3608 / 0.0086; 5: 0.4710 / 0.0076. The parallel pragma:

#include <omp.h>
int main(int argc, char* argv[]) {
  int nthreads, tid;
  int nprocs;
  char buf[32];
  #pragma omp parallel private(nthreads, tid)
  {
    /* Obtain and print thread id */
    tid = omp_get_thread_num();
    printf("hello World from OMP thread %d\n", tid);
    /* Only master thread does this */
    if (tid == 0) {
      nthreads = omp_get_num_threads();
      printf("number of threads %d\n", nthreads);
    }
  }
  return 0;
}