From e5486c976df88b7467befb200bbdb7c2a3dee710 Mon Sep 17 00:00:00 2001 From: Johannes Mey <johannes.mey@tu-dresden.de> Date: Wed, 8 Feb 2017 12:31:03 +0100 Subject: [PATCH] huge update of tests --- Parser/test-data/OpenAcc/slots.f | 7 + .../ExecutableConstruct1.f | 4 + .../fragments/ExecutableConstruct/Slot.f | 2 + .../fragments/ExecutableConstruct/TypedSlot.f | 2 + Parser/test-data/nas/ft-min.f | 300 ++ Parser/test-data/nas/ft.f | 1065 ++++++ Parser/test-data/nas/ft2.f | 910 +++++ Parser/test-data/nas/ft3.f | 2815 ++++++++++++++++ Parser/test-data/nas/ft4.f | 1106 ++++++ Parser/test-data/nas/ft5.f | 2954 +++++++++++++++++ Parser/test-data/rules/R503a.f90 | 14 + Parser/test-data/rules/tmp.f90 | 4 + Parser/test-data/slots/block.f | 7 + Parser/test-data/specht/array_operations.f | 1166 +++++++ .../test-data/specht/condensed_primary_part.f | 930 ++++++ Parser/test-data/specht/matrix_operations.f | 651 ++++ .../specht/tiny_matrix_products_explicit.f | 1386 ++++++++ .../specht/transformed_condensed_part.f | 151 + .../specht/transformed_primary_part.f | 939 ++++++ .../test/org/tud/forty/test/FragmentTest.java | 17 + Parser/test/org/tud/forty/test/NASTest.java | 37 + Parser/test/org/tud/forty/test/SlotTest.java | 1 + .../test/org/tud/forty/test/SpechtTest.java | 42 + Parser/test/org/tud/forty/test/TestBase.java | 117 +- 24 files changed, 14607 insertions(+), 20 deletions(-) create mode 100644 Parser/test-data/OpenAcc/slots.f create mode 100644 Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f create mode 100644 Parser/test-data/fragments/ExecutableConstruct/Slot.f create mode 100644 Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f create mode 100644 Parser/test-data/nas/ft-min.f create mode 100644 Parser/test-data/nas/ft.f create mode 100644 Parser/test-data/nas/ft2.f create mode 100644 Parser/test-data/nas/ft3.f create mode 100644 Parser/test-data/nas/ft4.f create mode 100644 Parser/test-data/nas/ft5.f create mode 100644 Parser/test-data/rules/R503a.f90 create mode 100644 Parser/test-data/rules/tmp.f90 create mode 100644 Parser/test-data/slots/block.f create mode 100644 Parser/test-data/specht/array_operations.f create mode 100644 Parser/test-data/specht/condensed_primary_part.f create mode 100644 Parser/test-data/specht/matrix_operations.f create mode 100644 Parser/test-data/specht/tiny_matrix_products_explicit.f create mode 100644 Parser/test-data/specht/transformed_condensed_part.f create mode 100644 Parser/test-data/specht/transformed_primary_part.f create mode 100644 Parser/test/org/tud/forty/test/NASTest.java create mode 100644 Parser/test/org/tud/forty/test/SpechtTest.java diff --git a/Parser/test-data/OpenAcc/slots.f b/Parser/test-data/OpenAcc/slots.f new file mode 100644 index 0000000..f5238a7 --- /dev/null +++ b/Parser/test-data/OpenAcc/slots.f @@ -0,0 +1,7 @@ + +!$acc loop private(#PVT#) collapse(#NEST#) +do i = 0,10 + #inner# +end do +!$acc end loop +end \ No newline at end of file diff --git a/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f b/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f new file mode 100644 index 0000000..34086a5 --- /dev/null +++ b/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f @@ -0,0 +1,4 @@ + + do a = b,c + #inner# + end do diff --git a/Parser/test-data/fragments/ExecutableConstruct/Slot.f b/Parser/test-data/fragments/ExecutableConstruct/Slot.f new file mode 100644 index 0000000..6fc897b --- /dev/null +++ b/Parser/test-data/fragments/ExecutableConstruct/Slot.f @@ 
-0,0 +1,2 @@ +#inner# + diff --git a/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f b/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f new file mode 100644 index 0000000..269ce36 --- /dev/null +++ b/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f @@ -0,0 +1,2 @@ +#inner:ExecutableConstruct# + diff --git a/Parser/test-data/nas/ft-min.f b/Parser/test-data/nas/ft-min.f new file mode 100644 index 0000000..a282b98 --- /dev/null +++ b/Parser/test-data/nas/ft-min.f @@ -0,0 +1,300 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- + +! Authors: D. Bailey +! W. Saphir +! H. Jin + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! 
If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: i + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex u0(ntotalp), & + u1(ntotalp) +! > u2(ntotalp) + double precision :: twiddle(ntotalp) +!--------------------------------------------------------------------- +! Large arrays are in common so that they are allocated on the +! heap rather than the stack. This common block is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + +! double complex pad1(3), pad2(3), pad3(3) +! 
common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + double complex pad1(3), pad2(3) + common /bigarrays/ u0, pad1, u1, pad2, twiddle + + integer :: iter + double precision :: total_time, mflops + logical :: verified + character class + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + ! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) + ! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time /= 0. ) then + mflops = 1.0d-6*float(ntotal) * & + (14.8157+7.19641*log(float(ntotal)) & + + (5.23518+7.21113*log(float(ntotal)))*niter) & + /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, & + total_time, mflops, ' floating point', verified, & + npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + END PROGRAM + diff --git a/Parser/test-data/nas/ft.f b/Parser/test-data/nas/ft.f new file mode 100644 index 0000000..c57dcb2 --- /dev/null +++ b/Parser/test-data/nas/ft.f @@ -0,0 +1,1065 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! 
Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! +!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- + +! Authors: D. Bailey +! W. Saphir +! H. Jin + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + implicit none + +! include 'global.h' + integer :: i + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex u0(ntotalp), u1(ntotalp) +! > u2(ntotalp) + double precision :: twiddle(ntotalp) +!--------------------------------------------------------------------- +! Large arrays are in common so that they are allocated on the +! heap rather than the stack. This common block is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + +! double complex pad1(3), pad2(3), pad3(3) +! common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + double complex pad1(3), pad2(3) + common /bigarrays/ u0, pad1, u1, pad2, twiddle + + integer :: iter + double precision :: total_time, mflops + logical :: verified + character class + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. 
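+! (A sketch of what is timed below: one forward fft, then for each of
+!  the niter iterations an evolve step, an inverse fft and a checksum.
+!  The untimed pass above only serves to touch all of the data first.)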
+!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + ! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) + ! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time /= 0. ) then + mflops = 1.0d-6*float(ntotal) * (14.8157+7.19641*log(float(ntotal)) + (5.23518+7.21113*log(float(ntotal)))*niter) /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, total_time, mflops, ' floating point', verified, npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + END PROGRAM + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end subroutine init_ui + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! 
omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end subroutine evolve + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + implicit none + include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer :: k, j + double precision :: x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +! omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end subroutine compute_initial_conditions + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision :: a, result, dummy, q, r + integer :: exponent, n, n2 + external randlc + double precision :: randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent == 0) return + q = a + r = 1 + n = exponent + + + do while (n > 1) + n2 = n/2 + if (n2 * 2 == n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end subroutine ipow46 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: fstatus +!$ integer omp_get_max_threads +!$ external omp_get_max_threads + debug = .FALSE. + + open (unit=2,file='timer.flag',status='old',iostat=fstatus) + if (fstatus == 0) then + timers_enabled = .TRUE. 
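+!        (the mere existence of 'timer.flag' switches the optional
+!         per-phase timers on; the file's contents are never read)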
+ close(2) + else + timers_enabled = .FALSE. + endif + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +!$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. + +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock /= fftblock_default) fftblockpad = fftblock+3 + + return + end subroutine setup + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3 + double precision :: twiddle(d1+1, d2, d3) + integer :: i, j, k, kk, kk2, jj, kj2, ii + double precision :: ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +! 
omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) + do k = 1, dims(3) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + do j = 1, dims(2) + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end subroutine compute_indexmap + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: i + include 'global.h' + double precision :: t, t_m + character(25) :: tstrings(T_max) + data tstrings / ' total ', ' setup ', ' fft ', ' evolve ', ' checksum ', ' fftx ', ' ffty ', ' fftz ' / + + t_m = timer_read(T_total) + if (t_m <= 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end subroutine print_timers + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir == 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end subroutine fft + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer :: i, j, k, jj + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +! 
omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1) + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end subroutine cffts1 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer :: i, j, k, ii + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2) + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end subroutine cffts2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer :: i, j, k, ii + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is) + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end subroutine cffts3 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: m,n,nu,ku,i,j,ln + double precision :: t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. 
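+!
+! (For example, with n = 8 and thus m = 3, the loop below produces:
+!    u(1)   = m
+!    u(2)   = the level-1 root of unity     (ln = 1)
+!    u(3:4) = the level-2 roots of unity    (ln = 2)
+!    u(5:8) = the level-3 roots of unity    (ln = 4)
+!  i.e. level j starts at index 2^(j-1)+1 and holds 2^(j-1) values,
+!  which is the layout fftz2 indexes through ku = li + 1.)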
+!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end subroutine fft_init + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. +!--------------------------------------------------------------------- + mx = u(1) + if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ 'one of the input parameters is invalid', 3I5) + stop + endif + +!--------------------------------------------------------------------- +! Perform one variant of the Stockham FFT. +!--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l == m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +!--------------------------------------------------------------------- +! Copy Y to X. +!--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end subroutine cfftz + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. 
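+! (In iteration l the butterfly below combines inputs n/2 apart,
+!  x(j,i11+k) and x(j,i12+k), and writes results lk = 2^(l-1) apart:
+!  y(j,i21+k) = x11 + x21 and y(j,i22+k) = u1*(x11 - x21), where u1 is
+!  the matching root of unity, conjugated for the inverse transform.)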
+!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is >= 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + + !--------------------------------------------------------------------- + ! This loop is vectorizable. + !--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end subroutine fftz2 + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: n, nn, lg + if (n == 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn < n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end function ilog2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer :: j, q,r,s + double complex chk + chk = (0.0,0.0) + +! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/dble(ntotal) + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end subroutine checksum + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3, nt + character class + logical :: verified + integer :: i + double precision :: err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 == 64 .AND. d2 == 64 .AND. d3 == 64 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Sample size reference checksums + !--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 == 128 .AND. 
d2 == 128 .AND. d3 == 32 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class W size reference checksums + !--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 == 256 .AND. d2 == 256 .AND. d3 == 128 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class A size reference checksums + !--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 == 512 .AND. d2 == 256 .AND. d3 == 256 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! Class B size reference checksums + !--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 == 512 .AND. d2 == 512 .AND. d3 == 512 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! 
Class C size reference checksums + !--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 == 2048 .AND. d2 == 1024 .AND. d3 == 1024 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! Class D size reference checksums + !--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 == 4096 .AND. d2 == 2048 .AND. d3 == 2048 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! 
Class E size reference checksums + !--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class /= 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if ( .NOT. (err <= epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class /= 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end subroutine verify + + diff --git a/Parser/test-data/nas/ft2.f b/Parser/test-data/nas/ft2.f new file mode 100644 index 0000000..63ea2cd --- /dev/null +++ b/Parser/test-data/nas/ft2.f @@ -0,0 +1,910 @@ + + program ft + + if (timers_enabled) call print_timers() + + END PROGRAM + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! 
omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end subroutine init_ui + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + implicit none + ! include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end subroutine evolve + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + implicit none +! include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer :: k, j + double precision :: x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +! omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end subroutine compute_initial_conditions + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision :: a, result, dummy, q, r + integer :: exponent, n, n2 + external randlc + double precision :: randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! 
a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent == 0) return + q = a + r = 1 + n = exponent + + + do while (n > 1) + n2 = n/2 + if (n2 * 2 == n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end subroutine ipow46 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! include 'global.h' + + integer :: fstatus +!$ integer omp_get_max_threads +!$ external omp_get_max_threads + debug = .FALSE. + + open (unit=2,file='timer.flag',status='old',iostat=fstatus) + if (fstatus == 0) then + timers_enabled = .TRUE. + close(2) + else + timers_enabled = .FALSE. + endif + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +!$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. + +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock /= fftblock_default) fftblockpad = fftblock+3 + + return + end subroutine setup + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + implicit none +! include 'global.h' + integer :: d1, d2, d3 + double precision :: twiddle(d1+1, d2, d3) + integer :: i, j, k, kk, kk2, jj, kj2, ii + double precision :: ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +! 
omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) + do k = 1, dims(3) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + do j = 1, dims(2) + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end subroutine compute_indexmap + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: i +! include 'global.h' + double precision :: t, t_m + character(25) :: tstrings(T_max) + data tstrings / ' total ', ' setup ', ' fft ', ' evolve ', ' checksum ', ' fftx ', ' ffty ', ' fftz ' / + + t_m = timer_read(T_total) + if (t_m <= 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end subroutine print_timers + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! include 'global.h' + integer :: dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir == 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end subroutine fft + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! include 'global.h' + integer :: is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer :: i, j, k, jj + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +! 
omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1) + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end subroutine cffts1 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer :: i, j, k, ii + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2) + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end subroutine cffts2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer :: i, j, k, ii + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is) + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end subroutine cffts3 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: m,n,nu,ku,i,j,ln + double precision :: t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. 
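+! Worked example (not in the original NPB source): for n = 8,
+! m = ilog2(8) = 3, the loop below lays u out as
+!   u(1)   = 3                              (m, validated by cfftz)
+!   u(2)   = exp(i*pi*0)                    level 1 (ln = 1)
+!   u(3:4) = exp(i*pi*k/2), k = 0,1         level 2 (ln = 2)
+!   u(5:8) = exp(i*pi*k/4), k = 0..3        level 3 (ln = 4)
+! so the twiddle factors for each butterfly level are contiguous,
+! and fftz2 reads them with stride one starting at u(ku).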
+!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end subroutine fft_init + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. +!--------------------------------------------------------------------- + mx = u(1) + if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ 'one of the input parameters is invalid', 3I5) + stop + endif + +!--------------------------------------------------------------------- +! Perform one variant of the Stockham FFT. +!--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l == m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +!--------------------------------------------------------------------- +! Copy Y to X. +!--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end subroutine cfftz + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. 
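+! Worked example (not in the original NPB source): for m = 3 and
+! l = 1 the code below yields n1 = 4, lk = 1, li = 4, lj = 2 and
+! ku = 5, so block i = 0..3 combines x(:,i+1) with x(:,i+5) into
+! y(:,2*i+1) and y(:,2*i+2) using twiddle factor u(5+i) -- a
+! Stockham butterfly, whose contiguous output ordering avoids a
+! separate bit-reversal pass.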
+!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is >= 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + + !--------------------------------------------------------------------- + ! This loop is vectorizable. + !--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end subroutine fftz2 + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: n, nn, lg + if (n == 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn < n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end function ilog2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer :: j, q,r,s + double complex chk + chk = (0.0,0.0) + +! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/dble(ntotal) + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end subroutine checksum + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3, nt + character class + logical :: verified + integer :: i + double precision :: err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 == 64 .AND. d2 == 64 .AND. d3 == 64 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Sample size reference checksums + !--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 == 128 .AND. 
d2 == 128 .AND. d3 == 32 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class W size reference checksums + !--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 == 256 .AND. d2 == 256 .AND. d3 == 128 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class A size reference checksums + !--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 == 512 .AND. d2 == 256 .AND. d3 == 256 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! Class B size reference checksums + !--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 == 512 .AND. d2 == 512 .AND. d3 == 512 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! 
Class C size reference checksums + !--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 == 2048 .AND. d2 == 1024 .AND. d3 == 1024 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! Class D size reference checksums + !--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 == 4096 .AND. d2 == 2048 .AND. d3 == 2048 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! 
Class E size reference checksums + !--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class /= 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if ( .NOT. (err <= epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class /= 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end subroutine verify + + diff --git a/Parser/test-data/nas/ft3.f b/Parser/test-data/nas/ft3.f new file mode 100644 index 0000000..4dd7019 --- /dev/null +++ b/Parser/test-data/nas/ft3.f @@ -0,0 +1,2815 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! 
+!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- + +! Authors: D. Bailey +! W. Saphir +! H. Jin + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: i + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex u0(ntotalp), u1(ntotalp) +! > u2(ntotalp) + double precision :: twiddle(ntotalp) +!--------------------------------------------------------------------- +! Large arrays are in common so that they are allocated on the +! heap rather than the stack. This common block is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + +! double complex pad1(3), pad2(3), pad3(3) +! common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + double complex pad1(3), pad2(3) + common /bigarrays/ u0, pad1, u1, pad2, twiddle + + integer :: iter + double precision :: total_time, mflops + logical :: verified + character class + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. 
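+! The MFLOP/s figure printed after this timed section uses NPB's
+! fitted operation-count model (the constants appear verbatim in
+! the code below):
+!   mflops = 1e-6 * ntotal
+!          * (14.8157 + 7.19641*ln(ntotal)
+!             + (5.23518 + 7.21113*ln(ntotal))*niter) / total_time
+! i.e. an estimated flop count rather than a measured one; ln() is
+! the natural logarithm, matching Fortran's log().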
+!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + ! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) + ! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time /= 0. ) then + mflops = 1.0d-6*float(ntotal) * (14.8157+7.19641*log(float(ntotal)) + (5.23518+7.21113*log(float(ntotal)))*niter) /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, total_time, mflops, ' floating point', verified, npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + END PROGRAM + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end subroutine init_ui + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! 
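+! Note (an observation about this test fixture, not original NPB
+! text): unlike ft2.f, which still contains include 'global.h'
+! lines, this file repeats the generated npbparams/global
+! declaration block inline in every subroutine, presumably so the
+! parser tests can run on one self-contained source file without
+! include processing.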
+ integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! 
omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end subroutine evolve + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer :: k, j + double precision :: x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +! omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end subroutine compute_initial_conditions + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision :: a, result, dummy, q, r + integer :: exponent, n, n2 + external randlc + double precision :: randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent == 0) return + q = a + r = 1 + n = exponent + + + do while (n > 1) + n2 = n/2 + if (n2 * 2 == n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end subroutine ipow46 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! 
It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: fstatus +! !$ integer omp_get_max_threads +! !$ external omp_get_max_threads + debug = .FALSE. + + open (unit=2,file='timer.flag',status='old',iostat=fstatus) + if (fstatus == 0) then + timers_enabled = .TRUE. + close(2) + else + timers_enabled = .FALSE. 
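+! Note (not in the original NPB source): status='old' makes the
+! open fail with a nonzero iostat when the file does not exist,
+! so timing is switched on simply by creating an empty file named
+! 'timer.flag' in the working directory.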
+ endif + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +! !$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. + +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock /= fftblock_default) fftblockpad = fftblock+3 + + return + end subroutine setup + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 
512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double precision :: twiddle(d1+1, d2, d3) + integer :: i, j, k, kk, kk2, jj, kj2, ii + double precision :: ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) + do k = 1, dims(3) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + do j = 1, dims(2) + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end subroutine compute_indexmap + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: i +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) 
+ character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! 
number of iterations + integer :: niter + common /iter/ niter + double precision :: t, t_m + character(25) :: tstrings(T_max) + data tstrings / ' total ', ' setup ', ' fft ', ' evolve ', ' checksum ', ' fftx ', ' ffty ', ' fftz ' / + + t_m = timer_read(T_total) + if (t_m <= 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end subroutine print_timers + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir == 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end subroutine fft + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! 
The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer :: i, j, k, jj + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +! omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1) + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end subroutine cffts1 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! 
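+! (Illustrative note, not part of the generated header: with these
+!  class A settings the grid holds ntotal = nx*nyp*nz = 256*256*128
+!  = 8388608 = 2**23 points, while the padded size of the big arrays
+!  is ntotalp = nxp*nyp*nz = 257*256*128 = 8421376; the extra plane
+!  from nxp = nx+1 keeps the leading dimension off a power of two,
+!  which avoids accidental cache aliasing between successive lines.)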
+ integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer :: i, j, k, ii + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +! 
omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2) + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end subroutine cffts2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer :: i, j, k, ii + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is) + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end subroutine cffts3 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! 
On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: m,n,nu,ku,i,j,ln + double precision :: t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. +!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end subroutine fft_init + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! 
This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. 
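+! (Worked example, assuming fft_init was called with n = nx = 256:
+!  u(1) then holds mx = ilog2(256) = 8, so any call with is = +1 or
+!  is = -1 and 1 <= m <= 8 passes this guard, while is = 0 or m = 9
+!  would trigger the stop below.)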
+!--------------------------------------------------------------------- + mx = u(1) + if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ 'one of the input parameters is invalid', 3I5) + stop + endif + + !--------------------------------------------------------------------- + ! Perform one variant of the Stockham FFT. + !--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l == m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + + !--------------------------------------------------------------------- + ! Copy Y to X. + !--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end subroutine cfftz + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. +!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is >= 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + + !--------------------------------------------------------------------- + ! This loop is vectorizable. + !--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end subroutine fftz2 + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: n, nn, lg + if (n == 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn < n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end function ilog2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! 
It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer :: j, q,r,s + double complex chk + chk = (0.0,0.0) + +! 
omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/dble(ntotal) + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end subroutine checksum + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! 
other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3, nt + character class + logical :: verified + integer :: i + double precision :: err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 == 64 .AND. d2 == 64 .AND. d3 == 64 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Sample size reference checksums + !--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 == 128 .AND. d2 == 128 .AND. d3 == 32 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class W size reference checksums + !--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 == 256 .AND. d2 == 256 .AND. d3 == 128 .AND. nt == 6) then + !--------------------------------------------------------------------- + ! Class A size reference checksums + !--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 == 512 .AND. d2 == 256 .AND. d3 == 256 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! 
Class B size reference checksums + !--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 == 512 .AND. d2 == 512 .AND. d3 == 512 .AND. nt == 20) then + !--------------------------------------------------------------------- + ! Class C size reference checksums + !--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 == 2048 .AND. d2 == 1024 .AND. d3 == 1024 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! 
Class D size reference checksums + !--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 == 4096 .AND. d2 == 2048 .AND. d3 == 2048 .AND. nt == 25) then + !--------------------------------------------------------------------- + ! 
Class E size reference checksums + !--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class /= 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if ( .NOT. (err <= epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class /= 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end subroutine verify + + diff --git a/Parser/test-data/nas/ft4.f b/Parser/test-data/nas/ft4.f new file mode 100644 index 0000000..6cef1c4 --- /dev/null +++ b/Parser/test-data/nas/ft4.f @@ -0,0 +1,1106 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! 
+!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- + +! Authors: D. Bailey +! W. Saphir +! H. Jin + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + implicit none + + include 'global.h' + integer :: i + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex u0(ntotalp), & + u1(ntotalp) +! > u2(ntotalp) + double precision :: twiddle(ntotalp) +!--------------------------------------------------------------------- +! Large arrays are in common so that they are allocated on the +! heap rather than the stack. This common block is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + +! double complex pad1(3), pad2(3), pad3(3) +! common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + double complex pad1(3), pad2(3) + common /bigarrays/ u0, pad1, u1, pad2, twiddle + + integer :: iter + double precision :: total_time, mflops + logical :: verified + character class + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. 
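+! (Note: the warm-up pass above is discarded at this point; the timers
+! are cleared again, so T_total below covers the index map, the initial
+! conditions, fft_init and every timed iteration from a warm start.)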
+!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + ! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) + ! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time /= 0. ) then + mflops = 1.0d-6*float(ntotal) * & + (14.8157+7.19641*log(float(ntotal)) & + + (5.23518+7.21113*log(float(ntotal)))*niter) & + /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, & + total_time, mflops, ' floating point', verified, & + npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + END PROGRAM + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end subroutine init_ui + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! 
omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end subroutine evolve + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + implicit none + include 'global.h' + integer :: d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer :: k, j + double precision :: x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +! omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end subroutine compute_initial_conditions + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision :: a, result, dummy, q, r + integer :: exponent, n, n2 + external randlc + double precision :: randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent == 0) return + q = a + r = 1 + n = exponent + + + do while (n > 1) + n2 = n/2 + if (n2 * 2 == n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end subroutine ipow46 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: fstatus +!$ integer omp_get_max_threads +!$ external omp_get_max_threads + debug = .FALSE. + + open (unit=2,file='timer.flag',status='old',iostat=fstatus) + if (fstatus == 0) then + timers_enabled = .TRUE. 
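+         ! timers are enabled simply by the presence of a timer.flag
+         ! file in the working directory (e.g. created beforehand with
+         ! 'touch timer.flag'); its contents are never read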
+ close(2) + else + timers_enabled = .FALSE. + endif + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +!$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', & + ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. + +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock /= fftblock_default) fftblockpad = fftblock+3 + + return + end subroutine setup + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3 + double precision :: twiddle(d1+1, d2, d3) + integer :: i, j, k, kk, kk2, jj, kj2, ii + double precision :: ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +! 
omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) + do k = 1, dims(3) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + do j = 1, dims(2) + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end subroutine compute_indexmap + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: i + include 'global.h' + double precision :: t, t_m + character(25) :: tstrings(T_max) + data tstrings / ' total ', & + ' setup ', & + ' fft ', & + ' evolve ', & + ' checksum ', & + ' fftx ', & + ' ffty ', & + ' fftz ' / + + t_m = timer_read(T_total) + if (t_m <= 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end subroutine print_timers + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), & + y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir == 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end subroutine fft + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer :: i, j, k, jj + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +! omp parallel do default(shared) private(i,j,k,jj,y1,y2) +! 
omp& shared(is,logd1,d1) + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end subroutine cffts1 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer :: i, j, k, ii + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) +! omp& shared(is,logd2,d2) + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end subroutine cffts2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + + include 'global.h' + integer :: is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer :: i, j, k, ii + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) +! omp& shared(is) + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end subroutine cffts3 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: m,n,nu,ku,i,j,ln + double precision :: t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. 
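+! Editor's note (illustrative, not part of the original NPB source):
+! for n = 8 the loop below fills m = ilog2(8) = 3 levels of twiddle
+! factors, packed one level after another:
+!     u(1)   = 3                          (the value of m, read back
+!                                          later by cfftz)
+!     u(2)   = exp(i*pi*0/1)              level ln = 1
+!     u(3:4) = exp(i*pi*{0,1}/2)          level ln = 2
+!     u(5:8) = exp(i*pi*{0,1,2,3}/4)      level ln = 4
+! i.e. exactly 1 + 1 + 2 + 4 = n entries of u are filled, which is
+! why u only needs to cover the largest transform length.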
+!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end subroutine fft_init + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + + integer :: is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. +!--------------------------------------------------------------------- + mx = u(1) + if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) & + then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ & + 'one of the input parameters is invalid', 3I5) + stop + endif + +!--------------------------------------------------------------------- +! Perform one variant of the Stockham FFT. +!--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l == m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +!--------------------------------------------------------------------- +! Copy Y to X. +!--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end subroutine cfftz + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. 
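+! Editor's note (worked example, not in the original NPB source):
+! for m = 3 (n = 8) and l = 1 this gives n1 = 4, lk = 1, li = 4,
+! lj = 2 and ku = 5, so each butterfly below combines columns i+1
+! and i+5 of x into columns 2*i+1 and 2*i+2 of y for i = 0..3,
+! using the twiddle factors u(5:8); the inner j loop over the ny
+! block lanes is the vectorizable one.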
+!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is >= 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + + !--------------------------------------------------------------------- + ! This loop is vectorizable. + !--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end subroutine fftz2 + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: n, nn, lg + if (n == 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn < n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end function ilog2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer :: j, q,r,s + double complex chk + chk = (0.0,0.0) + +! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/dble(ntotal) + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end subroutine checksum + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + include 'global.h' + integer :: d1, d2, d3, nt + character class + logical :: verified + integer :: i + double precision :: err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 == 64 .AND. & + d2 == 64 .AND. & + d3 == 64 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! 
Sample size reference checksums + !--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 == 128 .AND. & + d2 == 128 .AND. & + d3 == 32 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! Class W size reference checksums + !--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 == 256 .AND. & + d2 == 256 .AND. & + d3 == 128 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! Class A size reference checksums + !--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 == 512 .AND. & + d2 == 256 .AND. & + d3 == 256 .AND. & + nt == 20) then + !--------------------------------------------------------------------- + ! Class B size reference checksums + !--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 == 512 .AND. & + d2 == 512 .AND. & + d3 == 512 .AND. & + nt == 20) then + !--------------------------------------------------------------------- + ! 
Class C size reference checksums + !--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 == 2048 .AND. & + d2 == 1024 .AND. & + d3 == 1024 .AND. & + nt == 25) then + !--------------------------------------------------------------------- + ! Class D size reference checksums + !--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 == 4096 .AND. & + d2 == 2048 .AND. & + d3 == 2048 .AND. & + nt == 25) then + !--------------------------------------------------------------------- + ! 
Class E size reference checksums + !--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class /= 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if ( .NOT. (err <= epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class /= 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end subroutine verify + + diff --git a/Parser/test-data/nas/ft5.f b/Parser/test-data/nas/ft5.f new file mode 100644 index 0000000..47dafa5 --- /dev/null +++ b/Parser/test-data/nas/ft5.f @@ -0,0 +1,2954 @@ +!-------------------------------------------------------------------------! +! ! +! N A S P A R A L L E L B E N C H M A R K S 3.3 ! +! ! +! O p e n M P V E R S I O N ! +! ! +! F T ! +! ! +!-------------------------------------------------------------------------! +! ! +! This benchmark is an OpenMP version of the NPB FT code. ! +! It is described in NAS Technical Report 99-011. ! +! ! +! Permission to use, copy, distribute and modify this software ! +! for any purpose with or without fee is hereby granted. We ! +! request, however, that all derived work reference the NAS ! +! Parallel Benchmarks 3.3. This software is provided "as is" ! +! without express or implied warranty. ! +! ! +! Information on NPB 3.3, including the technical report, the ! +! original specifications, source code, results and information ! +! on how to submit new results, is available at: ! +! ! +! http://www.nas.nasa.gov/Software/NPB/ ! +! ! +! Send comments or suggestions to npb@nas.nasa.gov ! +! ! +! NAS Parallel Benchmarks Group ! +! NASA Ames Research Center ! +! Mail Stop: T27A-1 ! +! Moffett Field, CA 94035-1000 ! +! ! +! E-mail: npb@nas.nasa.gov ! +! Fax: (650) 604-3957 ! +! ! 
+!-------------------------------------------------------------------------! + +!--------------------------------------------------------------------- + +! Authors: D. Bailey +! W. Saphir +! H. Jin + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! FT benchmark +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + program ft + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: i + +!--------------------------------------------------------------------- +! u0, u1, u2 are the main arrays in the problem. +! Depending on the decomposition, these arrays will have different +! dimensions. To accomodate all possibilities, we allocate them as +! one-dimensional arrays and pass them to subroutines for different +! views +! - u0 contains the initial (transformed) initial condition +! - u1 and u2 are working arrays +! - twiddle contains exponents for the time evolution operator. +!--------------------------------------------------------------------- + + double complex u0(ntotalp), & + u1(ntotalp) +! > u2(ntotalp) + double precision :: twiddle(ntotalp) +!--------------------------------------------------------------------- +! Large arrays are in common so that they are allocated on the +! heap rather than the stack. This common block is not +! referenced directly anywhere else. Padding is to avoid accidental +! cache problems, since all array sizes are powers of two. +!--------------------------------------------------------------------- + +! double complex pad1(3), pad2(3), pad3(3) +! common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle + double complex pad1(3), pad2(3) + common /bigarrays/ u0, pad1, u1, pad2, twiddle + + integer :: iter + double precision :: total_time, mflops + logical :: verified + character class + +!--------------------------------------------------------------------- +! Run the entire problem once to make sure all data is touched. +! This reduces variable startup costs, which is important for such a +! short benchmark. The other NPB 2 implementations are similar. +!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + call setup() + call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3)) + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + call fft_init (dims(1)) + call fft(1, u1, u0) + +!--------------------------------------------------------------------- +! Start over from the beginning. Note that all operations must +! be timed, in contrast to other benchmarks. 
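+! Editor's note (not in the original NPB source): the untimed pass
+! above runs the whole problem once so that first-touch page
+! allocation and cache warm-up costs are paid before the timed
+! region, which begins at timer_start(T_total) below.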
+!--------------------------------------------------------------------- + do i = 1, t_max + call timer_clear(i) + end do + + call timer_start(T_total) + if (timers_enabled) call timer_start(T_setup) + + call compute_indexmap(twiddle, dims(1), dims(2), dims(3)) + + call compute_initial_conditions(u1, dims(1), dims(2), dims(3)) + + call fft_init (dims(1)) + + if (timers_enabled) call timer_stop(T_setup) + if (timers_enabled) call timer_start(T_fft) + call fft(1, u1, u0) + if (timers_enabled) call timer_stop(T_fft) + + do iter = 1, niter + if (timers_enabled) call timer_start(T_evolve) + call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_evolve) + if (timers_enabled) call timer_start(T_fft) + ! call fft(-1, u1, u2) + call fft(-1, u1, u1) + if (timers_enabled) call timer_stop(T_fft) + if (timers_enabled) call timer_start(T_checksum) + ! call checksum(iter, u2, dims(1), dims(2), dims(3)) + call checksum(iter, u1, dims(1), dims(2), dims(3)) + if (timers_enabled) call timer_stop(T_checksum) + end do + + call verify(nx, ny, nz, niter, verified, class) + + call timer_stop(t_total) + total_time = timer_read(t_total) + + if( total_time /= 0. ) then + mflops = 1.0d-6*float(ntotal) * & + (14.8157+7.19641*log(float(ntotal)) & + + (5.23518+7.21113*log(float(ntotal)))*niter) & + /total_time + else + mflops = 0.0 + endif + call print_results('FT', class, nx, ny, nz, niter, & + total_time, mflops, ' floating point', verified, & + npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) + if (timers_enabled) call print_timers() + + END PROGRAM + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine init_ui(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! touch all the big data +!--------------------------------------------------------------------- + + implicit none + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = 0.d0 + u1(i,j,k) = 0.d0 + twiddle(i,j,k) = 0.d0 + end do + end do + end do + + return + end subroutine init_ui + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine evolve(u0, u1, twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! evolve u0 -> u1 (t time steps) in fourier space +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! 
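+! Editor's note (not in the original NPB source): each call applies
+! one time step in Fourier space, multiplying every mode of u0 by
+! its factor exp(-4*alpha*pi**2*(ibar**2+jbar**2+kbar**2)) computed
+! in compute_indexmap, so after t calls the accumulated exponent is
+! proportional to t; u1 receives a copy for the inverse FFT.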
+ integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double complex u0(d1+1,d2,d3) + double complex u1(d1+1,d2,d3) + double precision :: twiddle(d1+1,d2,d3) + integer :: i, j, k + +! 
omp parallel do default(shared) private(i,j,k) + do k = 1, d3 + do j = 1, d2 + do i = 1, d1 + u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k) + u1(i,j,k) = u0(i,j,k) + end do + end do + end do + + return + end subroutine evolve + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_initial_conditions(u0, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Fill in array u0 with initial conditions from +! random number generator +!--------------------------------------------------------------------- + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double complex u0(d1+1, d2, d3) + integer :: k, j + double precision :: x0, start, an, dummy, starts(nz) + + + start = seed +!--------------------------------------------------------------------- +! Jump to the starting element for our first plane. +!--------------------------------------------------------------------- + call ipow46(a, 0, an) + dummy = randlc(start, an) + call ipow46(a, 2*nx*ny, an) + + starts(1) = start + do k = 2, dims(3) + dummy = randlc(start, an) + starts(k) = start + end do + +!--------------------------------------------------------------------- +! Go through by z planes filling in one square at a time. +!--------------------------------------------------------------------- +! omp parallel do default(shared) private(k,j,x0) + do k = 1, dims(3) + x0 = starts(k) + do j = 1, dims(2) + call vranlc(2*nx, x0, a, u0(1, j, k)) + end do + end do + + return + end subroutine compute_initial_conditions + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine ipow46(a, exponent, result) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute a^exponent mod 2^46 +!--------------------------------------------------------------------- + + implicit none + double precision :: a, result, dummy, q, r + integer :: exponent, n, n2 + external randlc + double precision :: randlc +!--------------------------------------------------------------------- +! Use +! a^n = a^(n/2)*a^(n/2) if n even else +! a^n = a*a^(n-1) if n odd +!--------------------------------------------------------------------- + result = 1 + if (exponent == 0) return + q = a + r = 1 + n = exponent + + + do while (n > 1) + n2 = n/2 + if (n2 * 2 == n) then + dummy = randlc(q, q) + n = n2 + else + dummy = randlc(r, q) + n = n-1 + endif + end do + dummy = randlc(r, q) + result = r + return + end subroutine ipow46 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine setup + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! 
This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: fstatus +!$ integer omp_get_max_threads +!$ external omp_get_max_threads + debug = .FALSE. + + open (unit=2,file='timer.flag',status='old',iostat=fstatus) + if (fstatus == 0) then + timers_enabled = .TRUE. 
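+! Editor's note (not in the original NPB source): timing is enabled
+! simply by the presence of a file named 'timer.flag' in the working
+! directory; its contents are ignored and the unit is closed at once.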
+ close(2) + else + timers_enabled = .FALSE. + endif + + write(*, 1000) + + niter = niter_default + + write(*, 1001) nx, ny, nz + write(*, 1002) niter +!$ write(*, 1003) omp_get_max_threads() + write(*, *) + + + 1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', & + ' - FT Benchmark', /) + 1001 format(' Size : ', i4, 'x', i4, 'x', i4) + 1002 format(' Iterations :', i7) + 1003 format(' Number of available threads :', i7) + + dims(1) = nx + dims(2) = ny + dims(3) = nz + + +!--------------------------------------------------------------------- +! Set up info for blocking of ffts and transposes. This improves +! performance on cache-based systems. Blocking involves +! working on a chunk of the problem at a time, taking chunks +! along the first, second, or third dimension. + +! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) +! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) + +! Since 1st dim is always in processor, we'll assume it's long enough +! (default blocking factor is 16 so min size for 1st dim is 16) +! The only case we have to worry about is cffts1 in a 2d decomposition. +! so the blocking factor should not be larger than the 2nd dimension. +!--------------------------------------------------------------------- + + fftblock = fftblock_default + fftblockpad = fftblockpad_default + + if (fftblock /= fftblock_default) fftblockpad = fftblock+3 + + return + end subroutine setup + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine compute_indexmap(twiddle, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 +! for time evolution exponent. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! 
dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3 + double precision :: twiddle(d1+1, d2, d3) + integer :: i, j, k, kk, kk2, jj, kj2, ii + double precision :: ap + +!--------------------------------------------------------------------- +! basically we want to convert the fortran indices +! 1 2 3 4 5 6 7 8 +! to +! 0 1 2 3 -4 -3 -2 -1 +! The following magic formula does the trick: +! mod(i-1+n/2, n) - n/2 +!--------------------------------------------------------------------- + + ap = - 4.d0 * alpha * pi *pi + +! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii) + do k = 1, dims(3) + kk = mod(k-1+nz/2, nz) - nz/2 + kk2 = kk*kk + do j = 1, dims(2) + jj = mod(j-1+ny/2, ny) - ny/2 + kj2 = jj*jj+kk2 + do i = 1, dims(1) + ii = mod(i-1+nx/2, nx) - nx/2 + twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2)) + end do + end do + end do + + return + end subroutine compute_indexmap + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine print_timers() + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: i +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! 
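+! Editor's note (not in the original NPB source): the percentages
+! printed below are taken relative to the T_total timer; t_m is
+! clamped to 1.0 when that timer reads zero so the division in the
+! percentage column can never blow up.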
+ integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! 
number of iterations + integer :: niter + common /iter/ niter + double precision :: t, t_m + character(25) :: tstrings(T_max) + data tstrings / ' total ', & + ' setup ', & + ' fft ', & + ' evolve ', & + ' checksum ', & + ' fftx ', & + ' ffty ', & + ' fftz ' / + + t_m = timer_read(T_total) + if (t_m <= 0.0d0) t_m = 1.0d0 + do i = 1, t_max + t = timer_read(i) + write(*, 100) i, tstrings(i), t, t*100.0/t_m + end do + 100 format(' timer ', i2, '(', A16, ') :', F9.4, ' (',F6.2,'%)') + return + end subroutine print_timers + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft(dir, x1, x2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: dir + double complex x1(ntotalp), x2(ntotalp) + + double complex y1(fftblockpad_default*maxdim), & + y2(fftblockpad_default*maxdim) + +!--------------------------------------------------------------------- +! note: args x1, x2 must be different arrays +! note: args for cfftsx are (direction, layout, xin, xout, scratch) +! xin/xout may be the same and it can be somewhat faster +! if they are +!--------------------------------------------------------------------- + + if (dir == 1) then + call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + else + call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2) + call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2) + endif + return + end subroutine fft + + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! 
The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd1 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d1), y2(fftblockpad, d1) + integer :: i, j, k, jj + + logd1 = ilog2(d1) + + if (timers_enabled) call timer_start(T_fftx) +! omp parallel do default(shared) private(i,j,k,jj,y1,y2) +! omp& shared(is,logd1,d1) + do k = 1, d3 + do jj = 0, d2 - fftblock, fftblock + do j = 1, fftblock + do i = 1, d1 + y1(j,i) = x(i,j+jj,k) + enddo + enddo + + call cfftz (is, logd1, d1, y1, y2) + + + do j = 1, fftblock + do i = 1, d1 + xout(i,j+jj,k) = y1(j,i) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftx) + + return + end subroutine cffts1 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! 
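+! (The parameter block below is inlined from the setparams-generated
+! header and recurs verbatim in every subprogram of this file.)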
+ integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd2 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d2), y2(fftblockpad, d2) + integer :: i, j, k, ii + + logd2 = ilog2(d2) + + if (timers_enabled) call timer_start(T_ffty) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) +! 
omp& shared(is,logd2,d2) + do k = 1, d3 + do ii = 0, d1 - fftblock, fftblock + do j = 1, d2 + do i = 1, fftblock + y1(i,j) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd2, d2, y1, y2) + + do j = 1, d2 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,j) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_ffty) + + return + end subroutine cffts2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! 
the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: is, d1, d2, d3, logd3 + double complex x(d1+1,d2,d3) + double complex xout(d1+1,d2,d3) + double complex y1(fftblockpad, d3), y2(fftblockpad, d3) + integer :: i, j, k, ii + + logd3 = ilog2(d3) + + if (timers_enabled) call timer_start(T_fftz) +! omp parallel do default(shared) private(i,j,k,ii,y1,y2) +! omp& shared(is) + do j = 1, d2 + do ii = 0, d1 - fftblock, fftblock + do k = 1, d3 + do i = 1, fftblock + y1(i,k) = x(i+ii,j,k) + enddo + enddo + + call cfftz (is, logd3, d3, y1, y2) + + do k = 1, d3 + do i = 1, fftblock + xout(i+ii,j,k) = y1(i,k) + enddo + enddo + enddo + enddo + if (timers_enabled) call timer_stop(T_fftz) + + return + end subroutine cffts3 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fft_init (n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! compute the roots-of-unity array that will be used for subsequent FFTs. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! 
On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: m,n,nu,ku,i,j,ln + double precision :: t, ti + + +!--------------------------------------------------------------------- +! Initialize the U array with sines and cosines in a manner that permits +! stride one access at each FFT iteration. +!--------------------------------------------------------------------- + nu = n + m = ilog2(n) + u(1) = m + ku = 2 + ln = 1 + + do j = 1, m + t = pi / ln + + do i = 0, ln - 1 + ti = i * t + u(i+ku) = dcmplx (cos (ti), sin(ti)) + enddo + + ku = ku + ln + ln = 2 * ln + enddo + + return + end subroutine fft_init + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine cfftz (is, m, n, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Computes NY N-point complex-to-complex FFTs of X using an algorithm due +! to Swarztrauber. X is both the input and the output array, while Y is a +! scratch array. It is assumed that N = 2^M. Before calling CFFTZ to +! perform FFTs, the array U must be initialized by calling CFFTZ with IS +! set to 0 and M set to MX, where MX is the maximum value of M for any +! subsequent call. +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! 
This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + + integer :: is,m,n,i,j,l,mx + double complex x, y + + dimension x(fftblockpad,n), y(fftblockpad,n) + +!--------------------------------------------------------------------- +! Check if input parameters are invalid. 
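+! (fft_init stores M = log2(N) in U(1), so the check below also rejects
+! calls made before the roots-of-unity table has been initialized.)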
+!--------------------------------------------------------------------- + mx = u(1) + if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) & + then + write (*, 1) is, m, mx + 1 format ('CFFTZ: Either U has not been initialized, or else'/ & + 'one of the input parameters is invalid', 3I5) + stop + endif + +!--------------------------------------------------------------------- +! Perform one variant of the Stockham FFT. +!--------------------------------------------------------------------- + do l = 1, m, 2 + call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) + if (l == m) goto 160 + call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) + enddo + + goto 180 + +!--------------------------------------------------------------------- +! Copy Y to X. +!--------------------------------------------------------------------- + 160 do j = 1, n + do i = 1, fftblock + x(i,j) = y(i,j) + enddo + enddo + + 180 continue + + return + end subroutine cfftz + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + +!--------------------------------------------------------------------- +! Performs the L-th iteration of the second variant of the Stockham FFT. +!--------------------------------------------------------------------- + + implicit none + + integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 + double complex u,x,y,u1,x11,x21 + dimension u(n), x(ny1,n), y(ny1,n) + + +!--------------------------------------------------------------------- +! Set initial parameters. +!--------------------------------------------------------------------- + + n1 = n / 2 + lk = 2 ** (l - 1) + li = 2 ** (m - l) + lj = 2 * lk + ku = li + 1 + + do i = 0, li - 1 + i11 = i * lk + 1 + i12 = i11 + n1 + i21 = i * lj + 1 + i22 = i21 + lk + if (is >= 1) then + u1 = u(ku+i) + else + u1 = dconjg (u(ku+i)) + endif + + !--------------------------------------------------------------------- + ! This loop is vectorizable. + !--------------------------------------------------------------------- + do k = 0, lk - 1 + do j = 1, ny + x11 = x(j,i11+k) + x21 = x(j,i12+k) + y(j,i21+k) = x11 + x21 + y(j,i22+k) = u1 * (x11 - x21) + enddo + enddo + enddo + + return + end subroutine fftz2 + +!--------------------------------------------------------------------- + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + integer function ilog2(n) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none + integer :: n, nn, lg + if (n == 1) then + ilog2=0 + return + endif + lg = 1 + nn = 2 + do while (nn < n) + nn = nn*2 + lg = lg+1 + end do + ilog2 = lg + return + end function ilog2 + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine checksum(i, u1, d1, d2, d3) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! 
It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: i, d1, d2, d3 + double complex u1(d1+1,d2,d3) + integer :: j, q,r,s + double complex chk + chk = (0.0,0.0) + +! 
omp parallel do default(shared) private(i,q,r,s) reduction(+:chk) + do j=1,1024 + q = mod(j, nx)+1 + r = mod(3*j,ny)+1 + s = mod(5*j,nz)+1 + chk=chk+u1(q,r,s) + end do + + chk = chk/dble(ntotal) + + write (*, 30) i, chk + 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) + sums(i) = chk + return + end subroutine checksum + + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + subroutine verify (d1, d2, d3, nt, verified, class) + +!--------------------------------------------------------------------- +!--------------------------------------------------------------------- + + implicit none +! CLASS = A +! +! +! This file is generated automatically by the setparams utility. +! It sets the number of processors and the class of the NPB +! in this directory. Do not modify it by hand. +! + integer nx, ny, nz, maxdim, niter_default + integer ntotal, nxp, nyp, ntotalp + parameter (nx=256, ny=256, nz=128, maxdim=256) + parameter (niter_default=6) + parameter (nxp=nx+1, nyp=ny) + parameter (ntotal=nx*nyp*nz) + parameter (ntotalp=nxp*nyp*nz) + logical convertdouble + parameter (convertdouble = .false.) + character compiletime*11 + parameter (compiletime='18 Oct 2016') + character npbversion*5 + parameter (npbversion='3.3.1') + character cs1*8 + parameter (cs1='gfortran') + character cs2*6 + parameter (cs2='$(F77)') + character cs3*6 + parameter (cs3='(none)') + character cs4*6 + parameter (cs4='(none)') + character cs5*40 + parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium') + character cs6*28 + parameter (cs6='-O3 -fopenmp -mcmodel=medium') + character cs7*6 + parameter (cs7='randi8') + + +! If processor array is 1x1 -> 0D grid decomposition + + +! Cache blocking params. These values are good for most +! RISC processors. +! FFT parameters: +! fftblock controls how many ffts are done at a time. +! The default is appropriate for most cache-based machines +! On vector machines, the FFT can be vectorized with vector +! length equal to the block size, so the block size should +! be as large as possible. This is the size of the smallest +! dimension of the problem: 128 for class A, 256 for class B and +! 512 for class C. + + integer :: fftblock_default, fftblockpad_default +! parameter (fftblock_default=16, fftblockpad_default=18) + parameter (fftblock_default=32, fftblockpad_default=33) + + integer :: fftblock, fftblockpad + common /blockinfo/ fftblock, fftblockpad + +! we need a bunch of logic to keep track of how +! arrays are laid out. + + +! Note: this serial version is the derived from the parallel 0D case +! of the ft NPB. +! The computation proceeds logically as + +! set up initial conditions +! fftx(1) +! transpose (1->2) +! ffty(2) +! transpose (2->3) +! fftz(3) +! time evolution +! fftz(3) +! transpose (3->2) +! ffty(2) +! transpose (2->1) +! fftx(1) +! compute residual(1) + +! for the 0D, 1D, 2D strategies, the layouts look like xxx + +! 0D 1D 2D +! 1: xyz xyz xyz + +! the array dimensions are stored in dims(coord, phase) + integer :: dims(3) + common /layout/ dims + + integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, & + T_fftx, T_ffty, & + T_fftz, T_max + parameter (T_total = 1, T_setup = 2, T_fft = 3, & + T_evolve = 4, T_checksum = 5, & + T_fftx = 6, & + T_ffty = 7, & + T_fftz = 8, T_max = 8) + + + + logical :: timers_enabled + + + external timer_read + double precision :: timer_read + external ilog2 + integer :: ilog2 + + external randlc + double precision :: randlc + + +! 
other stuff + logical :: debug, debugsynch + common /dbg/ debug, debugsynch, timers_enabled + + double precision :: seed, a, pi, alpha + parameter (seed = 314159265.d0, a = 1220703125.d0, & + pi = 3.141592653589793238d0, alpha=1.0d-6) + + +! roots of unity array +! relies on x being largest dimension? + double complex u(nxp) + common /ucomm/ u + + +! for checksum data + double complex sums(0:niter_default) + common /sumcomm/ sums + +! number of iterations + integer :: niter + common /iter/ niter + integer :: d1, d2, d3, nt + character class + logical :: verified + integer :: i + double precision :: err, epsilon + +!--------------------------------------------------------------------- +! Reference checksums +!--------------------------------------------------------------------- + double complex csum_ref(25) + + + class = 'U' + + epsilon = 1.0d-12 + verified = .FALSE. + + if (d1 == 64 .AND. & + d2 == 64 .AND. & + d3 == 64 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! Sample size reference checksums + !--------------------------------------------------------------------- + class = 'S' + csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) + csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) + csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) + csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) + csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) + csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) + + else if (d1 == 128 .AND. & + d2 == 128 .AND. & + d3 == 32 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! Class W size reference checksums + !--------------------------------------------------------------------- + class = 'W' + csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) + csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) + csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) + csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) + csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) + csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) + + else if (d1 == 256 .AND. & + d2 == 256 .AND. & + d3 == 128 .AND. & + nt == 6) then + !--------------------------------------------------------------------- + ! Class A size reference checksums + !--------------------------------------------------------------------- + class = 'A' + csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) + csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) + csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) + csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) + csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) + csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) + + else if (d1 == 512 .AND. & + d2 == 256 .AND. & + d3 == 256 .AND. & + nt == 20) then + !--------------------------------------------------------------------- + ! 
Class B size reference checksums + !--------------------------------------------------------------------- + class = 'B' + csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) + csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) + csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) + csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) + csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) + csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) + csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) + csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) + csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) + csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) + csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) + csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) + csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) + csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) + csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) + csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) + csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) + csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) + csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) + csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) + + else if (d1 == 512 .AND. & + d2 == 512 .AND. & + d3 == 512 .AND. & + nt == 20) then + !--------------------------------------------------------------------- + ! Class C size reference checksums + !--------------------------------------------------------------------- + class = 'C' + csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) + csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) + csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) + csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) + csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) + csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) + csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) + csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) + csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) + csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) + csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) + csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) + csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) + csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) + csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) + csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) + csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) + csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) + csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) + csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) + + else if (d1 == 2048 .AND. & + d2 == 1024 .AND. & + d3 == 1024 .AND. & + nt == 25) then + !--------------------------------------------------------------------- + ! 
Class D size reference checksums + !--------------------------------------------------------------------- + class = 'D' + csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) + csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) + csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) + csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) + csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) + csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) + csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) + csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) + csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) + csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) + csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) + csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) + csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) + csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) + csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) + csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) + csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) + csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) + csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) + csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) + csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) + csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) + csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) + csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) + csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) + + else if (d1 == 4096 .AND. & + d2 == 2048 .AND. & + d3 == 2048 .AND. & + nt == 25) then + !--------------------------------------------------------------------- + ! 
Class E size reference checksums + !--------------------------------------------------------------------- + class = 'E' + csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) + csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) + csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) + csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) + csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) + csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) + csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) + csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) + csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) + csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) + csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) + csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) + csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) + csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) + csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) + csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) + csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) + csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) + csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) + csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) + csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) + csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) + csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) + csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) + csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) + + endif + + + if (class /= 'U') then + + do i = 1, nt + err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) + if ( .NOT. (err <= epsilon)) goto 100 + end do + verified = .TRUE. + 100 continue + + endif + + + if (class /= 'U') then + if (verified) then + write(*,2000) + 2000 format(' Result verification successful') + else + write(*,2001) + 2001 format(' Result verification failed') + endif + endif + print *, 'class = ', class + + return + end subroutine verify + + diff --git a/Parser/test-data/rules/R503a.f90 b/Parser/test-data/rules/R503a.f90 new file mode 100644 index 0000000..ea8476f --- /dev/null +++ b/Parser/test-data/rules/R503a.f90 @@ -0,0 +1,14 @@ +real(RDP), intent(in) :: d(:) +real(RDP), intent(in) :: d(0:) +real(RDP), intent(in) :: d(:,0:) +real(RDP), intent(in) :: u_v( :,:) +real(RDP), intent(inout) :: r_e( :,:,:) +real(RDP), intent(inout) :: r_v( :,:) +REAL, DIMENSION (N, 10) :: W +REAL A (:), B (0:) +REAL, POINTER :: D (:, :) +REAL, DIMENSION (:), POINTER :: P +REAL, ALLOCATABLE, DIMENSION (:) :: E +REAL, PARAMETER :: V(0:*) = [0.1, 1.1] + +end diff --git a/Parser/test-data/rules/tmp.f90 b/Parser/test-data/rules/tmp.f90 new file mode 100644 index 0000000..70f6222 --- /dev/null +++ b/Parser/test-data/rules/tmp.f90 @@ -0,0 +1,4 @@ +!$acc parallel async(2+2) +x = #test# +!$acc end parallel +end diff --git a/Parser/test-data/slots/block.f b/Parser/test-data/slots/block.f new file mode 100644 index 0000000..52ffae9 --- /dev/null +++ b/Parser/test-data/slots/block.f @@ -0,0 +1,7 @@ +! 
BlockSlot
+
+ do x = 0,p
+ #test#
+ end do
+
+end program
diff --git a/Parser/test-data/specht/array_operations.f b/Parser/test-data/specht/array_operations.f
new file mode 100644
index 0000000..6e2be42
--- /dev/null
+++ b/Parser/test-data/specht/array_operations.f
@@ -0,0 +1,1166 @@
+!> \file array_operations.f
+!> \brief Standard operations with arrays
+!> \author Immo Huismann
+!> \date 2015/01/05
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> ## Rationale ##
+!> This module provides operations on arrays that work on the targeted hardware,
+!> providing an abstraction for subsequent modules.
+!> As the PGI compiler is completely useless when compiling simple things like
+!>
+!>   !$acc kernels
+!>   x = y * z
+!>   !$acc end kernels
+!>
+!> (it produces working, but very slow variants of the operation), this module
+!> provides an easy way to get things working with the compiler.
+!>
+!> ## Programming ##
+!> To enable the compilers to do some more optimization, all public routines
+!> possess an explicit-shape interface, while the background implementations
+!> they call use an explicit-size interface and only one loop.
+!>
+!> All routines inside this module possess a similar interface. The first array
+!> passed is the one that will be set. Every other array has intent(in).
+!> If a vector receives a factor, the factor is passed directly before that
+!> vector.
+!>
+!> Inside this module all arrays are named x, y, or z (and w in one instance).
+!> Scalars are a, b, c, etc.
+!>
+!> \todo
+!> * Most routines have an interface now. Those that do not need to be purged
+!>   or moved to a legacy module that can be purged given enough time.
+!> * Redo the comments on all routines that don't sport formulas in brief.
+!===============================================================================
+
+module Array_Operations
+  ! HiBASE modules
+  use Kind_Parameters, only: RDP
+  use Constants,       only: ONE
+
+  ! 
Specht base modules + use ACC_Parameters, only: ACC_EXEC_QUEUE + implicit none + private + + public :: AddScaledVector + public :: Scale + public :: ScaleAndAddVector + public :: MultiplyWithVector + + public :: SetToZero + public :: SetToValue + public :: SetToVector + public :: SetToScaledVector + public :: SetToTwoScaledVectors + public :: SetToThreeScaledVectors + public :: SetToMultipliedVectors + + public :: PointWiseInversion + public :: PointWiseSqrt + + !----------------------------------------------------------------------------- + !> \brief Sets a vector to zero: x <- 0 + !> \author Immo Huismann + + interface SetToZero + module procedure SetToZero_1 + module procedure SetToZero_2 + module procedure SetToZero_3 + module procedure SetToZero_4 + + module procedure SetToZero_432 + end interface SetToZero + + !----------------------------------------------------------------------------- + !> \brief Sets a vector to another one: x <- y + !> \author Immo Huismann + + interface SetToVector + module procedure SetToVector_2 + module procedure SetToVector_3 + module procedure SetToVector_4 + end interface SetToVector + + !----------------------------------------------------------------------------- + !> \brief Scales a vector x <- a x + !> \author Immo Huismann + + interface Scale + module procedure Scale_2 + module procedure Scale_3 + module procedure Scale_4 + + module procedure Scale_432 + end interface Scale + + !----------------------------------------------------------------------------- + !> \brief Scales a vector and adds another to it: x <- a * x + y + !> \author Immo Huismann + + interface ScaleAndAddVector + module procedure ScaleAndAddVector_2 + module procedure ScaleAndAddVector_3 + module procedure ScaleAndAddVector_4 + + module procedure ScaleAndAddVector_432 + end interface ScaleAndAddVector + + !----------------------------------------------------------------------------- + !> \brief Adds a scaled vector to a vector: x <- x + a * y + !> \author Immo Huismann + + interface AddScaledVector + module procedure AddScaledVector_2 + module procedure AddScaledVector_3 + module procedure AddScaledVector_4 + + module procedure AddScaledVector_432 + end interface AddScaledVector + + !----------------------------------------------------------------------------- + !> \brief Sets to a scaled vector: x <- a * y + !> \author Immo Huismann + + interface SetToScaledVector + module procedure SetToScaledVector_2 + module procedure SetToScaledVector_3 + module procedure SetToScaledVector_4 + + module procedure SetToScaledVector_432 + end interface SetToScaledVector + + !----------------------------------------------------------------------------- + !> \brief Sets to the elementwise product of two vectors: x_i <- y_i * z_i + !> \author Immo Huismann + + interface SetToMultipliedVectors + module procedure SetToMultipliedVectors_2 + module procedure SetToMultipliedVectors_3 + module procedure SetToMultipliedVectors_4 + + module procedure SetToMultipliedVectors_432 + end interface SetToMultipliedVectors + + !----------------------------------------------------------------------------- + !> \brief Elementwise product of two vectors: x_i <- x_i * y_i + !> \author Immo Huismann + + interface MultiplyWithVector + module procedure MultiplyWithVector_2 + module procedure MultiplyWithVector_3 + module procedure MultiplyWithVector_4 + + module procedure MultiplyWithVector_432 + end interface MultiplyWithVector + + !----------------------------------------------------------------------------- + !> \brief 
Elementwise inversion of vector: x_i <- 1 / x_i
+  !> \author Immo Huismann
+
+  interface PointWiseInversion
+    module procedure PointWiseInversion_2
+    module procedure PointWiseInversion_3
+    module procedure PointWiseInversion_4
+
+    module procedure PointWiseInversion_432
+  end interface PointWiseInversion
+
+  !-----------------------------------------------------------------------------
+  !> \brief Elementwise square root of vector: x_i <- sqrt(x_i)
+  !> \author Immo Huismann
+
+  interface PointWiseSqrt
+    module procedure PointWiseSqrt_2
+    module procedure PointWiseSqrt_3
+    module procedure PointWiseSqrt_4
+
+    module procedure PointWiseSqrt_432
+  end interface PointWiseSqrt
+
+  !-----------------------------------------------------------------------------
+  !> \brief Sets the variable to a given value: x_i <- a
+  !> \author Immo Huismann
+
+  interface SetToValue
+    module procedure SetToValue_2
+    module procedure SetToValue_3
+    module procedure SetToValue_4
+  end interface SetToValue
+
+contains
+
+!===============================================================================
+! Set some vector to zero
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero: x <- 0
+!> \author Immo Huismann
+
+subroutine SetToZero_1(x)
+  real(RDP), intent(out) :: x(:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_1
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero: x <- 0
+!> \author Immo Huismann
+
+subroutine SetToZero_2(x)
+  real(RDP), intent(out) :: x(:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_2
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero: x <- 0
+!> \author Immo Huismann
+
+subroutine SetToZero_3(x)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_3
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero: x <- 0
+!> \author Immo Huismann
+
+subroutine SetToZero_4(x)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_4
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero: x <- 0
+!> \author Immo Huismann
+
+subroutine SetToZero_432(x_4,x_3,x_2)
+  real(RDP), intent(out) :: x_2(:,:)     !< x <- 0
+  real(RDP), intent(out) :: x_3(:,:,:)   !< x <- 0
+  real(RDP), intent(out) :: x_4(:,:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x_2),x_2)
+  call SetToZero_ES(size(x_3),x_3)
+  call SetToZero_ES(size(x_4),x_4)
+
+end subroutine SetToZero_432
+
+!-------------------------------------------------------------------------------
+!> \brief Sets a vector to zero, explicit size
+!> \author Immo Huismann
+
+subroutine SetToZero_ES(n_points,x)
+  integer,   intent(in)  :: n_points    !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< x_i <- 0
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = 0
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToZero_ES
+
+!===============================================================================
+! 
Add scaled vectors + +!------------------------------------------------------------------------------- +!> \brief Adds a scaled vector to the variable x <- x + a * y +!> \author Immo Huismann + +subroutine AddScaledVector_2(x,a,y) + real(RDP), intent(inout) :: x(:,:) !< x <- x + a * y + real(RDP), intent(in) :: a !< scaling factor + real(RDP), intent(in) :: y(:,:) !< array to scale and add to x + + call AddScaledVector_ES(size(x),x,a,y) + +end subroutine AddScaledVector_2 + +!------------------------------------------------------------------------------- +!> \brief Adds a scaled vector to the variable x <- x + a * y +!> \author Immo Huismann + +subroutine AddScaledVector_3(x,a,y) + real(RDP), intent(inout) :: x(:,:,:) !< x <- x + a * y + real(RDP), intent(in) :: a !< scaling factor + real(RDP), intent(in) :: y(:,:,:) !< array to scale and add to x + + call AddScaledVector_ES(size(x),x,a,y) + +end subroutine AddScaledVector_3 + +!------------------------------------------------------------------------------- +!> \brief Adds a scaled vector to the variable x <- x + a * y +!> \author Immo Huismann + +subroutine AddScaledVector_4(x,a,y) + real(RDP), intent(inout) :: x(:,:,:,:) !< x <- x + a * y + real(RDP), intent(in) :: a !< scaling factor + real(RDP), intent(in) :: y(:,:,:,:) !< array to scale and add to x + + call AddScaledVector_ES(size(x),x,a,y) + +end subroutine AddScaledVector_4 + +!------------------------------------------------------------------------------- +!> \brief Adds a scaled vector to the variable x <- x + a * y +!> \author Immo Huismann + +subroutine AddScaledVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2) + real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- x + a * y + real(RDP), intent(inout) :: x_3(:,:,:) !< x <- x + a * y + real(RDP), intent(inout) :: x_2(:,:) !< x <- x + a * y + real(RDP), intent(in) :: a !< scaling factor + real(RDP), intent(in) :: y_4(:,:,:,:) !< array to scale and add to x + real(RDP), intent(in) :: y_3(:,:,:) !< array to scale and add to x + real(RDP), intent(in) :: y_2(:,:) !< array to scale and add to x + + call AddScaledVector_ES(size(x_4),x_4,a,y_4) + call AddScaledVector_ES(size(x_3),x_3,a,y_3) + call AddScaledVector_ES(size(x_2),x_2,a,y_2) + +end subroutine AddScaledVector_432 + +!------------------------------------------------------------------------------- +!> \brief Adds a scaled vector to the variable +!> \author Immo Huismann + +subroutine AddScaledVector_ES(n_points,x,a,y) + integer, intent(in) :: n_points !< number of points to work on + real(RDP), intent(inout) :: x(n_points) !< x_i <- x_i + a * y_i + real(RDP), intent(in) :: a !< scaling factor + real(RDP), intent(in) :: y(n_points) !< array to scale and add to x + + integer :: i + + !$acc data present(x,y) + !$acc parallel async(ACC_EXEC_QUEUE) + !$acc loop + !$omp do private(i) + do i = 1, n_points + x(i) = x(i) + a * y(i) + end do + !$omp end do + !$acc end loop + !$acc end parallel + !$acc end data + +end subroutine AddScaledVector_ES + +!=============================================================================== +! 
Scales a vector
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector x <- a * x
+!> \author Immo Huismann
+
+subroutine Scale_2(x,a)
+  real(RDP), intent(inout) :: x(:,:) !< vector to scale
+  real(RDP), intent(in)    :: a      !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_2
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector x <- a * x
+!> \author Immo Huismann
+
+subroutine Scale_3(x,a)
+  real(RDP), intent(inout) :: x(:,:,:) !< vector to scale
+  real(RDP), intent(in)    :: a        !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_3
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector x <- a * x
+!> \author Immo Huismann
+
+subroutine Scale_4(x,a)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< vector to scale
+  real(RDP), intent(in)    :: a          !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_4
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector x <- a * x
+!> \author Immo Huismann
+
+subroutine Scale_432(x_4,x_3,x_2,a)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * x
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * x
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * x
+  real(RDP), intent(in)    :: a            !< scaling factor
+
+  call Scale_ES(size(x_4),x_4,a)
+  call Scale_ES(size(x_3),x_3,a)
+  call Scale_ES(size(x_2),x_2,a)
+
+end subroutine Scale_432
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector: x <- a * x, explicit size
+!> \author Immo Huismann
+
+subroutine Scale_ES(n_points,x,a)
+  integer,   intent(in)    :: n_points    !< number of points in the vector
+  real(RDP), intent(inout) :: x(n_points) !< vector to scale
+  real(RDP), intent(in)    :: a           !< scaling factor
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = a * x(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine Scale_ES
+
+!===============================================================================
+! 
Scale and add vectors
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector and adds another one to it x <- a * x + y
+!> \author Immo Huismann
+
+subroutine ScaleAndAddVector_2(x,a,y)
+  real(RDP), intent(inout) :: x(:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a      !< scaling factor
+  real(RDP), intent(in)    :: y(:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector and adds another one to it x <- a * x + y
+!> \author Immo Huismann
+
+subroutine ScaleAndAddVector_3(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a        !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector and adds another one to it x <- a * x + y
+!> \author Immo Huismann
+
+subroutine ScaleAndAddVector_4(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a          !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector and adds another one to it x <- a * x + y
+!> \author Immo Huismann
+
+subroutine ScaleAndAddVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * x + y
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * x + y
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * x + y
+  real(RDP), intent(in)    :: a            !< scaling factor
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< vector to add to x
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< vector to add to x
+  real(RDP), intent(in)    :: y_2(:,:)     !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x_4),x_4,a,y_4)
+  call ScaleAndAddVector_ES(size(x_3),x_3,a,y_3)
+  call ScaleAndAddVector_ES(size(x_2),x_2,a,y_2)
+
+end subroutine ScaleAndAddVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief Scales a vector and adds another one to it (aka daxpy)
+!> \author Immo Huismann
+
+subroutine ScaleAndAddVector_ES(n_points,x,a,y)
+  integer,   intent(in)    :: n_points    !< number of points in the vector
+  real(RDP), intent(inout) :: x(n_points) !< vector to scale and add to
+  real(RDP), intent(in)    :: a           !< scaling factor
+  real(RDP), intent(in)    :: y(n_points) !< vector to add to x
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = a * x(i) + y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine ScaleAndAddVector_ES
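+
+!-------------------------------------------------------------------------------
+!> \brief Illustrative sketch only; this routine and its name are not part of
+!>        the original module. It shows the same update x <- a * x + y composed
+!>        from the Scale and AddScaledVector generics above, at the cost of two
+!>        kernel launches where the fused ScaleAndDddVector variant needs one.
+!>        Like the _ES kernels, it assumes x and y are already present on the
+!>        device.
+
+subroutine ScaleThenAddSketch(x,a,y)
+  real(RDP), intent(inout) :: x(:,:) !< x <- a * x + y
+  real(RDP), intent(in)    :: a      !< scaling factor
+  real(RDP), intent(in)    :: y(:,:) !< vector to add to the scaled x
+
+  call Scale(x,a)               ! x <- a * x        (first kernel launch)
+  call AddScaledVector(x,ONE,y) ! x <- x + ONE * y  (second kernel launch)
+
+end subroutine ScaleThenAddSketch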
+!===============================================================================
+! Set to scaled vectors
+
+!-------------------------------------------------------------------------------
+!> \brief Sets x <- a * y
+!> \author Immo Huismann
+
+subroutine SetToScaledVector_2(x,a,y)
+  real(RDP), intent(out) :: x(:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a      !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief Sets x <- a * y
+!> \author Immo Huismann
+
+subroutine SetToScaledVector_3(x,a,y)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a        !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief Sets x <- a * y
+!> \author Immo Huismann
+
+subroutine SetToScaledVector_4(x,a,y)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief Sets x <- a * y
+!> \author Immo Huismann
+
+subroutine SetToScaledVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * y
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * y
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * y
+  real(RDP), intent(in)    :: a            !< scaling factor for y
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< vector to scale and store in x
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< vector to scale and store in x
+  real(RDP), intent(in)    :: y_2(:,:)     !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x_4),x_4,a,y_4)
+  call SetToScaledVector_ES(size(x_3),x_3,a,y_3)
+  call SetToScaledVector_ES(size(x_2),x_2,a,y_2)
+
+end subroutine SetToScaledVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief Sets x <- a * y - explicit size subroutine
+!> \author Immo Huismann
+
+subroutine SetToScaledVector_ES(n_points,x,a,y)
+  integer,   intent(in)  :: n_points    !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< x_i <- a * y_i
+  real(RDP), intent(in)  :: a           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< vector to scale and store in x
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = a * y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToScaledVector_ES
+
+
+!===============================================================================
+! 
Multiply two vectors, store in first + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- x_i * y_i +!> \author Immo Huismann + +subroutine MultiplyWithVector_2(x,y) + real(RDP), intent(inout) :: x(:,:) !< vector to save into + real(RDP), intent(in) :: y(:,:) !< vector to multiply it with + + call MultiplyWithVector_ES(size(x),x,y) + +end subroutine MultiplyWithVector_2 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- x_i * y_i +!> \author Immo Huismann + +subroutine MultiplyWithVector_3(x,y) + real(RDP), intent(inout) :: x(:,:,:) !< vector to save into + real(RDP), intent(in) :: y(:,:,:) !< vector to multiply it with + + call MultiplyWithVector_ES(size(x),x,y) + +end subroutine MultiplyWithVector_3 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- x_i * y_i +!> \author Immo Huismann + +subroutine MultiplyWithVector_4(x,y) + real(RDP), intent(inout) :: x(:,:,:,:) !< vector to save into + real(RDP), intent(in) :: y(:,:,:,:) !< vector to multiply it with + + call MultiplyWithVector_ES(size(x),x,y) + +end subroutine MultiplyWithVector_4 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- x_i * y_i +!> \author Immo Huismann + +subroutine MultiplyWithVector_432(x_4,x_3,x_2,y_4,y_3,y_2) + real(RDP), intent(inout) :: x_4(:,:,:,:) !< vector to save into + real(RDP), intent(inout) :: x_3(:,:,:) !< vector to save into + real(RDP), intent(inout) :: x_2(:,:) !< vector to save into + real(RDP), intent(in) :: y_4(:,:,:,:) !< vector to multiply it with + real(RDP), intent(in) :: y_3(:,:,:) !< vector to multiply it with + real(RDP), intent(in) :: y_2(:,:) !< vector to multiply it with + + call MultiplyWithVector_ES(size(x_4),x_4,y_4) + call MultiplyWithVector_ES(size(x_3),x_3,y_3) + call MultiplyWithVector_ES(size(x_2),x_2,y_2) + +end subroutine MultiplyWithVector_432 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- x_i * y_i +!> \author Immo Huismann + +subroutine MultiplyWithVector_ES(n_points,x,y) + integer, intent(in) :: n_points !< number of points in the vectors + real(RDP), intent(inout) :: x(n_points) !< first vector, store result in it + real(RDP), intent(in) :: y(n_points) !< second vector, multiply with it + + integer :: i + + !$acc data present(x,y) + !$acc parallel async(ACC_EXEC_QUEUE) + !$acc loop + !$omp do private(i) + do i = 1, n_points + x(i) = x(i) * y(i) + end do + !$omp end do + !$acc end loop + !$acc end parallel + !$acc end data + +end subroutine MultiplyWithVector_ES + +!=============================================================================== +! 
Set a vector to the elementwise product of two other vectors + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- y_i * z_i +!> \author Immo Huismann + +subroutine SetToMultipliedVectors_2(x,y,z) + real(RDP), intent(out) :: x(:,:) !< array to store in + real(RDP), intent(in) :: y(:,:) + real(RDP), intent(in) :: z(:,:) + + call SetToMultipliedVectors_ES(size(x),x,y,z) + +end subroutine SetToMultipliedVectors_2 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- y_i * z_i +!> \author Immo Huismann + +subroutine SetToMultipliedVectors_3(x,y,z) + real(RDP), intent(out) :: x(:,:,:) !< array to store in + real(RDP), intent(in) :: y(:,:,:) + real(RDP), intent(in) :: z(:,:,:) + + call SetToMultipliedVectors_ES(size(x),x,y,z) + +end subroutine SetToMultipliedVectors_3 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- y_i * z_i +!> \author Immo Huismann + +subroutine SetToMultipliedVectors_4(x,y,z) + real(RDP), intent(out) :: x(:,:,:,:) !< array to store in + real(RDP), intent(in) :: y(:,:,:,:) + real(RDP), intent(in) :: z(:,:,:,:) + + call SetToMultipliedVectors_ES(size(x),x,y,z) + +end subroutine SetToMultipliedVectors_4 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- y_i * z_i +!> \author Immo Huismann + +subroutine SetToMultipliedVectors_432(x_4,x_3,x_2, y_4,y_3,y_2, z_4,z_3,z_2) + real(RDP), intent(out) :: x_4(:,:,:,:) !< x_i <- y_i * z_i + real(RDP), intent(out) :: x_3(:,:,:) !< x_i <- y_i * z_i + real(RDP), intent(out) :: x_2(:,:) !< x_i <- y_i * z_i + real(RDP), intent(in) :: y_4(:,:,:,:) + real(RDP), intent(in) :: y_3(:,:,:) + real(RDP), intent(in) :: y_2(:,:) + real(RDP), intent(in) :: z_4(:,:,:,:) + real(RDP), intent(in) :: z_3(:,:,:) + real(RDP), intent(in) :: z_2(:,:) + + call SetToMultipliedVectors_ES(size(x_4),x_4,y_4,z_4) + call SetToMultipliedVectors_ES(size(x_3),x_3,y_3,z_3) + call SetToMultipliedVectors_ES(size(x_2),x_2,y_2,z_2) + +end subroutine SetToMultipliedVectors_432 + +!------------------------------------------------------------------------------- +!> \brief Pointwise multiplication x_i <- y_i * z_i - explicit size routine +!> \author Immo Huismann + +subroutine SetToMultipliedVectors_ES(n_points,x,y,z) + integer, intent(in) :: n_points !< number of points to work on + real(RDP), intent(out) :: x(n_points) !< x_i <- y_i * z_i + real(RDP), intent(in) :: y(n_points) !< First array in multiplication + real(RDP), intent(in) :: z(n_points) !< Second array in multiplication + + integer :: i + + !$acc data present(x,y,z) + !$acc parallel async(ACC_EXEC_QUEUE) + !$acc loop + !$omp do private(i) + do i = 1, n_points + x(i) = y(i) * z(i) + end do + !$omp end do + !$acc end loop + !$acc end parallel + !$acc end data + +end subroutine SetToMultipliedVectors_ES + +!=============================================================================== +! 
Copy a vector
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a given vector, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_2(x,y)
+  real(RDP), intent(out) :: x(:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a given vector, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_3(x,y)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a given vector, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_4(x,y)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief Copies a vector with n entries, x <- y - explicit size routine
+!> \author Immo Huismann
+
+subroutine SetToVector_ES(n_point,x,y)
+  integer,   intent(in)  :: n_point    !< number of points to work on
+  real(RDP), intent(out) :: x(n_point) !< x_i <- y_i
+  real(RDP), intent(in)  :: y(n_point) !< vector to set x to
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_point
+    x(i) = y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToVector_ES
+
+!===============================================================================
+! Pointwise inversion
+
+!-------------------------------------------------------------------------------
+!> \brief Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_2(x)
+  real(RDP), intent(inout) :: x(:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_2
+
+!-------------------------------------------------------------------------------
+!> \brief Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_3(x)
+  real(RDP), intent(inout) :: x(:,:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_3
+
+!-------------------------------------------------------------------------------
+!> \brief Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_4(x)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_4
+
+!-------------------------------------------------------------------------------
+!> \brief Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_432(x_4,x_3,x_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x_i <- 1 / x_i
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x_i <- 1 / x_i
+  real(RDP), intent(inout) :: x_2(:,:)     !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x_4),x_4)
+  call PointWiseInversion_ES(size(x_3),x_3)
+  call PointWiseInversion_ES(size(x_2),x_2)
+
+end subroutine PointWiseInversion_432
+
+!-------------------------------------------------------------------------------
+!> \brief Inverts every element of an array - explicit size version
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_ES(n_point,x)
+  integer,   intent(in)    :: n_point    !< number of points to process
+  real(RDP), intent(inout) :: x(n_point) !< x_i <- 1 / x_i
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_point
+    x(i) = ONE / x(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine PointWiseInversion_ES
+
+!===============================================================================
+! Sqrt
+
+!-------------------------------------------------------------------------------
+!> \brief Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_2(x)
+  real(RDP), intent(inout) :: x(:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_2
+
+!-------------------------------------------------------------------------------
+!> \brief Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_3(x)
+  real(RDP), intent(inout) :: x(:,:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_3
+
+!-------------------------------------------------------------------------------
+!> \brief Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_4(x)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_4
+
+!-------------------------------------------------------------------------------
+!> \brief Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_432(x_4,x_3,x_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x_i <- sqrt(x_i)
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x_i <- sqrt(x_i)
+  real(RDP), intent(inout) :: x_2(:,:)     !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x_4),x_4)
+  call PointWiseSqrt_ES(size(x_3),x_3)
+  call PointWiseSqrt_ES(size(x_2),x_2)
+
+end subroutine PointWiseSqrt_432
+
+!-------------------------------------------------------------------------------
+!> \brief Square root of every element of an array - explicit size version
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_ES(n_point,x)
+  integer,   intent(in)    :: n_point    !< number of points to work on
+  real(RDP), intent(inout) :: x(n_point) !< x_i <- sqrt(x_i)
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_point
+    x(i) = sqrt(x(i))
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine PointWiseSqrt_ES
+
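+! Usage sketch for the pointwise routines (hypothetical caller; the shape and
+! starting value are illustrative only):
+!
+!   real(RDP) :: w(4,4,4)
+!   w = 4.0_RDP
+!   call PointWiseSqrt_3(w)      ! w_i <- sqrt(w_i) = 2
+!   call PointWiseInversion_3(w) ! w_i <- 1 / w_i   = 0.5
+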
+!===============================================================================
+! x <- a * y + b * z
+
+!-------------------------------------------------------------------------------
+!> \brief Explicit size routine for x <- a * y + b * z
+!> \author Immo Huismann
+
+subroutine SetToTwoScaledVectors_ES(n_points,x,a,y,b,z)
+  integer,   intent(in)  :: n_points    !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< array to set
+  real(RDP), intent(in)  :: a           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< first array to scale and add to x
+  real(RDP), intent(in)  :: b           !< scaling factor for z
+  real(RDP), intent(in)  :: z(n_points) !< second array to scale and add to x
+
+  integer :: i
+
+  !$acc data present(x,y,z)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = a * y(i) + b * z(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToTwoScaledVectors_ES
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a sum of scaled vectors w <- a * x + b * y + c * z
+!> \author Immo Huismann
+
+subroutine SetToThreeScaledVectors(w,a,x,b,y,c,z)
+  real(RDP), intent(out) :: w(:,:,:,:) !< array to set
+  real(RDP), intent(in)  :: a          !< scaling factor for x
+  real(RDP), intent(in)  :: x(:,:,:,:) !< first array to scale and add to w
+  real(RDP), intent(in)  :: b          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< second array to scale and add to w
+  real(RDP), intent(in)  :: c          !< scaling factor for z
+  real(RDP), intent(in)  :: z(:,:,:,:) !< third array to scale and add to w
+
+  call SetToThreeScaledVectors_ES(size(x),w,a,x,b,y,c,z)
+
+end subroutine SetToThreeScaledVectors
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a sum of scaled vectors w <- a * x + b * y + c * z
+!> \author Immo Huismann
+
+subroutine SetToThreeScaledVectors_ES(n_points,w,a,x,b,y,c,z)
+  integer,   intent(in)  :: n_points    !< number of points to process
+  real(RDP), intent(out) :: w(n_points) !< array to set
+  real(RDP), intent(in)  :: a           !< scaling factor for x
+  real(RDP), intent(in)  :: x(n_points) !< first array to scale and add to w
+  real(RDP), intent(in)  :: b           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< second array to scale and add to w
+  real(RDP), intent(in)  :: c           !< scaling factor for z
+  real(RDP), intent(in)  :: z(n_points) !< third array to scale and add to w
+
+  integer :: i
+
+  !$acc data present(w,x,y,z)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    w(i) = a * x(i) + b * y(i) + c * z(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToThreeScaledVectors_ES
+
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a sum of scaled vectors x <- a * y + b * z
+!> \author Immo Huismann
+
+subroutine SetToTwoScaledVectors(x,a,y,b,z)
+  real(RDP), intent(out) :: x(:,:,:,:) !< array to set
+  real(RDP), intent(in)  :: a          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< first array to scale and add to x
+  real(RDP), intent(in)  :: b          !< scaling factor for z
+  real(RDP), intent(in)  :: z(:,:,:,:) !< second array to scale and add to x
+
+  call SetToTwoScaledVectors_ES(size(x),x,a,y,b,z)
+
+end subroutine SetToTwoScaledVectors
+
+!-------------------------------------------------------------------------------
+!> \brief Sets the variable to a given value: x_i <- a
+!> \author Immo Huismann
+
+subroutine SetToValue_2(x,a)
+  real(RDP), intent(out) :: x(:,:)
+  real(RDP), intent(in) 
:: a + + call SetToValue_ES(size(x),x,a) + +end subroutine SetToValue_2 + +!------------------------------------------------------------------------------- +!> \brief Sets the variable to a given value: x_i <- a +!> \author Immo Huismann + +subroutine SetToValue_3(x,a) + real(RDP), intent(out) :: x(:,:,:) + real(RDP), intent(in) :: a + + call SetToValue_ES(size(x),x,a) + +end subroutine SetToValue_3 + +!------------------------------------------------------------------------------- +!> \brief Sets the variable to a given value: x_i <- a +!> \author Immo Huismann + +subroutine SetToValue_4(x,a) + real(RDP), intent(out) :: x(:,:,:,:) + real(RDP), intent(in) :: a + + call SetToValue_ES(size(x),x,a) + +end subroutine SetToValue_4 + +!------------------------------------------------------------------------------- +!> \brief Sets containing n entries to a, x_i <- a +!> \author Immo Huismann + +subroutine SetToValue_ES(n_point,x,a) + integer, intent(in) :: n_point !< number of points in array + real(RDP), intent(out) :: x(n_point) !< array to set to a + real(RDP), intent(in) :: a !< value to set array to + + integer :: i + + !$acc data present(x) + !$acc parallel async(ACC_EXEC_QUEUE) + !$acc loop + !$omp do private(i) + do i = 1, n_point + x(i) = a + end do + !$omp end do + !$acc end loop + !$acc end parallel + !$acc end data + +end subroutine SetToValue_ES + +!=============================================================================== + +end module Array_Operations diff --git a/Parser/test-data/specht/condensed_primary_part.f b/Parser/test-data/specht/condensed_primary_part.f new file mode 100644 index 0000000..6b16812 --- /dev/null +++ b/Parser/test-data/specht/condensed_primary_part.f @@ -0,0 +1,930 @@ +!> \file condensed_primary_part.f +!> \brief Primary part for the condensed operator +!> \author Immo Huismann +!> \date 2015/02/17 +!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany +!=============================================================================== + +module Condensed_Primary_Part + use Kind_parameters, only: RDP + use Constants, only: HALF + use Standard_Operators_1D, only: StandardOperators1D + use Factored_Matrix_Products_2D, only: AddFactoredMatrixProduct2D + + use ACC_Parameters, only: ACC_EXEC_QUEUE + use Element_Boundary_Parameters, only: N_VERTICES, N_FACES, N_EDGES + use Geometry_Coefficients, only: GetHelmholtzCoefficients + implicit none + private + + public :: AddHelmholtzOpCondensedPrimary + + !----------------------------------------------------------------------------- + !> \brief Adds the condensed primary part to the result + !> \author Immo Huismann + + interface AddHelmholtzOpCondensedPrimary + module procedure AddHelmholtzOpCondensedPrimary_ClassicInterface + module procedure AddHelmholtzOpCondensedPrimary_ShortInterface + end interface AddHelmholtzOpCondensedPrimary + +contains + +!------------------------------------------------------------------------------- +!> \brief Calculates the primary part for the condensed system, adds it to r +!> \author Immo Huismann + +subroutine AddHelmholtzOpCondensedPrimary_ClassicInterface & + (op,h,lambda,u_f,u_e,u_v,r_f,r_e,r_v,tmp_f_1,tmp_f_2) + class(StandardOperators1D), intent(in) :: op !< 1D operators + real(RDP), intent(in) :: h(:,:) !< element width + real(RDP), intent(in) :: lambda !< Helmholtz parameter + + real(RDP), intent(in) :: u_f(:,:,:,:) !< u values on faces + real(RDP), intent(in) :: u_e( :,:,:) !< u values on edges + real(RDP), intent(in) :: u_v( :,:) !< u values on 
vertices + + real(RDP), intent(inout) :: r_f(:,:,:,:) !< r values on faces + real(RDP), intent(inout) :: r_e( :,:,:) !< r values on edges + real(RDP), intent(inout) :: r_v( :,:) !< r values on vertices + + real(RDP), intent(out) :: tmp_f_1(:,:,:,:) !< scratch values on faces + real(RDP), intent(out) :: tmp_f_2(:,:,:,:) !< scratch values on faces + + integer :: n ! number of internal points on one edge + integer :: n_element ! number of elements + + real(RDP), allocatable, save :: d(:,:) ! element-wise Helmholtz coefficients + + !............................................................................. + ! Initialization + + ! Get the problem size + n = size(r_f,1) + n_element = size(r_f,4) + + ! allocate and set the Helmholtz coefficients + call GetHelmholtzCoefficients(lambda, h, d) + + !$acc data copyin(d) async(ACC_EXEC_QUEUE) + + ! Add the primary parts ...................................................... + call HelmholtzOpPrimFaceToFace (n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2) + call HelmholtzOpPrimEdgeToFace (n,n_element,op,d,u_e,r_f) + call HelmholtzOpPrimFaceToEdge (n,n_element,op,d,u_f,r_e) + call HelmholtzOpPrimEdgeToEdge (n,n_element,op,d,u_e,r_e) + call HelmholtzOpPrimVertexToEdge (n,n_element,op,d,u_v,r_e) + call HelmholtzOpPrimEdgeToVertex (n,n_element,op,d,u_e,r_v) + call HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u_v,r_v) + + !$acc end data + + ! Deallocate shared variables ................................................ + !$omp single + deallocate(d) + !$omp end single + +end subroutine AddHelmholtzOpCondensedPrimary_ClassicInterface + +!------------------------------------------------------------------------------- +!> \brief Calculates the primary part for the condensed system, adds it to r +!> \author Immo Huismann +!> +!> \details +!> This version gets the helmholtz coefficients directly and won't calculate +!> them every iteration + +subroutine AddHelmholtzOpCondensedPrimary_ShortInterface & + (op,d,u_f,u_e,u_v,r_f,r_e,r_v,tmp_f_1,tmp_f_2) + class(StandardOperators1D), intent(in) :: op !< 1D operators + real(RDP), intent(in) :: d(0:,:) !< Helmholtz coefficients + + real(RDP), intent(in) :: u_f(:,:,:,:) !< u values on faces + real(RDP), intent(in) :: u_e( :,:,:) !< u values on edges + real(RDP), intent(in) :: u_v( :,:) !< u values on vertices + + real(RDP), intent(inout) :: r_f(:,:,:,:) !< r values on faces + real(RDP), intent(inout) :: r_e( :,:,:) !< r values on edges + real(RDP), intent(inout) :: r_v( :,:) !< r values on vertices + + real(RDP), intent(out) :: tmp_f_1(:,:,:,:) !< scratch values on faces + real(RDP), intent(out) :: tmp_f_2(:,:,:,:) !< scratch values on faces + + integer :: n ! number of internal points on one edge + integer :: n_element ! number of elements + + !............................................................................. + ! Initialization + + ! Get the problem size + n = size(r_f,1) + n_element = size(r_f,4) + + ! Add the primary parts ...................................................... + call HelmholtzOpPrimFaceToFace (n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2) + call HelmholtzOpPrimEdgeToFace (n,n_element,op,d,u_e,r_f) + call HelmholtzOpPrimFaceToEdge (n,n_element,op,d,u_f,r_e) + call HelmholtzOpPrimEdgeToEdge (n,n_element,op,d,u_e,r_e) + call HelmholtzOpPrimVertexToEdge (n,n_element,op,d,u_v,r_e) + call HelmholtzOpPrimEdgeToVertex (n,n_element,op,d,u_e,r_v) + call HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u_v,r_v) + +end subroutine AddHelmholtzOpCondensedPrimary_ShortInterface + +! 
Bound to Bound operations ====================================================
+
+!-------------------------------------------------------------------------------
+!> \brief Calculates vertex to vertex interaction in the Helmholtz operator
+!> \author Immo Huismann
+!>
+!> \details
+!> Mind that the properties
+!>
+!>   L_{0p} == L_{p0}
+!>   L_{00} == L_{pp}
+!>   w_{0}  == w_{p}
+!>
+!> are being exploited to shorten all terms.
+!> These properties hold for the GLL and GL points (as they do for all
+!> symmetric node distributions), but for other basis choices they might not.
+!>
+!> Explicit size is used for easier compilation with OpenACC.
+
+subroutine HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u,r)
+  integer,                    intent(in) :: n         !< polynomial order - 1
+  integer,                    intent(in) :: n_element !< number of elements
+  class(StandardOperators1D), intent(in) :: op        !< 1D operators
+
+  real(RDP), intent(in)    :: d(0:3,n_element)        !< element helmholtz coef.
+  real(RDP), intent(in)    :: u(N_VERTICES,n_element) !< variable u on vertices
+  real(RDP), intent(inout) :: r(N_VERTICES,n_element) !< r = H u, r on vertices
+
+  real(RDP) :: laplace_factor_00, laplace_factor_0p, mass_factor
+  real(RDP) :: L_00, L_0p, w_0          ! extracts from the system matrices
+  real(RDP) :: d_xy_00,d_xz_00, d_yz_00 ! geometry coefficients from 0 to 0
+  real(RDP) :: d_xy_0p,d_xz_0p, d_yz_0p ! geometry coefficients from 0 to p
+  real(RDP) :: d_xyz                    ! coefficient from mass term
+  integer   :: e                        ! element loop index
+
+  ! Calculate the factors common to each element ...............................
+  ! Mind that the properties
+  !
+  !   L_{0p} == L_{p0}
+  !   L_{00} == L_{pp}
+  !   w_{0}  == w_{p}
+  !
+  ! are being exploited to shorten all terms
+
+  L_00 = op%L(0,0  )
+  L_0p = op%L(0,n+1)
+  w_0  = op%w(0)
+
+  mass_factor       = w_0 * w_0 * w_0
+  laplace_factor_00 = w_0 * w_0 * L_00
+  laplace_factor_0p = w_0 * w_0 * L_0p
+
+  ! vertex to vertex interaction ...............................................
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u,r)
+  !$acc loop
+  !$omp do private(e,d_xyz,d_xy_00,d_xy_0p,d_xz_00,d_xz_0p,d_yz_00,d_yz_0p)
+  do e = 1, n_element
+    ! set the metric coefficients
+    d_xyz = d(0,e) * mass_factor
+
+    d_xy_00 = d(3,e) * laplace_factor_00
+    d_xy_0p = d(3,e) * laplace_factor_0p
+
+    d_xz_00 = d(2,e) * laplace_factor_00
+    d_xz_0p = d(2,e) * laplace_factor_0p
+
+    d_yz_00 = d(1,e) * laplace_factor_00
+    d_yz_0p = d(1,e) * laplace_factor_0p
+
+    ! r(0,0,0,e)
+    ! depends on self u(0,0,0,e) -- u(1,e)
+    !            + x  u(p,0,0,e) -- u(2,e)
+    !            + y  u(0,p,0,e) -- u(3,e)
+    !            + z  u(0,0,p,e) -- u(5,e)
+    r(1,e) = r(1,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(1,e) &
+      + (                            d_yz_0p) * u(2,e) &
+      + (                  d_xz_0p          ) * u(3,e) &
+      + (        d_xy_0p                    ) * u(5,e)
+
+    ! r(p,0,0,e)
+    ! depends on self u(p,0,0,e) -- u(2,e)
+    !            - x  u(0,0,0,e) -- u(1,e)
+    !            + y  u(p,p,0,e) -- u(4,e)
+    !            + z  u(p,0,p,e) -- u(6,e)
+    r(2,e) = r(2,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(2,e) &
+      + (                            d_yz_0p) * u(1,e) &
+      + (                  d_xz_0p          ) * u(4,e) &
+      + (        d_xy_0p                    ) * u(6,e)
+
+    ! r(0,p,0,e)
+    ! depends on self u(0,p,0,e) -- u(3,e)
+    !            + x  u(p,p,0,e) -- u(4,e)
+    !            - y  u(0,0,0,e) -- u(1,e)
+    !            + z  u(0,p,p,e) -- u(7,e)
+    r(3,e) = r(3,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(3,e) &
+      + (                            d_yz_0p) * u(4,e) &
+      + (                  d_xz_0p          ) * u(1,e) &
+      + (        d_xy_0p                    ) * u(7,e)
+
+    ! r(p,p,0,e)
+    ! depends on self u(p,p,0,e) -- u(4,e)
+    !            - x  u(0,p,0,e) -- u(3,e)
+    !            - y  u(p,0,0,e) -- u(2,e)
+    !            + z  u(p,p,p,e) -- u(8,e)
+    r(4,e) = r(4,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(4,e) &
+      + (                            d_yz_0p) * u(3,e) &
+      + (                  d_xz_0p          ) * u(2,e) &
+      + (        d_xy_0p                    ) * u(8,e)
+
+    ! r(0,0,p,e)
+    ! depends on self u(0,0,p,e) -- u(5,e)
+    !            + x  u(p,0,p,e) -- u(6,e)
+    !            + y  u(0,p,p,e) -- u(7,e)
+    !            - z  u(0,0,0,e) -- u(1,e)
+    r(5,e) = r(5,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(5,e) &
+      + (                            d_yz_0p) * u(6,e) &
+      + (                  d_xz_0p          ) * u(7,e) &
+      + (        d_xy_0p                    ) * u(1,e)
+
+    ! r(p,0,p,e)
+    ! depends on self u(p,0,p,e) -- u(6,e)
+    !            - x  u(0,0,p,e) -- u(5,e)
+    !            + y  u(p,p,p,e) -- u(8,e)
+    !            - z  u(p,0,0,e) -- u(2,e)
+    r(6,e) = r(6,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(6,e) &
+      + (                            d_yz_0p) * u(5,e) &
+      + (                  d_xz_0p          ) * u(8,e) &
+      + (        d_xy_0p                    ) * u(2,e)
+
+    ! r(0,p,p,e)
+    ! depends on self u(0,p,p,e) -- u(7,e)
+    !            + x  u(p,p,p,e) -- u(8,e)
+    !            - y  u(0,0,p,e) -- u(5,e)
+    !            - z  u(0,p,0,e) -- u(3,e)
+    r(7,e) = r(7,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(7,e) &
+      + (                            d_yz_0p) * u(8,e) &
+      + (                  d_xz_0p          ) * u(5,e) &
+      + (        d_xy_0p                    ) * u(3,e)
+
+    ! r(p,p,p,e)
+    ! depends on self u(p,p,p,e) -- u(8,e)
+    !            - x  u(0,p,p,e) -- u(7,e)
+    !            - y  u(p,0,p,e) -- u(6,e)
+    !            - z  u(p,p,0,e) -- u(4,e)
+    r(8,e) = r(8,e)                                    &
+      + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(8,e) &
+      + (                            d_yz_0p) * u(7,e) &
+      + (                  d_xz_0p          ) * u(6,e) &
+      + (        d_xy_0p                    ) * u(4,e)
+
+  end do
+  !$omp end do
+  !$acc end parallel
+
+
+end subroutine HelmholtzOpPrimVertexToVertex
+
+!-------------------------------------------------------------------------------
+!> \brief Vertex to edge interaction in the Helmholtz operator r = H u
+!> \author Immo Huismann
+
+subroutine HelmholtzOpPrimVertexToEdge(n,n_element,op,d,u_v,r_e)
+  integer,                    intent(in) :: n         !< points per inner line
+  integer,                    intent(in) :: n_element !< number of elements
+  class(StandardOperators1D), intent(in) :: op        !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)            !< helmholtz factors
+  real(RDP), intent(in)    :: u_v(  N_VERTICES,n_element) !< variable u
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,   n_element) !< r = H u
+
+  real(RDP) :: mat_i0(n), mat_ip(n) ! matrices to use
+  real(RDP) :: d_1_0, d_1_p, d_2_0, d_2_p, d_3_0, d_3_p ! matrix entry * metric
+  integer   :: i, e ! loop indices
+
+  ! Initialization .............................................................
+
+  ! Set the rows of the matrices
+  mat_i0 = op%L(1:n,0  ) * op%w(0) * op%w(0)
+  mat_ip = op%L(1:n,n+1) * op%w(0) * op%w(0)
+
+  ! Computation ................................................................
+
+  ! Scalar to vector -> a perfectly nested loop
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_e)
+  !$acc loop collapse(2) private(d_1_0,d_1_p,d_2_0,d_2_p,d_3_0,d_3_p)
+  !$omp do private(i,e, d_1_0,d_1_p,d_2_0,d_2_p,d_3_0,d_3_p)
+  do e = 1, n_element
+    do i = 1, n
+      ! Compute factors for point and element
+      d_1_0 = d(1,e) * mat_i0(i); d_1_p = d(1,e) * mat_ip(i)
+      d_2_0 = d(2,e) * mat_i0(i); d_2_p = d(2,e) * mat_ip(i)
+      d_3_0 = d(3,e) * mat_i0(i); d_3_p = d(3,e) * mat_ip(i)
+
+      !...........................................................................
+      ! x_1 edges
+      r_e(i, 1,e) = r_e(i, 1,e) + d_1_0 * u_v(1,e) + d_1_p * u_v(2,e)
+      r_e(i, 2,e) = r_e(i, 2,e) + d_1_0 * u_v(3,e) + d_1_p * u_v(4,e)
+      r_e(i, 3,e) = r_e(i, 3,e) + d_1_0 * u_v(5,e) + d_1_p * u_v(6,e)
+      r_e(i, 4,e) = r_e(i, 4,e) + d_1_0 * u_v(7,e) + d_1_p * u_v(8,e)
+
+      !...........................................................................
+      ! 
x_2 edges + r_e(i, 5,e) = r_e(i, 5,e) + d_2_0 * u_v(1,e) + d_2_p * u_v(3,e) + r_e(i, 6,e) = r_e(i, 6,e) + d_2_0 * u_v(2,e) + d_2_p * u_v(4,e) + r_e(i, 7,e) = r_e(i, 7,e) + d_2_0 * u_v(5,e) + d_2_p * u_v(7,e) + r_e(i, 8,e) = r_e(i, 8,e) + d_2_0 * u_v(6,e) + d_2_p * u_v(8,e) + + !........................................................................... + ! x_3 edges + r_e(i, 9,e) = r_e(i, 9,e) + d_3_0 * u_v(1,e) + d_3_p * u_v(5,e) + r_e(i,10,e) = r_e(i,10,e) + d_3_0 * u_v(2,e) + d_3_p * u_v(6,e) + r_e(i,11,e) = r_e(i,11,e) + d_3_0 * u_v(3,e) + d_3_p * u_v(7,e) + r_e(i,12,e) = r_e(i,12,e) + d_3_0 * u_v(4,e) + d_3_p * u_v(8,e) + + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine HelmholtzOpPrimVertexToEdge + +!------------------------------------------------------------------------------- +!> \brief Edge to vertex interaction in the Helmholtz operator r = H u +!> \author Immo Huismann +!> +!> \todo +!> * maybe think about reimplementing it with a loop over the edges +!> * could have fewer loads, though that is probably wasted on this routine + +subroutine HelmholtzOpPrimEdgeToVertex(n,n_element,op,d,u_e,r_v) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of elements + class(StandardOperators1D), intent(in) :: op !< 1D operators + real(RDP), intent(in) :: d(0:3,n_element) !< helmholtz factors + real(RDP), intent(in) :: u_e(n,N_EDGES, n_element) !< variable u on edges + real(RDP), intent(inout) :: r_v( N_VERTICES,n_element) !< r = H u on vertices + + real(RDP) :: mat_i0(n), mat_ip(n) ! operators + integer :: i, e + + ! Initialization ............................................................. + ! Set the rows of the matrices + mat_i0 = op%L(1:n,0 ) * op%w(0) * op%w(0) + mat_ip = op%L(1:n,n+1) * op%w(0) * op%w(0) + + ! Computation ................................................................ 
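+  ! Each vertex accumulates contributions from the three edges meeting in it;
+  ! mat_i0 and mat_ip select the matrix row for the near and the far end of
+  ! the respective edge.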
+
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_v)
+  !$acc loop
+  !$omp do private(i,e)
+  do e = 1, n_element
+    do i = 1, n
+      r_v(1,e) = r_v(1,e) + mat_i0(i) * d(1,e) * u_e(i, 1,e) &
+                          + mat_i0(i) * d(2,e) * u_e(i, 5,e) &
+                          + mat_i0(i) * d(3,e) * u_e(i, 9,e)
+
+      r_v(2,e) = r_v(2,e) + mat_ip(i) * d(1,e) * u_e(i, 1,e) &
+                          + mat_i0(i) * d(2,e) * u_e(i, 6,e) &
+                          + mat_i0(i) * d(3,e) * u_e(i,10,e)
+
+      r_v(3,e) = r_v(3,e) + mat_i0(i) * d(1,e) * u_e(i, 2,e) &
+                          + mat_ip(i) * d(2,e) * u_e(i, 5,e) &
+                          + mat_i0(i) * d(3,e) * u_e(i,11,e)
+
+      r_v(4,e) = r_v(4,e) + mat_ip(i) * d(1,e) * u_e(i, 2,e) &
+                          + mat_ip(i) * d(2,e) * u_e(i, 6,e) &
+                          + mat_i0(i) * d(3,e) * u_e(i,12,e)
+
+      r_v(5,e) = r_v(5,e) + mat_i0(i) * d(1,e) * u_e(i, 3,e) &
+                          + mat_i0(i) * d(2,e) * u_e(i, 7,e) &
+                          + mat_ip(i) * d(3,e) * u_e(i, 9,e)
+
+      r_v(6,e) = r_v(6,e) + mat_ip(i) * d(1,e) * u_e(i, 3,e) &
+                          + mat_i0(i) * d(2,e) * u_e(i, 8,e) &
+                          + mat_ip(i) * d(3,e) * u_e(i,10,e)
+
+      r_v(7,e) = r_v(7,e) + mat_i0(i) * d(1,e) * u_e(i, 4,e) &
+                          + mat_ip(i) * d(2,e) * u_e(i, 7,e) &
+                          + mat_ip(i) * d(3,e) * u_e(i,11,e)
+
+      r_v(8,e) = r_v(8,e) + mat_ip(i) * d(1,e) * u_e(i, 4,e) &
+                          + mat_ip(i) * d(2,e) * u_e(i, 8,e) &
+                          + mat_ip(i) * d(3,e) * u_e(i,12,e)
+
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimEdgeToVertex
+
+!-------------------------------------------------------------------------------
+!> \brief Edge to edge interaction in the Helmholtz operator r = H u
+!> \author Immo Huismann
+
+subroutine HelmholtzOpPrimEdgeToEdge(n,n_element,op,d,u_e,r_e)
+  integer,                    intent(in) :: n         !< points per inner line
+  integer,                    intent(in) :: n_element !< number of elements
+  class(StandardOperators1D), intent(in) :: op        !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)         !< element helmholtz coeffs
+  real(RDP), intent(in)    :: u_e(n,N_EDGES,n_element) !< variable u on edges
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,n_element) !< r = H u on edges
+
+  real(RDP) :: w_0          ! first coefficient in mass matrix
+  real(RDP) :: L_00         ! coefficient (0,0) in laplace matrix
+  real(RDP) :: L_0p         ! coefficient (0,p) in laplace matrix
+  real(RDP) :: w_I(n)       ! inner part of mass matrix
+  real(RDP) :: L_I_w_w(n,n) ! inner part of laplace matrix * w_0^2
+  real(RDP) :: factors(3)   ! metric factors * matrices
+
+  integer :: i,j,e
+
+  ! initialization .............................................................
+
+  ! Set the operators
+  w_0     = op%w(0)
+  L_00    = op%L(0,0  )
+  L_0p    = op%L(0,n+1)
+  w_I     = op%w(1:n)
+  L_I_w_w = op%L(1:n,1:n) * w_0 * w_0
+
+  ! Laplace term in edge direction .............................................
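+  ! Along its own direction each edge is treated with the dense inner operator
+  ! L_I_w_w = w_0^2 * L(1:n,1:n), scaled by the metric factor d(1..3,e) of that
+  ! direction; the triple loop below applies this matrix product edge by edge.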
+ + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e) + !$acc loop collapse(2) + !$omp do private(i,j,e) + do e = 1, n_element + do i = 1, n + do j = 1, n + r_e(i, 1,e) = r_e(i, 1,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 1,e) + r_e(i, 2,e) = r_e(i, 2,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 2,e) + r_e(i, 3,e) = r_e(i, 3,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 3,e) + r_e(i, 4,e) = r_e(i, 4,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 4,e) + + r_e(i, 5,e) = r_e(i, 5,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 5,e) + r_e(i, 6,e) = r_e(i, 6,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 6,e) + r_e(i, 7,e) = r_e(i, 7,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 7,e) + r_e(i, 8,e) = r_e(i, 8,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 8,e) + + r_e(i, 9,e) = r_e(i, 9,e) + d(3,e) * L_I_w_w(i,j) * u_e(j, 9,e) + r_e(i,10,e) = r_e(i,10,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,10,e) + r_e(i,11,e) = r_e(i,11,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,11,e) + r_e(i,12,e) = r_e(i,12,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,12,e) + end do + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + + ! Laplace term to other edges ................................................ + + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e) + !$acc loop collapse(2) private(factors) + !$omp do private(i,e) + do e = 1, n_element + do i = 1, n + + !........................................................................... + ! x_1 edges + ! Calculate the metric factors, then use them + factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(2,e) + d(3,e))) + factors(2) = w_I(i) * w_0 * L_0p * d(2,e) + factors(3) = w_I(i) * w_0 * L_0p * d(3,e) + + r_e(i, 1,e) = r_e(i, 1,e) & + + factors(1) * u_e(i, 1,e) & + + factors(2) * u_e(i, 2,e) & + + factors(3) * u_e(i, 3,e) + + r_e(i, 2,e) = r_e(i, 2,e) & + + factors(1) * u_e(i, 2,e) & + + factors(2) * u_e(i, 1,e) & + + factors(3) * u_e(i, 4,e) + + r_e(i, 3,e) = r_e(i, 3,e) & + + factors(1) * u_e(i, 3,e) & + + factors(2) * u_e(i, 4,e) & + + factors(3) * u_e(i, 1,e) + + r_e(i, 4,e) = r_e(i, 4,e) & + + factors(1) * u_e(i, 4,e) & + + factors(2) * u_e(i, 3,e) & + + factors(3) * u_e(i, 2,e) + + !........................................................................... + ! x_2 edges + ! Calculate the metric factors, then use them + factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(1,e) + d(3,e))) + factors(2) = w_I(i) * w_0 * L_0p * d(1,e) + factors(3) = w_I(i) * w_0 * L_0p * d(3,e) + + r_e(i, 5,e) = r_e(i, 5,e) & + + factors(1) * u_e(i, 5,e) & + + factors(2) * u_e(i, 6,e) & + + factors(3) * u_e(i, 7,e) + + r_e(i, 6,e) = r_e(i, 6,e) & + + factors(1) * u_e(i, 6,e) & + + factors(2) * u_e(i, 5,e) & + + factors(3) * u_e(i, 8,e) + + r_e(i, 7,e) = r_e(i, 7,e) & + + factors(1) * u_e(i, 7,e) & + + factors(2) * u_e(i, 8,e) & + + factors(3) * u_e(i, 5,e) + + r_e(i, 8,e) = r_e(i, 8,e) & + + factors(1) * u_e(i, 8,e) & + + factors(2) * u_e(i, 7,e) & + + factors(3) * u_e(i, 6,e) + + !........................................................................... + ! x_3 edges + ! 
Calculate the metric factors, then use them + factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(1,e) + d(2,e))) + factors(2) = w_I(i) * w_0 * L_0p * d(1,e) + factors(3) = w_I(i) * w_0 * L_0p * d(2,e) + + r_e(i, 9,e) = r_e(i, 9,e) & + + factors(1) * u_e(i, 9,e) & + + factors(2) * u_e(i,10,e) & + + factors(3) * u_e(i,11,e) + + r_e(i,10,e) = r_e(i,10,e) & + + factors(1) * u_e(i,10,e) & + + factors(2) * u_e(i, 9,e) & + + factors(3) * u_e(i,12,e) + + r_e(i,11,e) = r_e(i,11,e) & + + factors(1) * u_e(i,11,e) & + + factors(2) * u_e(i,12,e) & + + factors(3) * u_e(i, 9,e) + + r_e(i,12,e) = r_e(i,12,e) & + + factors(1) * u_e(i,12,e) & + + factors(2) * u_e(i,11,e) & + + factors(3) * u_e(i,10,e) + + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine HelmholtzOpPrimEdgeToEdge + +!------------------------------------------------------------------------------- +!> \brief Edge to face interaction in the Helmholtz operator r = H u +!> \author Immo Huismann + +subroutine HelmholtzOpPrimEdgeToFace(n,n_element,op,d,u_e,r_f) + integer, intent(in) :: n !< points per inner line + integer, intent(in) :: n_element !< number of elements + class(StandardOperators1D), intent(in) :: op !< 1D operators + real(RDP), intent(in) :: d(0:3,n_element) !< helmholtz coefficients + real(RDP), intent(in) :: u_e( n,N_EDGES,n_element) !< variable u on edges + real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< r = H u on faces + + integer :: e,i,j ! loop indices + + real(RDP) :: w_I(n), L_I0(n), L_Ip(n) ! operators + + ! initialization ............................................................. + + ! Set the row matrices + w_I = op%w(1:n) + L_I0 = op%L(1:n,0 ) * op%w(0) + L_Ip = op%L(1:n,n+1) * op%w(0) + + ! x_1 edges .................................................................. + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_f) + !$acc loop collapse(3) + !$omp do private(i,j,e) + do e = 1, n_element + do j = 1, n + do i = 1, n + + ! x_1 faces + r_f(i,j,1,e) = r_f(i,j, 1,e) & + + d(3,e) * w_I(i ) * L_I0( j) * u_e(i, 5,e) & + + d(3,e) * w_I(i ) * L_Ip( j) * u_e(i, 7,e) & + + d(2,e) * w_I( j) * L_I0(i ) * u_e( j, 9,e) & + + d(2,e) * w_I( j) * L_Ip(i ) * u_e( j,11,e) + + r_f(i,j,2,e) = r_f(i,j, 2,e) & + + d(3,e) * w_I(i ) * L_I0( j) * u_e(i, 6,e) & + + d(3,e) * w_I(i ) * L_Ip( j) * u_e(i, 8,e) & + + d(2,e) * w_I( j) * L_I0(i ) * u_e( j,10,e) & + + d(2,e) * w_I( j) * L_Ip(i ) * u_e( j,12,e) + + ! x_2 faces + r_f(i,j,3,e) = r_f(i,j, 3,e) & + + d(3,e) * w_I(i ) * L_I0( j) * u_e(i, 1,e) & + + d(3,e) * w_I(i ) * L_Ip( j) * u_e(i, 3,e) & + + d(1,e) * w_I( j) * L_I0(i ) * u_e( j, 9,e) & + + d(1,e) * w_I( j) * L_Ip(i ) * u_e( j,10,e) + + r_f(i,j,4,e) = r_f(i,j, 4,e) & + + d(3,e) * w_I(i ) * L_I0( j) * u_e(i, 2,e) & + + d(3,e) * w_I(i ) * L_Ip( j) * u_e(i, 4,e) & + + d(1,e) * w_I( j) * L_I0(i ) * u_e( j,11,e) & + + d(1,e) * w_I( j) * L_Ip(i ) * u_e( j,12,e) + + ! 
x_3 faces + r_f(i,j,5,e) = r_f(i,j, 5,e) & + + d(2,e) * w_I(i ) * L_I0( j) * u_e(i, 1,e) & + + d(2,e) * w_I(i ) * L_Ip( j) * u_e(i, 2,e) & + + d(1,e) * w_I( j) * L_I0(i ) * u_e( j, 5,e) & + + d(1,e) * w_I( j) * L_Ip(i ) * u_e( j, 6,e) + + r_f(i,j,6,e) = r_f(i,j, 6,e) & + + d(2,e) * w_I(i ) * L_I0( j) * u_e(i, 3,e) & + + d(2,e) * w_I(i ) * L_Ip( j) * u_e(i, 4,e) & + + d(1,e) * w_I( j) * L_I0(i ) * u_e( j, 7,e) & + + d(1,e) * w_I( j) * L_Ip(i ) * u_e( j, 8,e) + + end do + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine HelmholtzOpPrimEdgeToFace + +!------------------------------------------------------------------------------- +!> \brief Face to edge interaction in the Helmholtz operator r = H u +!> \author Immo Huismann +!> +!> \todo +!> * Put the computation of d_i into an outer routine + +subroutine HelmholtzOpPrimFaceToEdge(n,n_element,op,d,u_f,r_e) + integer, intent(in) :: n !< points per inner line + integer, intent(in) :: n_element !< number of elements + class(StandardOperators1D), intent(in) :: op !< 1D operators + real(RDP), intent(in) :: d(0:3,n_element) !< helmholtz coefficients + real(RDP), intent(in) :: u_f(n,n,N_FACES,n_element) !< variable u on faces + real(RDP), intent(inout) :: r_e( n,N_EDGES,n_element) !< r = H u on edges + + integer :: e,i,j ! loop indices + + real(RDP) :: w_I(n), L_0I(n), L_pI(n) ! operators + + ! initialization ............................................................. + + ! Set the row matrices + w_I = op%w(1:n) + L_0I = op%L(0 ,1:n) * op%w(0) + L_pI = op%L(n+1,1:n) * op%w(0) + + ! One loop to rule them all .................................................. + + ! A three times nested loop is utilized. One for the matrix multiplication, + ! two for the points themselve. + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_e) + !$acc loop collapse(2) + !$omp do private(i,j,e) + do e = 1, n_element + do i = 1, n + do j = 1, n + r_e(i, 1,e) = r_e(i , 1,e) & + + d(2,e) * w_I(i) * L_0I(j) * u_f(i,j, 5,e) & + + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 3,e) + + r_e(i, 2,e) = r_e(i , 2,e) & + + d(2,e) * w_I(i) * L_pI(j) * u_f(i,j, 5,e) & + + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 4,e) + + r_e(i, 3,e) = r_e(i , 3,e) & + + d(2,e) * w_I(i) * L_0I(j) * u_f(i,j, 6,e) & + + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 3,e) + + r_e(i, 4,e) = r_e(i , 4,e) & + + d(2,e) * w_I(i) * L_pI(j) * u_f(i,j, 6,e) & + + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 4,e) + + r_e(i, 5,e) = r_e(i , 5,e) & + + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 1,e) & + + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 5,e) + + r_e(i, 6,e) = r_e(i , 6,e) & + + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 2,e) & + + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 5,e) + + r_e(i, 7,e) = r_e(i , 7,e) & + + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 1,e) & + + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 6,e) + + r_e(i, 8,e) = r_e(i , 8,e) & + + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 2,e) & + + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 6,e) + + r_e(i, 9,e) = r_e(i , 9,e) & + + d(2,e) * w_I(i) * L_0I(j) * u_f(j,i, 1,e) & + + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 3,e) + + r_e(i,10,e) = r_e(i ,10,e) & + + d(2,e) * w_I(i) * L_0I(j) * u_f(j,i, 2,e) & + + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 3,e) + + r_e(i,11,e) = r_e(i ,11,e) & + + d(2,e) * w_I(i) * L_pI(j) * u_f(j,i, 1,e) & + + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 4,e) + + r_e(i,12,e) = r_e(i ,12,e) & + + d(2,e) * w_I(i) * L_pI(j) * u_f(j,i, 2,e) & + + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 4,e) + + end do + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end 
subroutine HelmholtzOpPrimFaceToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief Face to face interaction in the Helmholtz operator r = H u
+!> \author Immo Huismann
+!>
+!> \todo
+!> * Put the computation of d_i into an outer routine
+!> * Put the treatment of one direction's faces into a separate subroutine
+!>   * leads to a more cache-friendly implementation (most crucial part of this
+!>     module)
+!>
+!> \details
+!> For each edge two loops are present for the Laplace terms. This is due to a
+!> compiler bug (with O3) in ifort.
+
+subroutine HelmholtzOpPrimFaceToFace(n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2)
+  integer,                    intent(in) :: n         !< points per inner line
+  integer,                    intent(in) :: n_element !< number of elements
+  class(StandardOperators1D), intent(in) :: op        !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)              !< helmholtz factors
+  real(RDP), intent(in)    :: u_f    (n,n,N_FACES,n_element) !< variable u on faces
+  real(RDP), intent(inout) :: r_f    (n,n,N_FACES,n_element) !< r = H u on faces
+  real(RDP), intent(out)   :: tmp_f_1(n,n,N_FACES,n_element) !< scratch values
+  real(RDP), intent(out)   :: tmp_f_2(n,n,N_FACES,n_element) !< scratch values
+
+  integer :: i,j ! loop indices
+
+  real(RDP) :: M_II(n,n), w_I(n), L_II(n,n), L_0p, w_0,L_00 ! operators
+
+  ! initialization .............................................................
+
+  ! Set the row matrices
+  w_0  = op%w(0)
+  w_I  = op%w(1:n)
+  L_00 = op%L(0,  0  )
+  L_II = op%L(1:n,1:n)
+  L_0p = op%L(0,  n+1)
+  M_II = reshape([((w_I(i) * w_I(j), i = 1, n), j = 1,n)], [n,n])
+
+  ! Work in each of the three directions separately
+  call FaceToFace&
+    (n,n_element,d,w_0,L_00,L_0p,w_I,M_II,L_II,u_f,r_f,tmp_f_1,tmp_f_2)
+
+
+end subroutine HelmholtzOpPrimFaceToFace
+
+!===============================================================================
+! Suboperators
+
+subroutine FaceToFace&
+    (n,n_element,d,w_0,L_00,L_0p,w_I,M_II,L_I,u_f,r_f,tmp_f_1,tmp_f_2)
+  integer,   intent(in) :: n                !< points per edge
+  integer,   intent(in) :: n_element        !< number of elements
+  real(RDP), intent(in) :: d(0:3,n_element) !< helmholtz coefficients
+  real(RDP), intent(in) :: w_0              !< coefficient (0,0) of mass matrix
+  real(RDP), intent(in) :: L_00             !< coefficient (0,0) of laplace matrix
+  real(RDP), intent(in) :: L_0p             !< coefficient (0,p) of laplace matrix
+  real(RDP), intent(in) :: w_I(n)           !< inner part of mass matrix
+  real(RDP), intent(in) :: M_II(n,n)        !< inner part of 2D mass matrix
+  real(RDP), intent(in) :: L_I (n,n)        !< inner part of laplace matrix
+
+  real(RDP), intent(in)    :: u_f    (n,n,N_FACES,n_element)
+  real(RDP), intent(inout) :: r_f    (n,n,N_FACES,n_element)
+  real(RDP), intent(out)   :: tmp_f_1(n,n,N_FACES,n_element)
+  real(RDP), intent(out)   :: tmp_f_2(n,n,N_FACES,n_element)
+
+  ! laplace factors on faces, in first and second dimension on face
+  real(RDP), allocatable, save :: d_f_1(:,:), d_f_2(:,:)
+
+  integer :: e,i,j
+
+  !.............................................................................
+  ! Initialization of on-face laplace factors
+
+  !$omp single
+  allocate(d_f_1(N_FACES, n_element))
+  allocate(d_f_2(N_FACES, n_element))
+  !$omp end single
+
+  !$omp do private(e)
+  do e = 1, n_element
+    ! metric factor for first dimension of face
+    d_f_1(1, e) = d(2,e)
+    d_f_1(2, e) = d(2,e)
+    d_f_1(3, e) = d(1,e)
+    d_f_1(4, e) = d(1,e)
+    d_f_1(5, e) = d(1,e)
+    d_f_1(6, e) = d(1,e)
+
+    ! 
metric factor for second dimension of face + d_f_2(1, e) = d(3,e) + d_f_2(2, e) = d(3,e) + d_f_2(3, e) = d(3,e) + d_f_2(4, e) = d(3,e) + d_f_2(5, e) = d(2,e) + d_f_2(6, e) = d(2,e) + end do + !$omp end do + + !............................................................................. + ! Mass term + orthogonal components + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_f) + !$acc loop + !$omp do private(i,j,e) + do e = 1, n_element + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,1,e) = r_f(i,j,1,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(1,e) * L_00) * u_f(i,j,1,e) & + + M_II(i,j) * d(1,e) * L_0p * u_f(i,j,2,e) + r_f(i,j,2,e) = r_f(i,j,2,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(1,e) * L_00) * u_f(i,j,2,e) & + + M_II(i,j) * d(1,e) * L_0p * u_f(i,j,1,e) + end do + end do + !$acc end loop + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,3,e) = r_f(i,j,3,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(2,e) * L_00) * u_f(i,j,3,e) & + + M_II(i,j) * d(2,e) * L_0p * u_f(i,j,4,e) + r_f(i,j,4,e) = r_f(i,j,4,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(2,e) * L_00) * u_f(i,j,4,e) & + + M_II(i,j) * d(2,e) * L_0p * u_f(i,j,3,e) + end do + end do + !$acc end loop + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,5,e) = r_f(i,j,5,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(3,e) * L_00) * u_f(i,j,5,e) & + + M_II(i,j) * d(3,e) * L_0p * u_f(i,j,6,e) + r_f(i,j,6,e) = r_f(i,j,6,e) & + + M_II(i,j) * (d(0,e) * w_0 + d(3,e) * L_00) * u_f(i,j,6,e) & + + M_II(i,j) * d(3,e) * L_0p * u_f(i,j,5,e) + end do + end do + !$acc end loop + end do + !$omp end do + !$acc end parallel + + !............................................................................. + ! Laplace factors + + ! Laplace in first direction of face + call AddFactoredMatrixProduct2D(L_I,w_I * w_0,d_f_1,u_f,r_f,tmp_f_1,tmp_f_2) + ! Laplace in second direction of face + call AddFactoredMatrixProduct2D(w_I,L_I * w_0,d_f_2,u_f,r_f,tmp_f_1,tmp_f_2) + + !............................................................................. + ! Epilogue + !$omp single + deallocate(d_f_1,d_f_2) + !$omp end single + +end subroutine FaceToFace + +!=============================================================================== + +end module Condensed_Primary_Part diff --git a/Parser/test-data/specht/matrix_operations.f b/Parser/test-data/specht/matrix_operations.f new file mode 100644 index 0000000..06c9cfd --- /dev/null +++ b/Parser/test-data/specht/matrix_operations.f @@ -0,0 +1,651 @@ +!> \file matrix_operations.f +!> \brief Provides explicit shape routines for simple matrix operations +!> \author Immo Huismann +!> \date 2014/08/14 +!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany +!> +!> \details +!> This module provides routines in explicit size for some simple matrix and +!> array operations. +!> +!> Currently the operations of diagonal matrices on matrices, scalar products +!> and the transposition of triad dimensions are supported. +!> +!> \todo +!> * Create unit tests. +!> * OpenACC and OpenMP in unit tests +!> * Check OpenMP parallelization of the routines +!=============================================================================== + +module Matrix_Operations + ! HiBASE modules + use Kind_Parameters, only: RDP + + ! Specht base modules + use ACC_Parameters, only: ACC_EXEC_QUEUE + implicit none + private + + public :: DiagonalMatrixProduct ! application of a diagonal matrix + public :: AddDiagonalMatrixProduct ! as above, but adding to result vector + public :: InPlaceDiagonalMatrixProduct ! 
as first, but in-place operation
+
+  public :: StackedTranspose    ! transposition of multiple matrices
+  public :: AddStackedTranspose ! additive transposition of multiple matrices
+
+  !-----------------------------------------------------------------------------
+  !> \brief A matrix product with a diagonal matrix as operator
+  !> \author Immo Huismann
+
+  interface DiagonalMatrixProduct
+    module procedure DiagonalMatrixProduct1
+    module procedure DiagonalMatrixProduct2
+    module procedure DiagonalMatrixProduct3
+    module procedure ScaledDiagonalMatrixProduct2
+    module procedure ScaledDiagonalMatrixProduct22
+    module procedure ScaledDiagonalMatrixProduct3
+  end interface DiagonalMatrixProduct
+
+  !-----------------------------------------------------------------------------
+  !> \brief An in-place matrix product with a diagonal matrix as operator
+  !> \author Immo Huismann
+
+  interface InPlaceDiagonalMatrixProduct
+    module procedure InPlaceDiagonalMatrixProduct3
+    module procedure ScaledInPlaceDiagonalMatrixProduct3
+  end interface InPlaceDiagonalMatrixProduct
+
+  !-----------------------------------------------------------------------------
+  !> \brief An additive matrix product with a diagonal matrix as operator
+  !> \author Immo Huismann
+
+  interface AddDiagonalMatrixProduct
+    module procedure AddDiagonalMatrixProduct1
+    module procedure AddDiagonalMatrixProduct2
+    module procedure AddDiagonalMatrixProduct3
+    module procedure AddScaledDiagonalMatrixProduct2
+  end interface AddDiagonalMatrixProduct
+
+contains
+
+! Wrappers =====================================================================
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine DiagonalMatrixProduct1(diag,values,res)
+  real(RDP), intent(in)   :: diag  (:)       !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: values(:,:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,2),size(values,3),size(values,4)])
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct1
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine DiagonalMatrixProduct2(diag,values,res)
+  real(RDP), intent(in)   :: diag  (:,:)     !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: values(:,:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,3),size(values,4)])
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine DiagonalMatrixProduct3(diag,values,res)
+  real(RDP), intent(in)   :: diag  (:,:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: values(:,:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values in place
+!> \author Immo Huismann
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values in place
+!> \author Immo Huismann
+
+subroutine InPlaceDiagonalMatrixProduct3(diag,values)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(inout) :: values(:,:,:,:) !< Input data
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call InPlaceDiagonalMatrixProduct_ES(k,m,diag,values)
+
+end subroutine InPlaceDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a scaled diagonal matrix to the values in place
+!> \author Immo Huismann
+
+subroutine ScaledInPlaceDiagonalMatrixProduct3(diag,scales,values)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: scales(    :)   !< elementwise scale
+  real(RDP), intent(inout) :: values(:,:,:,:) !< Input data
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledInPlaceDiagonalMatrixProduct_ES(k,m,diag,scales,values)
+
+end subroutine ScaledInPlaceDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a scaled diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct2(diag,scales,values,res)
+  real(RDP), intent(in)   :: diag  (:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: scales  (:)   !< Scaling factors
+  real(RDP), intent(in)   :: values(:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a scaled diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct22(diag,scales,values,res)
+  real(RDP), intent(in)   :: diag  (:,:)     !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: scales  (:,:)   !< Scaling factors
+  real(RDP), intent(in)   :: values(:,:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct22
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a scaled diagonal matrix to the values
+!> \author Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct3(diag,scales,values,res)
+  real(RDP), intent(in)   :: diag  (:,:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: scales  (:)     !< Scaling factors
+  real(RDP), intent(in)   :: values(:,:,:,:) !< Input data
+  real(RDP), intent( out) :: res   (:,:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct3
+
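+! Editor's note (sketch of the calling convention, not original text): the
+! three families above differ only in what happens to the output; with D the
+! diagonal operator,
+!
+!   call DiagonalMatrixProduct(d,u,r)      ! r <-     D u
+!   call InPlaceDiagonalMatrixProduct(d,u) ! u <-     D u
+!   call AddDiagonalMatrixProduct(d,u,r)   ! r <- r + D u
+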
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values, adding to the result
+!> \author Immo Huismann
+
+subroutine AddDiagonalMatrixProduct1(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:)       !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,2),size(values,3),size(values,4)])
+
+  call AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct1
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values, adding to the result
+!> \author Immo Huismann
+
+subroutine AddDiagonalMatrixProduct2(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)     !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,3),size(values,4)])
+
+  call AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a diagonal matrix to the values, adding to the result
+!> \author Immo Huismann
+
+subroutine AddDiagonalMatrixProduct3(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief Applies a scaled diagonal matrix to the values, adding to the result
+!> \author Immo Huismann
+
+subroutine AddScaledDiagonalMatrixProduct2(diag,scales,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)   !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: scales  (:)   !< Scaling factors
+  real(RDP), intent(in)    :: values(:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call AddScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine AddScaledDiagonalMatrixProduct2
+
+! Implementations ==============================================================
+
+!-------------------------------------------------------------------------------
+!> \brief Provides a diagonal matrix multiplication
+!> \author Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine DiagonalMatrixProduct_ES(k,m,diag,values,res)
+  integer  , intent(in)   :: k           !< first dim of values (size of diag)
+  integer  , intent(in)   :: m           !< second dim of values
+  real(RDP), intent(in)   :: diag  (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: values(k,m) !< Input data
+  real(RDP), intent( out) :: res   (k,m) !< Matrix to store the result in
+
+  integer :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting without the
+  ! copy process being finished.
+
+  !$acc data present(values,res) copyin(diag) async(ACC_EXEC_QUEUE)
+
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop independent collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      res(i,j) = diag(i) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine DiagonalMatrixProduct_ES
+
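+! Editor's note: the rank-4 wrappers above pass their arrays on to the
+! explicit-shape rank-2 dummies of the *_ES routines; Fortran argument
+! association then lets the routine traverse the same storage as one k x m
+! matrix of k-vectors. A minimal sketch (extents are assumptions for
+! illustration; production code should call the wrappers instead):
+!
+!   real(RDP) :: diag(4), u(4,3,3,5), r(4,3,3,5)
+!   call DiagonalMatrixProduct_ES(4, 3*3*5, diag, u, r)
+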
+!-------------------------------------------------------------------------------
+!> \brief Provides a diagonal matrix multiplication
+!> \author Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine InPlaceDiagonalMatrixProduct_ES(k,m,diag,values)
+  integer  , intent(in)    :: k           !< first dim of values (size of diag)
+  integer  , intent(in)    :: m           !< second dim of values
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(inout) :: values(k,m) !< data to work on
+
+  integer :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting without the
+  ! copy process being finished.
+
+  !$acc data present(values) copyin(diag) async(ACC_EXEC_QUEUE)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      values(i,j) = diag(i) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine InPlaceDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief Provides a diagonal matrix multiplication
+!> \author Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine ScaledDiagonalMatrixProduct_ES(k,m,diag,factors,values,res)
+  integer  , intent(in)   :: k            !< first dim of values (size of diag)
+  integer  , intent(in)   :: m            !< second dim of values
+  real(RDP), intent(in)   :: diag   (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(in)   :: factors(  m) !< Scaling factors
+  real(RDP), intent(in)   :: values (k,m) !< Input data
+  real(RDP), intent( out) :: res    (k,m) !< Matrix to store the result in
+
+  integer :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting without the
+  ! copy process being finished.
+
+  !$acc data present(values,res) copyin(diag, factors) async(ACC_EXEC_QUEUE)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      res(i,j) = diag(i) * factors(j) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine ScaledDiagonalMatrixProduct_ES
+
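+! Editor's note: the `!$omp do` directives in these kernels are orphaned
+! worksharing constructs; they distribute the loops across threads only when
+! the caller supplies the enclosing parallel region. A sketch, assuming
+! host-side OpenMP execution and suitably shaped arrays:
+!
+!   !$omp parallel
+!   call InPlaceDiagonalMatrixProduct(diag, values)
+!   !$omp end parallel
+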
+!-------------------------------------------------------------------------------
+!> \brief Provides a diagonal matrix multiplication
+!> \author Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine AddScaledDiagonalMatrixProduct_ES(k,m,diag,factors,values,res)
+  integer  , intent(in)    :: k            !< first dim of values (size of diag)
+  integer  , intent(in)    :: m            !< second dim of values
+  real(RDP), intent(in)    :: diag   (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: factors(  m) !< Scaling factors
+  real(RDP), intent(in)    :: values (k,m) !< Input data
+  real(RDP), intent(inout) :: res    (k,m) !< Matrix to store the result in
+
+  integer :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting without the
+  ! copy process being finished.
+
+  !$acc data present(values,res) copyin(diag, factors) async(ACC_EXEC_QUEUE)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      res(i,j) = res(i,j) + diag(i) * factors(j) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddScaledDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief Provides a diagonal matrix multiplication
+!> \author Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine ScaledInPlaceDiagonalMatrixProduct_ES(k,m,diag,scales,values)
+  integer  , intent(in)    :: k           !< first dim of values (size of diag)
+  integer  , intent(in)    :: m           !< second dim of values
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: scales(  m) !< elementwise scale
+  real(RDP), intent(inout) :: values(k,m) !< data to work on
+
+  integer :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting without the
+  ! copy process being finished.
+
+  !$acc data present(values) copyin(diag,scales) async(ACC_EXEC_QUEUE)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      values(i,j) = diag(i) * scales(j) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine ScaledInPlaceDiagonalMatrixProduct_ES
+
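+! Editor's note (restating the kernels above as matrix algebra): with
+! D = diag(diag) and S = diag(factors/scales), the scaled kernels compute
+! R = D V S elementwise, i.e. res(i,j) = diag(i) * factors(j) * values(i,j);
+! the Add* variants accumulate, R <- R + D V S, and the InPlace* variants
+! overwrite V itself.
+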
+!-------------------------------------------------------------------------------
+!> Same as DiagonalMatrixProduct_ES, only differing by adding to res instead of
+!> setting it to the result of diag times values.
+
+subroutine AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+  integer  , intent(in)    :: k           !< first dim of values (size of diag)
+  integer  , intent(in)    :: m           !< second dim of values
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply
+  real(RDP), intent(in)    :: values(k,m) !< Input data
+  real(RDP), intent(inout) :: res   (k,m) !< Matrix to store the result in
+
+  integer :: i,j ! loop indices
+
+  ! copying in the asynchronous execution queue, so that the computation is
+  ! not hindered
+  !$acc data present(values,res) copyin(diag) async(ACC_EXEC_QUEUE)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+    do i = 1, k
+      ! point-wise multiplication
+      res(i,j) = res(i,j) + diag(i) * values(i,j)
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief A transpose operation working on the first two dimensions of a triad
+!> \author Immo Huismann
+!>
+!> \details
+!> By itself, this routine is useless. Its goal lies in providing a means of
+!> directly working on different dimensions of a data set by using the first
+!> dimension. This in turn enables a better performance, as the cache lines are
+!> fully utilized, or on a GPGPU the memory accesses are coalesced.
+
+subroutine StackedTranspose(n_1,n_2,n_stack,in,out)
+  integer  , intent(in) :: n_1     !< first dimension of in, second of out
+  integer  , intent(in) :: n_2     !< second dimension of in, first of out
+  integer  , intent(in) :: n_stack !< stacked dimension of the transpose
+
+  real(RDP), intent(in)  :: in (n_1, n_2, n_stack) !< input vector
+  real(RDP), intent(out) :: out(n_2, n_1, n_stack) !< its transpose
+
+  integer :: i,j,k ! loop indices
+
+  ! Transposing the first two dimensions. As typically storing data is more
+  ! costly than loading data, the j loop is chosen as the inner loop. This
+  ! delivers slightly better performance, as the memory writes are aligned.
+
+  !$acc data present(in,out)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(3) private(i,j,k)
+  !$omp do private(i,j,k)
+  do k = 1,n_stack
+    do i = 1, n_1
+      do j = 1, n_2
+        out(j,i,k) = in(i,j,k)
+      end do
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine StackedTranspose
+
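+! Editor's sketch of the access pattern (illustration only; extents are
+! assumed, and the !$acc data region requires the arrays to be on the device).
+! For each stack slice the routine writes out(j,i,k) = in(i,j,k); with j as
+! the innermost loop the writes out(1:n_2,i,k) are unit-stride, which is what
+! the comment above means by aligned memory writes:
+!
+!   real(RDP) :: a(8,4,100), b(4,8,100)
+!   call StackedTranspose(8, 4, 100, a, b)   ! b(j,i,s) == a(i,j,s)
+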
+!-------------------------------------------------------------------------------
+!> \brief A transpose operation working on the first two dimensions of a triad.
+!> \author Immo Huismann
+!>
+!> \details
+!> By itself, this routine is useless. Its goal lies in providing a means of
+!> directly working on different dimensions of a data set by using the first
+!> dimension. This in turn enables a better performance, as the cache lines are
+!> fully utilized, or on a GPGPU the memory accesses are coalesced.
+
+subroutine AddStackedTranspose(n_1,n_2,n_stack,in,res)
+  integer  , intent(in   ) :: n_1     !< first dimension of in, second of out
+  integer  , intent(in   ) :: n_2     !< second dimension of in, first of out
+  integer  , intent(in   ) :: n_stack !< stacked dimension of the transpose
+
+  real(RDP), intent(in   ) :: in (n_1, n_2, n_stack) !< input vector
+  real(RDP), intent(inout) :: res(n_2, n_1, n_stack) !< result the transpose is added to
+
+  integer :: i,j,k ! loop indices
+
+  ! Transposing the first two dimensions. As typically storing data is more
+  ! costly than loading data, the j loop is chosen as the inner loop. This
+  ! delivers slightly better performance, as the memory writes are aligned.
+
+  !$acc data present(in,res)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(3) private(i,j,k)
+  !$omp do private(i,j,k)
+  do k = 1,n_stack
+    do i = 1, n_1
+      do j = 1, n_2
+        res(j,i,k) = res(j,i,k) + in(i,j,k)
+      end do
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddStackedTranspose
+
+!===============================================================================
+
+end module Matrix_Operations
diff --git a/Parser/test-data/specht/tiny_matrix_products_explicit.f b/Parser/test-data/specht/tiny_matrix_products_explicit.f
new file mode 100644
index 0000000..5f66425
--- /dev/null
+++ b/Parser/test-data/specht/tiny_matrix_products_explicit.f
@@ -0,0 +1,1386 @@
+!> \file tiny_matrix_products_explicit.f
+!> \brief matrix products, for known matrix sizes
+!> \author Immo Huismann
+!> \date 2015/04/20
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!===============================================================================
+
+module Tiny_Matrix_Products_Explicit
+  use Kind_Parameters, only: RDP
+  use ACC_Parameters,  only: ACC_EXEC_QUEUE
+  implicit none
+  private
+
+  public :: SquareTransposeMatrixProduct_Generated
+  public :: AddSquareTransposeMatrixProduct_Generated
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C, generated version choosing different implementations
+!> \author Immo Huismann
+
+subroutine AddSquareTransposeMatrixProduct_Generated(k,m,A,B,C)
+  integer, intent(in) :: k!< Matrix dim.
+  integer, intent(in) :: m!< number of vectors
+  real(RDP), intent(in) :: A(k,k)!< matrix to multiply with
+  real(RDP), intent(in) :: B(k,m)!< matrix to multiply
+  real(RDP), intent(inout) :: C(k,m)!< result
+
+  select case(k)
+  case( 1)
+    call AddSquareTransposeMatrixProduct_01(m,A,B,C)
+  case( 2)
+    call AddSquareTransposeMatrixProduct_02(m,A,B,C)
+  case( 3)
+    call AddSquareTransposeMatrixProduct_03(m,A,B,C)
+  case( 4)
+    call AddSquareTransposeMatrixProduct_04(m,A,B,C)
+  case( 5)
+    call AddSquareTransposeMatrixProduct_05(m,A,B,C)
+  case( 6)
+    call AddSquareTransposeMatrixProduct_06(m,A,B,C)
+  case( 7)
+    call AddSquareTransposeMatrixProduct_07(m,A,B,C)
+  case( 8)
+    call AddSquareTransposeMatrixProduct_08(m,A,B,C)
+  case( 9)
+    call AddSquareTransposeMatrixProduct_09(m,A,B,C)
+  case(10)
+    call AddSquareTransposeMatrixProduct_10(m,A,B,C)
+  case(11)
+    call AddSquareTransposeMatrixProduct_11(m,A,B,C)
+  case(12)
+    call AddSquareTransposeMatrixProduct_12(m,A,B,C)
+  case(13)
+    call AddSquareTransposeMatrixProduct_13(m,A,B,C)
+  case(14)
+    call AddSquareTransposeMatrixProduct_14(m,A,B,C)
+  case(15)
+    call AddSquareTransposeMatrixProduct_15(m,A,B,C)
+  case(16)
+    call AddSquareTransposeMatrixProduct_16(m,A,B,C)
+  case default
+    call AddSquareTransposeMatrixProduct_XX(k,m,A,B,C)
+  end select
+
+end subroutine AddSquareTransposeMatrixProduct_Generated
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size k x k
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_XX(k,m,A_T,B,C)
+  integer,   intent(in)    :: k
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(k,k)
+  real(RDP), intent(in)    :: B  (k,m)
+  real(RDP), intent(inout) :: C  (k,m)
+
+  integer   :: i, j, l ! loop indices
+  real(RDP) :: tmp     ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,l,tmp) gang worker vector
+  !$omp do private(i,j,l,tmp)
+  do j = 1, m
+    do i = 1, k
+      tmp = 0
+      do l = 1, k
+        tmp = tmp + A_T(l,i) * B(l,j)
+      end do
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_XX
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 1 x 1
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_01(m,A_T,B,C)
+  integer,   parameter     :: N = 1
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_01
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 2 x 2
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_02(m,A_T,B,C)
+  integer,   parameter     :: N = 2
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_02
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 3 x 3
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_03(m,A_T,B,C)
+  integer,   parameter     :: N = 3
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_03
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 4 x 4
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_04(m,A_T,B,C)
+  integer,   parameter     :: N = 4
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_04
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 5 x 5
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_05(m,A_T,B,C)
+  integer,   parameter     :: N = 5
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_05
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 6 x 6
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_06(m,A_T,B,C)
+  integer,   parameter     :: N = 6
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_06
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 7 x 7
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_07(m,A_T,B,C)
+  integer,   parameter     :: N = 7
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_07
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 8 x 8
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_08(m,A_T,B,C)
+  integer,   parameter     :: N = 8
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_08
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 9 x 9
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_09(m,A_T,B,C)
+  integer,   parameter     :: N = 9
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_09
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 10 x 10
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_10(m,A_T,B,C)
+  integer,   parameter     :: N = 10
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_10
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 11 x 11
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_11(m,A_T,B,C)
+  integer,   parameter     :: N = 11
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_11
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 12 x 12
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_12(m,A_T,B,C)
+  integer,   parameter     :: N = 12
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_12
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 13 x 13
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_13(m,A_T,B,C)
+  integer,   parameter     :: N = 13
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_13
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 14 x 14
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_14(m,A_T,B,C)
+  integer,   parameter     :: N = 14
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j) &
+        + A_T(14,i) * B(14,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_14
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 15 x 15
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_15(m,A_T,B,C)
+  integer,   parameter     :: N = 15
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j) &
+        + A_T(14,i) * B(14,j) &
+        + A_T(15,i) * B(15,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_15
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B + C for small square matrix A of size 16 x 16
+!> \author Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_16(m,A_T,B,C)
+  integer,   parameter     :: N = 16
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(N,N)
+  real(RDP), intent(in)    :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j) &
+        + A_T(14,i) * B(14,j) &
+        + A_T(15,i) * B(15,j) &
+        + A_T(16,i) * B(16,j)
+
+      C(i,j) = C(i,j) + tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_16
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B, generated version choosing different implementations
+!> \author Immo Huismann
+
+subroutine SquareTransposeMatrixProduct_Generated(k,m,A,B,C)
+  integer, intent(in) :: k!< Matrix dim.
+ integer, intent(in) :: m!< number of vectors + real(RDP), intent(in) :: A(k,k)!< matrix to multiply with + real(RDP), intent(in) :: B(k,m)!< matrix to multiply + real(RDP), intent(out) :: C(k,m)!< result + + select case(k) + case( 1) + call SquareTransposeMatrixProduct_01(m,A,B,C) + case( 2) + call SquareTransposeMatrixProduct_02(m,A,B,C) + case( 3) + call SquareTransposeMatrixProduct_03(m,A,B,C) + case( 4) + call SquareTransposeMatrixProduct_04(m,A,B,C) + case( 5) + call SquareTransposeMatrixProduct_05(m,A,B,C) + case( 6) + call SquareTransposeMatrixProduct_06(m,A,B,C) + case( 7) + call SquareTransposeMatrixProduct_07(m,A,B,C) + case( 8) + call SquareTransposeMatrixProduct_08(m,A,B,C) + case( 9) + call SquareTransposeMatrixProduct_09(m,A,B,C) + case(10) + call SquareTransposeMatrixProduct_10(m,A,B,C) + case(11) + call SquareTransposeMatrixProduct_11(m,A,B,C) + case(12) + call SquareTransposeMatrixProduct_12(m,A,B,C) + case(13) + call SquareTransposeMatrixProduct_13(m,A,B,C) + case(14) + call SquareTransposeMatrixProduct_14(m,A,B,C) + case(15) + call SquareTransposeMatrixProduct_15(m,A,B,C) + case(16) + call SquareTransposeMatrixProduct_16(m,A,B,C) + case default + call SquareTransposeMatrixProduct_XX(k,m,A,B,C) + end select + +end subroutine SquareTransposeMatrixProduct_Generated +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size k x k +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_XX(k,m,A_T,B,C) + integer, intent(in) :: k + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(k,k) + real(RDP), intent(in) :: B (k,m) + real(RDP), intent(out) :: C (k,m) + + integer :: i, j, l ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,l,tmp) gang worker vector + !$omp do private(i,j,l,tmp) + do j = 1, m + do i = 1, k + tmp = 0 + do l = 1, k + tmp = tmp + A_T(l,i) * B(l,j) + end do + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_XX + + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 1 x 1 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_01(m,A_T,B,C) + integer, parameter :: N = 1 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_01 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 2 x 2 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_02(m,A_T,B,C) + integer, parameter :: N = 2 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! 
reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_02 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 3 x 3 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_03(m,A_T,B,C) + integer, parameter :: N = 3 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_03 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 4 x 4 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_04(m,A_T,B,C) + integer, parameter :: N = 4 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_04 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 5 x 5 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_05(m,A_T,B,C) + integer, parameter :: N = 5 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_05 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 6 x 6 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_06(m,A_T,B,C) + integer, parameter :: N = 6 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! 
reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_06 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 7 x 7 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_07(m,A_T,B,C) + integer, parameter :: N = 7 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_07 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 8 x 8 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_08(m,A_T,B,C) + integer, parameter :: N = 8 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_08 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 9 x 9 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_09(m,A_T,B,C) + integer, parameter :: N = 9 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! 
reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_09 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 10 x 10 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_10(m,A_T,B,C) + integer, parameter :: N = 10 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) & + + A_T(10,i) * B(10,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_10 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 11 x 11 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_11(m,A_T,B,C) + integer, parameter :: N = 11 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) & + + A_T(10,i) * B(10,j) & + + A_T(11,i) * B(11,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_11 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 12 x 12 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_12(m,A_T,B,C) + integer, parameter :: N = 12 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! 
reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) & + + A_T(10,i) * B(10,j) & + + A_T(11,i) * B(11,j) & + + A_T(12,i) * B(12,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_12 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 13 x 13 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_13(m,A_T,B,C) + integer, parameter :: N = 13 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) & + + A_T(10,i) * B(10,j) & + + A_T(11,i) * B(11,j) & + + A_T(12,i) * B(12,j) & + + A_T(13,i) * B(13,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_13 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 14 x 14 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_14(m,A_T,B,C) + integer, parameter :: N = 14 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! reduction variable + + !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE) + !$acc loop collapse(2) private(i,j,tmp) + !$omp do private(i,j,tmp) + do j = 1, m + do i = 1, N + tmp = & + + A_T( 1,i) * B( 1,j) & + + A_T( 2,i) * B( 2,j) & + + A_T( 3,i) * B( 3,j) & + + A_T( 4,i) * B( 4,j) & + + A_T( 5,i) * B( 5,j) & + + A_T( 6,i) * B( 6,j) & + + A_T( 7,i) * B( 7,j) & + + A_T( 8,i) * B( 8,j) & + + A_T( 9,i) * B( 9,j) & + + A_T(10,i) * B(10,j) & + + A_T(11,i) * B(11,j) & + + A_T(12,i) * B(12,j) & + + A_T(13,i) * B(13,j) & + + A_T(14,i) * B(14,j) + + C(i,j) = tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine SquareTransposeMatrixProduct_14 + +!------------------------------------------------------------------------------- +!> \brief C <- A B for small square matrix A of size 15 x 15 +!> \author Immo Huismann +subroutine SquareTransposeMatrixProduct_15(m,A_T,B,C) + integer, parameter :: N = 15 + integer, intent(in) :: m + real(RDP), intent(in) :: A_T(N,N) + real(RDP), intent(in) :: B (N,m) + real(RDP), intent(out) :: C (N,m) + + integer :: i, j ! loop indices + real(RDP) :: tmp ! 
reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j) &
+        + A_T(14,i) * B(14,j) &
+        + A_T(15,i) * B(15,j)
+
+      C(i,j) = tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_15
+
+!-------------------------------------------------------------------------------
+!> \brief C <- A B for small square matrix A of size 16 x 16
+!> \author Immo Huismann
+subroutine SquareTransposeMatrixProduct_16(m,A_T,B,C)
+  integer,   parameter   :: N = 16
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+    do i = 1, N
+      tmp = &
+        + A_T( 1,i) * B( 1,j) &
+        + A_T( 2,i) * B( 2,j) &
+        + A_T( 3,i) * B( 3,j) &
+        + A_T( 4,i) * B( 4,j) &
+        + A_T( 5,i) * B( 5,j) &
+        + A_T( 6,i) * B( 6,j) &
+        + A_T( 7,i) * B( 7,j) &
+        + A_T( 8,i) * B( 8,j) &
+        + A_T( 9,i) * B( 9,j) &
+        + A_T(10,i) * B(10,j) &
+        + A_T(11,i) * B(11,j) &
+        + A_T(12,i) * B(12,j) &
+        + A_T(13,i) * B(13,j) &
+        + A_T(14,i) * B(14,j) &
+        + A_T(15,i) * B(15,j) &
+        + A_T(16,i) * B(16,j)
+
+      C(i,j) = tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_16
+
+!===============================================================================
+
+end module Tiny_Matrix_Products_Explicit
diff --git a/Parser/test-data/specht/transformed_condensed_part.f b/Parser/test-data/specht/transformed_condensed_part.f
new file mode 100644
index 0000000..e6d4828
--- /dev/null
+++ b/Parser/test-data/specht/transformed_condensed_part.f
@@ -0,0 +1,151 @@
+!> \file transformed_condensed_part.f
+!> \brief Condensed part for condensed face eigenspace solver
+!> \author Immo Huismann
+!> \date 2015/05/23
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> This module provides the condensed part for the condensed system eigenspace
+!> solver.
+!===============================================================================
+
+module Transformed_Condensed_Part
+  use Kind_Parameters, only: RDP
+  use Constants,       only: HALF
+
+  use ACC_Parameters,              only: ACC_EXEC_QUEUE
+  use Condensed_SEM_Operators,     only: CondensedOperators1D
+  use Geometry_Coefficients,       only: GetLaplaceCoefficients
+  use Element_Boundary_Parameters, only: N_FACES
+  implicit none
+  private
+
+  public :: SubtractTransformedCondensedPart
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief Subtracts the condensed part of the eigenspace system from r.
+!> \author Immo Huismann
+!>
+!> \details
+!> Serves as a wrapper to the real routine, but with far fewer arguments.
+ +subroutine SubtractTransformedCondensedPart(cop,d,D_inv,u_f,r_f) + class(CondensedOperators1D), intent(in) :: cop !< condensed operators + real(RDP), intent(in) :: d(0:,:) !< Helmholtz parameters + real(RDP), intent(in) :: D_inv(:,:,:,:) !< eigenvalues of iHii + real(RDP), intent(in) :: u_f(:,:,:,:) !< variable u on faces + real(RDP), intent(inout) :: r_f(:,:,:,:) !< result on faces + + integer :: n, n_element ! points per edge, number of elements + + ! Prologue ................................................................... + n = size(D_inv,1) + n_element = size(D_inv,4) + + ! computation ................................................................ + + ! map from faces to inner element eigenspace. + ! This is done with the transpose of the transformation matrix and the + ! Helmholtz suboperator corresponding to the specific face + + call Operator(n,n_element,cop%S_T_L_I0,cop%S_T_L_Ip,d,u_f,D_inv,r_f) + + +end subroutine SubtractTransformedCondensedPart + +!------------------------------------------------------------------------------- +!> \brief Explicit size implementation of the transformed condensed part +!> \author Immo Huismann +!> +!> \details +!> This implementation uses a small working set (one temporary), leading to few +!> load and store operations from memory. + +subroutine Operator(n,n_element,S_T_L_I0,S_T_L_Ip,d,u_f,D_inv,r_f) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of elements + real(RDP), intent(in) :: S_T_L_I0(n) !< \f$S^{T} L_{I0}\f$ + real(RDP), intent(in) :: S_T_L_Ip(n) !< \f$S^{T} L_{Ip}\f$ + real(RDP), intent(in) :: d(0:3,n_element) !< Helmholtz coefficients + real(RDP), intent(in) :: u_f ( n,n,N_FACES,n_element) !< u face values + real(RDP), intent(in) :: D_inv(n,n,n,n_element) !< D^{-1} + real(RDP), intent(inout) :: r_f ( n,n,N_FACES,n_element) !< r face values + + real(RDP) :: v(n,n,n) + real(RDP) :: L_0I_S(n), L_pI_S(n) + integer :: i,j,k,e + + ! Small structure exploitation + L_0I_S = S_T_L_I0 + L_pI_S = S_T_L_Ip + + ! Loop over all elements + !$acc parallel async(ACC_EXEC_QUEUE) present(d,D_inv,u_f,r_f) + !$acc loop private(v) + !$omp do private(i,j,k,e,v) + do e = 1, n_element + ! Gather contributions ..................................................... + !$acc loop collapse(3) + do k = 1, n + do j = 1, n + do i = 1, n + v(i,j,k) = d(1,e) * S_T_L_I0(i) * u_f( j,k,1,e) & + + d(1,e) * S_T_L_Ip(i) * u_f( j,k,2,e) & + + d(2,e) * S_T_L_I0(j) * u_f(i, k,3,e) & + + d(2,e) * S_T_L_Ip(j) * u_f(i, k,4,e) & + + d(3,e) * S_T_L_I0(k) * u_f(i,j, 5,e) & + + d(3,e) * S_T_L_Ip(k) * u_f(i,j, 6,e) + + v(i,j,k) = D_inv(i,j,k,e) * v(i,j,k) + end do + end do + end do + !$acc end loop + + ! Scatter contributions .................................................... + ! First direction + !$acc loop collapse(2) + do k = 1, n + do j = 1, n + do i = 1, n + r_f( j,k,1,e) = r_f( j,k,1,e) - d(1,e) * L_0I_S(i) * v(i,j,k) + r_f( j,k,2,e) = r_f( j,k,2,e) - d(1,e) * L_pI_S(i) * v(i,j,k) + end do + end do + end do + + ! Second direction + !$acc loop collapse(2) + do k = 1, n + do i = 1, n + do j = 1, n + r_f(i, k,3,e) = r_f(i, k,3,e) - d(2,e) * L_0I_S(j) * v(i,j,k) + r_f(i, k,4,e) = r_f(i, k,4,e) - d(2,e) * L_pI_S(j) * v(i,j,k) + end do + end do + end do + !$acc end loop + + ! 
Third direction
+    do k = 1, n
+      !$acc loop collapse(2)
+      do j = 1, n
+        do i = 1, n
+          r_f(i,j, 5,e) = r_f(i,j, 5,e) - d(3,e) * L_0I_S(k) * v(i,j,k)
+          r_f(i,j, 6,e) = r_f(i,j, 6,e) - d(3,e) * L_pI_S(k) * v(i,j,k)
+        end do
+      end do
+      !$acc end loop
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine Operator
+
+!===============================================================================
+
+end module Transformed_Condensed_Part
diff --git a/Parser/test-data/specht/transformed_primary_part.f b/Parser/test-data/specht/transformed_primary_part.f
new file mode 100644
index 0000000..626ce4b
--- /dev/null
+++ b/Parser/test-data/specht/transformed_primary_part.f
@@ -0,0 +1,939 @@
+!> \file   eigenspace_primary_part.f
+!> \brief  Primary part for condensed face eigenspace solver
+!> \author Immo Huismann
+!> \date   2015/05/20
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> This module provides the primary part for the condensed system eigenspace
+!> solver.
+!>
+!> \todo
+!> * The OpenACC queues could be used for latency hiding via task parallelism
+!===============================================================================
+
+module Transformed_Primary_Part
+  ! HiBASE
+  use Kind_Parameters, only: RDP
+
+  ! Specht
+  use ACC_Parameters,              only: ACC_EXEC_QUEUE
+  use Condensed_SEM_Operators,     only: CondensedOperators1D
+  use Element_Boundary_Parameters, only: N_FACES, N_EDGES, N_VERTICES
+  implicit none
+  private
+
+  public :: AddTransformedPrimaryPart
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief  Adds the primary part to the result vector.
+!> \author Immo Huismann
+!>
+!> \details
+!> * Assumes that d,u_f,u_e,u_v,r_f,r_e,r_v are on the accelerator
+
+subroutine AddTransformedPrimaryPart(cop,d,u_f,u_e,u_v,r_f,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop !< 1D condensed operators
+  real(RDP), intent(in)    :: d(0:,:)      !< Helmholtz coefficients
+  real(RDP), intent(in)    :: u_f(:,:,:,:) !< u values on faces
+  real(RDP), intent(in)    :: u_e( :,:,:)  !< u values on edges
+  real(RDP), intent(in)    :: u_v( :,:)    !< u values on vertices
+  real(RDP), intent(inout) :: r_f(:,:,:,:) !< r values on faces
+  real(RDP), intent(inout) :: r_e( :,:,:)  !< r values on edges
+  real(RDP), intent(inout) :: r_v( :,:)    !< r values on vertices
+
+  ! Add the single contributions
+
+  call AddFaceContribution  (cop,d,u_f,         r_f,r_e    )
+  call AddEdgeContribution  (cop,d,     u_e,    r_f,r_e,r_v)
+  call AddVertexContribution(cop,d,         u_v,    r_e,r_v)
+
+end subroutine AddTransformedPrimaryPart
+
+!===============================================================================
+! Internals
+
+!-------------------------------------------------------------------------------
+!> \brief  Adds the contribution of the faces to the residual
+!> \author Immo Huismann
+
+subroutine AddFaceContribution(cop,d,u_f,r_f,r_e)
+  class(CondensedOperators1D), intent(in) :: cop
+  real(RDP), intent(in)    :: d(0:,:)      !< metric coefficients  (0:3, n_e)
+  real(RDP), intent(in)    :: u_f(:,:,:,:) !< values on faces      (n,n,6, n_e)
+  real(RDP), intent(inout) :: r_f(:,:,:,:) !< residual on faces    (n,n,6, n_e)
+  real(RDP), intent(inout) :: r_e( :,:,:)  !< residual on edges    ( n,12,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+
+  n         = size(u_f,1)
+  n_element = size(u_f,4)
+
+  M_00 = cop%w(0)
+  L_00 = cop%L(0,0)
+  L_0p = cop%L(0,1+n)
+
+  call FaceToEdge(n,n_element,M_00, cop%L_0I_S,cop%L_pI_S,d,u_f,r_e)
+  call FaceToFace(n,n_element,M_00,L_00,L_0p,cop%eig,     d,u_f,r_f)
+
+end subroutine AddFaceContribution
+
+!-------------------------------------------------------------------------------
+!> \brief  Adds the contribution of the edges to the residual
+!> \author Immo Huismann
+
+subroutine AddEdgeContribution(cop,d,u_e,r_f,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop
+  real(RDP), intent(in)    :: d(0:,:)      !< metric coefficients  (0:3, n_e)
+  real(RDP), intent(in)    :: u_e( :,:,:)  !< values on the edges  ( n,12,n_e)
+  real(RDP), intent(inout) :: r_f(:,:,:,:) !< residual on faces    (n,n, 6,n_e)
+  real(RDP), intent(inout) :: r_e( :,:,:)  !< residual on edges    ( n,12,n_e)
+  real(RDP), intent(inout) :: r_v( :,:)    !< residual on vertices ( 8,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+  real(RDP) :: L_0I   (size(cop%S,1)), L_pI   (size(cop%S,1))
+  real(RDP) :: L_0I_S (size(cop%S,1)), L_pI_S (size(cop%S,1))
+  real(RDP) :: L_I0   (size(cop%S,1)), L_Ip   (size(cop%S,1))
+  real(RDP) :: S_T_LI0(size(cop%S,1)), S_T_LIp(size(cop%S,1))
+  real(RDP) :: eig(size(cop%eig))
+
+  n         = size(u_e,1)
+  n_element = size(u_e,3)
+
+  M_00 = cop%w(0)
+  L_00 = cop%L(0,0)
+  L_0p = cop%L(0,1+n)
+  L_pI = cop%L(1+n,1:n)
+  L_0I = cop%L(0,  1:n)
+  L_Ip = cop%L(1:n,1+n)
+  L_I0 = cop%L(1:n,0  )
+
+  L_0I_S  = cop%L_0I_S
+  L_pI_S  = cop%L_pI_S
+  S_T_LI0 = cop%S_T_L_I0
+  S_T_LIp = cop%S_T_L_Ip
+
+  eig = cop%eig
+
+  call EdgeToVertex(n,n_element,M_00,L_0I_S,L_pI_S,  d,u_e,r_v)
+  call EdgeToEdge  (n,n_element,M_00,L_00,L_0p,eig,  d,u_e,  r_e)
+  call EdgeToFace  (n,n_element,M_00,S_T_LI0,S_T_LIp,d,u_e,  r_f)
+
+end subroutine AddEdgeContribution
+
+!-------------------------------------------------------------------------------
+!> \brief  Add the vertices' contribution to the primary part
+!> \author Immo Huismann
+
+subroutine AddVertexContribution(cop,d,u_v,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop !< condensed 1D operators
+  real(RDP), intent(in)    :: d(0:,:)    !< metric coefficients  (0:3, n_e)
+  real(RDP), intent(in)    :: u_v( :,:)  !< values on vertices   ( 8,n_e)
+  real(RDP), intent(inout) :: r_e( :,:,:) !< residual on edges    ( n,12,n_e)
+  real(RDP), intent(inout) :: r_v( :,:)  !< residual on vertices ( 8,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+  real(RDP) :: S_T_L_I0(size(cop%S,1))
+  real(RDP) :: S_T_L_Ip(size(cop%S,1))
+
+  n         = size(r_e,1)
+  n_element = size(r_e,3)
+
+  M_00 = cop%w(0)
+  L_00 = cop%L(0,0)
+  L_0p = cop%L(0,1+n)
+
+  S_T_L_I0 = cop%S_T_L_I0
+  S_T_L_Ip = cop%S_T_L_Ip
+
+  call VertexToVertex(n_element,M_00,L_00,L_0p,          d,u_v,r_v)
+  call VertexToEdge(n,n_element,M_00,S_T_L_I0,S_T_L_Ip,  d,u_v,  r_e)
+
+end subroutine AddVertexContribution
+
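! -----------------------------------------------------------------------------
! Editor's aside, not part of this patch: the three wrappers above share one
! pattern -- hoist the scalar 1D couplings M_00, L_00, L_0p out of the derived
! type, then dispatch to explicit-shape kernels. The self-contained sketch
! below shows the 1D analogue of the coupling those kernels apply; RDP is
! redefined locally (the Kind_Parameters module is not available here) and all
! numeric values are arbitrary.

program primary_part_1d_sketch
  implicit none
  integer, parameter :: RDP = selected_real_kind(15, 307)
  real(RDP) :: M_00, L_00, L_0p ! 1D mass/stiffness boundary couplings
  real(RDP) :: d0, d1           ! Helmholtz parameter and metric factor
  real(RDP) :: u_0, u_p         ! values at the two condensed boundary points
  real(RDP) :: r_0, r_p         ! residual at the two condensed boundary points

  M_00 = 0.5_RDP;  L_00 = 2.0_RDP;  L_0p = -1.0_RDP
  d0   = 1.0_RDP;  d1   = 0.25_RDP
  u_0  = 1.0_RDP;  u_p  = 3.0_RDP
  r_0  = 0.0_RDP;  r_p  = 0.0_RDP

  ! Same structure as the vertex and edge kernels below: a diagonal term
  ! (d0 * mass + d1 * stiffness) plus a cross coupling to the opposite
  ! boundary point through L_0p.
  r_0 = r_0 + (d0 * M_00 + d1 * L_00) * u_0 + d1 * L_0p * u_p
  r_p = r_p + (d0 * M_00 + d1 * L_00) * u_p + d1 * L_0p * u_0

  print *, 'r_0 =', r_0, ' r_p =', r_p
end program primary_part_1d_sketch
! -----------------------------------------------------------------------------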
+!=============================================================================== +! Particular terms + +!------------------------------------------------------------------------------- +!> \brief Face to face term +!> \author Immo Huismann + +subroutine FaceToFace(n,n_element,M_00,L_00,L_0p,eig,d,u_f,r_f) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of elements + real(RDP), intent(in) :: M_00 !< M_{00} + real(RDP), intent(in) :: L_00 !< L_{00} + real(RDP), intent(in) :: L_0p !< L_{0p} + real(RDP), intent(in) :: eig(n) !< gen. eigenvalues of L_{II} + real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients + + real(RDP), intent(in) :: u_f(n,n,N_FACES,n_element) !< values on faces + real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< result on edges + + integer :: i,j,e + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_f) + !$acc loop + !$omp do private(i,j,e) + do e = 1, n_element + !........................................................................... + ! x_1 faces + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,1,e) = r_f(i,j,1,e) & + + d(0,e) * M_00 * u_f(i,j,1,e) & + + d(1,e) * L_00 * u_f(i,j,1,e) & + + d(2,e) * M_00 * eig(i) * u_f(i,j,1,e) & + + d(3,e) * M_00 * eig(j) * u_f(i,j,1,e) & + + d(1,e) * L_0p * u_f(i,j,2,e) + + r_f(i,j,2,e) = r_f(i,j,2,e) & + + d(0,e) * M_00 * u_f(i,j,2,e) & + + d(1,e) * L_00 * u_f(i,j,2,e) & + + d(2,e) * M_00 * eig(i) * u_f(i,j,2,e) & + + d(3,e) * M_00 * eig(j) * u_f(i,j,2,e) & + + d(1,e) * L_0p * u_f(i,j,1,e) + end do + end do + !$acc end loop + + !........................................................................... + ! x_2 faces + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,3,e) = r_f(i,j,3,e) & + + d(0,e) * M_00 * u_f(i,j,3,e) & + + d(2,e) * L_00 * u_f(i,j,3,e) & + + d(1,e) * M_00 * eig(i) * u_f(i,j,3,e) & + + d(3,e) * M_00 * eig(j) * u_f(i,j,3,e) & + + d(2,e) * L_0p * u_f(i,j,4,e) + + r_f(i,j,4,e) = r_f(i,j,4,e) & + + d(0,e) * M_00 * u_f(i,j,4,e) & + + d(2,e) * L_00 * u_f(i,j,4,e) & + + d(1,e) * M_00 * eig(i) * u_f(i,j,4,e) & + + d(3,e) * M_00 * eig(j) * u_f(i,j,4,e) & + + d(2,e) * L_0p * u_f(i,j,3,e) + end do + end do + !$acc end loop + + !........................................................................... + ! 
x_3 faces + !$acc loop collapse(2) + do j = 1, n + do i = 1, n + r_f(i,j,5,e) = r_f(i,j,5,e) & + + d(0,e) * M_00 * u_f(i,j,5,e) & + + d(3,e) * L_00 * u_f(i,j,5,e) & + + d(1,e) * M_00 * eig(i) * u_f(i,j,5,e) & + + d(2,e) * M_00 * eig(j) * u_f(i,j,5,e) & + + d(3,e) * L_0p * u_f(i,j,6,e) + + r_f(i,j,6,e) = r_f(i,j,6,e) & + + d(0,e) * M_00 * u_f(i,j,6,e) & + + d(3,e) * L_00 * u_f(i,j,6,e) & + + d(1,e) * M_00 * eig(i) * u_f(i,j,6,e) & + + d(2,e) * M_00 * eig(j) * u_f(i,j,6,e) & + + d(3,e) * L_0p * u_f(i,j,5,e) + end do + end do + !$acc end loop + + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine FaceToFace + +!------------------------------------------------------------------------------- +!> \brief Face to edge term +!> \author Immo Huismann + +subroutine FaceToEdge(n,n_element,M_00,L_0I_S,L_pI_S,d,u_f,r_e) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of elements + real(RDP), intent(in) :: M_00 !< M_{00} + real(RDP), intent(in) :: L_0I_S(n) !< S^T L_{I0} + real(RDP), intent(in) :: L_pI_S(n) !< S^T L_{Ip} + real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients + + real(RDP), intent(in) :: u_f(n,n,N_FACES,n_element) !< values on faces + real(RDP), intent(inout) :: r_e( n,N_EDGES,n_element) !< result on edges + + real(RDP) :: M_00_L_0I_S(n) !< M_00 L_{I0} S^T + real(RDP) :: M_00_L_pI_S(n) !< M_00 L_{Ip} S^T + + integer :: i,j,e + + M_00_L_0I_S = M_00 * L_0I_S + M_00_L_pI_S = M_00 * L_pI_S + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_e) + !$acc loop collapse(2) + !$omp do private(i,j,e) + do e = 1, n_element + do i = 1, n + do j = 1, n + + r_e(i, 1,e) = r_e(i , 1,e) & + + d(2,e) * M_00_L_0I_S(j) * u_f(i,j, 5,e) & + + d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 3,e) + + r_e(i, 2,e) = r_e(i , 2,e) & + + d(2,e) * M_00_L_pI_S(j) * u_f(i,j, 5,e) & + + d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 4,e) + + r_e(i, 3,e) = r_e(i , 3,e) & + + d(2,e) * M_00_L_0I_S(j) * u_f(i,j, 6,e) & + + d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 3,e) + + r_e(i, 4,e) = r_e(i , 4,e) & + + d(2,e) * M_00_L_pI_S(j) * u_f(i,j, 6,e) & + + d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 4,e) + + r_e(i, 5,e) = r_e(i , 5,e) & + + d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 1,e) & + + d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 5,e) + + r_e(i, 6,e) = r_e(i , 6,e) & + + d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 2,e) & + + d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 5,e) + + r_e(i, 7,e) = r_e(i , 7,e) & + + d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 1,e) & + + d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 6,e) + + r_e(i, 8,e) = r_e(i , 8,e) & + + d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 2,e) & + + d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 6,e) + + r_e(i, 9,e) = r_e(i , 9,e) & + + d(2,e) * M_00_L_0I_S(j) * u_f(j,i, 1,e) & + + d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 3,e) + + r_e(i,10,e) = r_e(i ,10,e) & + + d(2,e) * M_00_L_0I_S(j) * u_f(j,i, 2,e) & + + d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 3,e) + + r_e(i,11,e) = r_e(i ,11,e) & + + d(2,e) * M_00_L_pI_S(j) * u_f(j,i, 1,e) & + + d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 4,e) + + r_e(i,12,e) = r_e(i ,12,e) & + + d(2,e) * M_00_L_pI_S(j) * u_f(j,i, 2,e) & + + d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 4,e) + + end do + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine FaceToEdge + +!------------------------------------------------------------------------------- +!> \brief Primary part for edge -> face +!> \author Immo Huismann + +subroutine EdgeToFace(n,n_element,M_00,S_T_LI0,S_T_LIp,d,u_e,r_f) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of 
elements + real(RDP), intent(in) :: M_00 !< M_{00} + real(RDP), intent(in) :: S_T_LI0(n) !< S^T L_{I0} + real(RDP), intent(in) :: S_T_LIp(n) !< S^T L_{Ip} + real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients + + real(RDP), intent(in) :: u_e( n,N_EDGES,n_element) !< values on edges + real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< result on faces + + real(RDP) :: M_00_S_T_LI0(n) !< M_00 S^T L_{I0} + real(RDP) :: M_00_S_T_LIp(n) !< M_00 S^T L_{Ip} + + integer :: i,j,e + + M_00_S_T_LI0 = M_00 * S_T_LI0 + M_00_S_T_LIp = M_00 * S_T_LIp + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_f) + !$acc loop collapse(3) + !$omp do private(i,j,e) + do e = 1, n_element + do j = 1, n + do i = 1, n + + r_f(i,j,1,e) = r_f(i,j, 1,e) & + + d(3,e) * M_00_S_T_LI0( j) * u_e(i, 5,e) & + + d(3,e) * M_00_S_T_LIp( j) * u_e(i, 7,e) & + + d(2,e) * M_00_S_T_LI0(i ) * u_e( j, 9,e) & + + d(2,e) * M_00_S_T_LIp(i ) * u_e( j,11,e) + + r_f(i,j,2,e) = r_f(i,j, 2,e) & + + d(3,e) * M_00_S_T_LI0( j) * u_e(i, 6,e) & + + d(3,e) * M_00_S_T_LIp( j) * u_e(i, 8,e) & + + d(2,e) * M_00_S_T_LI0(i ) * u_e( j,10,e) & + + d(2,e) * M_00_S_T_LIp(i ) * u_e( j,12,e) + + r_f(i,j,3,e) = r_f(i,j, 3,e) & + + d(3,e) * M_00_S_T_LI0( j) * u_e(i, 1,e) & + + d(3,e) * M_00_S_T_LIp( j) * u_e(i, 3,e) & + + d(1,e) * M_00_S_T_LI0(i ) * u_e( j, 9,e) & + + d(1,e) * M_00_S_T_LIp(i ) * u_e( j,10,e) + + r_f(i,j,4,e) = r_f(i,j, 4,e) & + + d(3,e) * M_00_S_T_LI0( j) * u_e(i, 2,e) & + + d(3,e) * M_00_S_T_LIp( j) * u_e(i, 4,e) & + + d(1,e) * M_00_S_T_LI0(i ) * u_e( j,11,e) & + + d(1,e) * M_00_S_T_LIp(i ) * u_e( j,12,e) + + r_f(i,j,5,e) = r_f(i,j, 5,e) & + + d(2,e) * M_00_S_T_LI0( j) * u_e(i, 1,e) & + + d(2,e) * M_00_S_T_LIp( j) * u_e(i, 2,e) & + + d(1,e) * M_00_S_T_LI0(i ) * u_e( j, 5,e) & + + d(1,e) * M_00_S_T_LIp(i ) * u_e( j, 6,e) + + r_f(i,j,6,e) = r_f(i,j, 6,e) & + + d(2,e) * M_00_S_T_LI0( j) * u_e(i, 3,e) & + + d(2,e) * M_00_S_T_LIp( j) * u_e(i, 4,e) & + + d(1,e) * M_00_S_T_LI0(i ) * u_e( j, 7,e) & + + d(1,e) * M_00_S_T_LIp(i ) * u_e( j, 8,e) + + end do + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine EdgeToFace + +!------------------------------------------------------------------------------- +!> \brief Edge to edge interaction +!> \author Immo Huismann + +subroutine EdgeToEdge(n,n_element,M_00,L_00,L_0p,eig,d,u_e,r_e) + integer, intent(in) :: n !< points per edge + integer, intent(in) :: n_element !< number of elements + real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficient * lambda + real(RDP), intent(in) :: M_00 !< M_{00} + real(RDP), intent(in) :: L_00 !< L_{00} + real(RDP), intent(in) :: L_0p !< L_{0p} + real(RDP), intent(in) :: eig(n) !< eigenvalues of L_II + + real(RDP), intent(in) :: u_e(n,N_EDGES,n_element) !< values on edges + real(RDP), intent(inout) :: r_e(n,N_EDGES,n_element) !< result on edges + + integer :: i, e + + real(RDP) :: f_0, f_1, f_2 + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e) + !$acc loop gang worker vector + !$omp do private(i,e,f_0,f_1,f_2) + do e = 1, n_element + !........................................................................... + ! x_1 edges + !$acc loop vector + do i = 1, n + + f_0 = d(0,e) * M_00 * M_00 & ! lambda * M * M * M + + d(1,e) * M_00 * M_00 * eig(i) & ! M * M * L + + d(2,e) * L_00 * M_00 & ! M * L * M + + d(3,e) * M_00 * L_00 ! L * M * M + + f_1 = d(2,e) * M_00 * L_0p ! M * L * M + f_2 = d(3,e) * M_00 * L_0p ! 
L * M * M + + r_e(i, 1,e) = r_e(i, 1,e) & + + f_0 * u_e(i, 1,e) & + + f_1 * u_e(i, 2,e) & + + f_2 * u_e(i, 3,e) + + r_e(i, 2,e) = r_e(i, 2,e) & + + f_0 * u_e(i, 2,e) & + + f_1 * u_e(i, 1,e) & + + f_2 * u_e(i, 4,e) + + r_e(i, 3,e) = r_e(i, 3,e) & + + f_0 * u_e(i, 3,e) & + + f_1 * u_e(i, 4,e) & + + f_2 * u_e(i, 1,e) + + r_e(i, 4,e) = r_e(i, 4,e) & + + f_0 * u_e(i, 4,e) & + + f_1 * u_e(i, 3,e) & + + f_2 * u_e(i, 2,e) + end do + !$acc end loop + + !............................................................................. + ! x_2 edges + !$acc loop vector + do i = 1, n + f_0 = d(0,e) * M_00 * M_00 & ! lambda * M * M * M + + d(2,e) * M_00 * M_00 * eig(i) & ! M * M * L + + d(1,e) * L_00 * M_00 & ! M * L * M + + d(3,e) * M_00 * L_00 ! L * M * M + + f_1 = d(1,e) * M_00 * L_0p ! M * L * M + f_2 = d(3,e) * M_00 * L_0p ! L * M * M + + r_e(i, 5,e) = r_e(i, 5,e) & + + f_0 * u_e(i, 5,e) & + + f_1 * u_e(i, 6,e) & + + f_2 * u_e(i, 7,e) + + r_e(i, 6,e) = r_e(i, 6,e) & + + f_0 * u_e(i, 6,e) & + + f_1 * u_e(i, 5,e) & + + f_2 * u_e(i, 8,e) + + r_e(i, 7,e) = r_e(i, 7,e) & + + f_0 * u_e(i, 7,e) & + + f_1 * u_e(i, 8,e) & + + f_2 * u_e(i, 5,e) + + r_e(i, 8,e) = r_e(i, 8,e) & + + f_0 * u_e(i, 8,e) & + + f_1 * u_e(i, 7,e) & + + f_2 * u_e(i, 6,e) + end do + !$acc end loop + + !............................................................................. + ! x_3 edges + !$acc loop vector + do i = 1, n + f_0 = d(0,e) * M_00 * M_00 & ! lambda * M * M * M + + d(3,e) * M_00 * M_00 * eig(i) & ! M * M * L + + d(1,e) * L_00 * M_00 & ! M * L * M + + d(2,e) * M_00 * L_00 ! L * M * M + + f_1 = d(1,e) * M_00 * L_0p ! M * L * M + f_2 = d(2,e) * M_00 * L_0p ! L * M * M + + r_e(i, 9,e) = r_e(i, 9,e) & + + f_0 * u_e(i, 9,e) & + + f_1 * u_e(i,10,e) & + + f_2 * u_e(i,11,e) + + r_e(i,10,e) = r_e(i,10,e) & + + f_0 * u_e(i,10,e) & + + f_1 * u_e(i, 9,e) & + + f_2 * u_e(i,12,e) + + r_e(i,11,e) = r_e(i,11,e) & + + f_0 * u_e(i,11,e) & + + f_1 * u_e(i,12,e) & + + f_2 * u_e(i, 9,e) + + r_e(i,12,e) = r_e(i,12,e) & + + f_0 * u_e(i,12,e) & + + f_1 * u_e(i,11,e) & + + f_2 * u_e(i,10,e) + + end do + !$acc end loop + + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine EdgeToEdge + +!------------------------------------------------------------------------------- +!> \brief Edge to vertex interaction +!> \author Immo Huismann + +subroutine EdgeToVertex(n,n_element,M_00,L_0I_S,L_pI_S,d,u_e,r_v) + integer, intent(in) :: n + integer, intent(in) :: n_element + real(RDP), intent(in) :: M_00 + real(RDP), intent(in) :: L_0I_S(n) + real(RDP), intent(in) :: L_pI_S(n) + real(RDP), intent(in) :: d(0:3, n_element) + real(RDP), intent(in) :: u_e(n,N_EDGES, n_element) + real(RDP), intent(inout) :: r_v( N_VERTICES,n_element) + + integer :: i, e + real(RDP) :: tmp_1,tmp_2, factor + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_v) + !$acc loop gang worker vector + !$omp do private(i,e,tmp_1,tmp_2,factor) + do e = 1, n_element + + !--------------------------------------------------------------------------- + ! x_1 edges + + factor = M_00 * M_00 * d(1,e) + + !........................................................................... + ! first one + tmp_1 = 0 + tmp_2 = 0 + + !$acc loop vector reduction(+:tmp_1,tmp_2) + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 1,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 1,e) + end do + !$acc end loop + + r_v(1,e) = r_v(1,e) + tmp_1 * factor + r_v(2,e) = r_v(2,e) + tmp_2 * factor + + !........................................................................... + ! 
second one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 2,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 2,e) + end do + + r_v(3,e) = r_v(3,e) + tmp_1 * factor + r_v(4,e) = r_v(4,e) + tmp_2 * factor + + !........................................................................... + ! third one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 3,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 3,e) + end do + + r_v(5,e) = r_v(5,e) + tmp_1 * factor + r_v(6,e) = r_v(6,e) + tmp_2 * factor + + !........................................................................... + ! Fourth one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 4,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 4,e) + end do + + r_v(7,e) = r_v(7,e) + tmp_1 * factor + r_v(8,e) = r_v(8,e) + tmp_2 * factor + + + !--------------------------------------------------------------------------- + ! x_2 edges + + factor = M_00 * M_00 * d(2,e) + + !........................................................................... + ! first one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 5,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 5,e) + end do + + r_v(1,e) = r_v(1,e) + tmp_1 * factor + r_v(3,e) = r_v(3,e) + tmp_2 * factor + + !........................................................................... + ! second one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 6,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 6,e) + end do + + r_v(2,e) = r_v(2,e) + tmp_1 * factor + r_v(4,e) = r_v(4,e) + tmp_2 * factor + + !........................................................................... + ! third one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 7,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 7,e) + end do + + r_v(5,e) = r_v(5,e) + tmp_1 * factor + r_v(7,e) = r_v(7,e) + tmp_2 * factor + + !........................................................................... + ! Fourth one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 8,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 8,e) + end do + + r_v(6,e) = r_v(6,e) + tmp_1 * factor + r_v(8,e) = r_v(8,e) + tmp_2 * factor + + !--------------------------------------------------------------------------- + ! x_3 edges + + factor = M_00 * M_00 * d(3,e) + + !........................................................................... + ! first one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 9,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 9,e) + end do + + r_v(1,e) = r_v(1,e) + tmp_1 * factor + r_v(5,e) = r_v(5,e) + tmp_2 * factor + + !........................................................................... + ! second one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,10,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,10,e) + end do + + r_v(2,e) = r_v(2,e) + tmp_1 * factor + r_v(6,e) = r_v(6,e) + tmp_2 * factor + + !........................................................................... + ! third one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,11,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,11,e) + end do + + r_v(3,e) = r_v(3,e) + tmp_1 * factor + r_v(7,e) = r_v(7,e) + tmp_2 * factor + + !........................................................................... + ! 
Fourth one + tmp_1 = 0 + tmp_2 = 0 + + do i = 1, n + tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,12,e) + tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,12,e) + end do + + r_v(4,e) = r_v(4,e) + tmp_1 * factor + r_v(8,e) = r_v(8,e) + tmp_2 * factor + + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine EdgeToVertex + +!------------------------------------------------------------------------------- +!> \brief Vertex to edge interaction +!> \author Immo Huismann + +subroutine VertexToEdge(n,n_element,M_00,S_T_L_I0,S_T_L_Ip,d,u_v,r_e) + integer, intent(in) :: n + integer, intent(in) :: n_element + real(RDP), intent(in) :: M_00 + real(RDP), intent(in) :: S_T_L_I0(n) + real(RDP), intent(in) :: S_T_L_Ip(n) + real(RDP), intent(in) :: d(0:3, n_element) + real(RDP), intent(in) :: u_v( N_VERTICES,n_element) + real(RDP), intent(inout) :: r_e(n,N_EDGES, n_element) + + integer :: i, e + real(RDP) :: tmp, factor + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_e) + !$acc loop + !$omp do private(i,e,tmp,factor) + do e = 1, n_element + !........................................................................... + ! x_1 edges + factor = M_00 * M_00 * d(1,e) + + do i = 1, n + tmp = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(2,e) + r_e(i, 1,e) = r_e(i, 1,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(3,e) + S_T_L_Ip(i) * u_v(4,e) + r_e(i, 2,e) = r_e(i, 2,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(5,e) + S_T_L_Ip(i) * u_v(6,e) + r_e(i, 3,e) = r_e(i, 3,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(7,e) + S_T_L_Ip(i) * u_v(8,e) + r_e(i, 4,e) = r_e(i, 4,e) + factor * tmp + end do + + !........................................................................... + ! x_2 edges + factor = M_00 * M_00 * d(2,e) + + do i = 1, n + tmp = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(3,e) + r_e(i, 5,e) = r_e(i, 5,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(2,e) + S_T_L_Ip(i) * u_v(4,e) + r_e(i, 6,e) = r_e(i, 6,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(5,e) + S_T_L_Ip(i) * u_v(7,e) + r_e(i, 7,e) = r_e(i, 7,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(6,e) + S_T_L_Ip(i) * u_v(8,e) + r_e(i, 8,e) = r_e(i, 8,e) + factor * tmp + end do + + !........................................................................... + ! 
x_3 edges + factor = M_00 * M_00 * d(3,e) + + do i = 1, n + tmp = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(5,e) + r_e(i, 9,e) = r_e(i, 9,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(2,e) + S_T_L_Ip(i) * u_v(6,e) + r_e(i,10,e) = r_e(i,10,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(3,e) + S_T_L_Ip(i) * u_v(7,e) + r_e(i,11,e) = r_e(i,11,e) + factor * tmp + end do + do i = 1, n + tmp = S_T_L_I0(i) * u_v(4,e) + S_T_L_Ip(i) * u_v(8,e) + r_e(i,12,e) = r_e(i,12,e) + factor * tmp + end do + end do + !$omp end do + !$acc end loop + !$acc end parallel + +end subroutine VertexToEdge + +!------------------------------------------------------------------------------- +!> \brief Primary part vertex to vertex in transformed condensed system +!> \author Immo Huismann + +subroutine VertexToVertex(n_element,M_00,L_00,L_0p,d,u_v,r_v) + integer, intent(in) :: n_element + real(RDP), intent(in) :: M_00 + real(RDP), intent(in) :: L_00 + real(RDP), intent(in) :: L_0p + real(RDP), intent(in) :: d (0:3,n_element) + real(RDP), intent(in) :: u_v(N_VERTICES,n_element) !< values on vertices + real(RDP), intent(inout) :: r_v(N_VERTICES,n_element) !< result on vertices + + integer :: e + real(RDP) :: M_00_M_00_M_00 + real(RDP) :: L_00_M_00_M_00 + real(RDP) :: L_0p_M_00_M_00 + + M_00_M_00_M_00 = M_00 * M_00 * M_00 + L_00_M_00_M_00 = L_00 * M_00 * M_00 + L_0p_M_00_M_00 = L_0p * M_00 * M_00 + + !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_v) + !$acc loop + !$omp do private(e) + do e = 1, n_element + + r_v(1,e) = r_v(1,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(1,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(1,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(2,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(3,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(5,e) + + r_v(2,e) = r_v(2,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(2,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(2,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(1,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(4,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(6,e) + + r_v(3,e) = r_v(3,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(3,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(3,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(4,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(1,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(7,e) + + r_v(4,e) = r_v(4,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(4,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(4,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(3,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(2,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(8,e) + + r_v(5,e) = r_v(5,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(5,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(5,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(6,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(7,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(1,e) + + r_v(6,e) = r_v(6,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(6,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(6,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(5,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(8,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(2,e) + + r_v(7,e) = r_v(7,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(7,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(7,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(8,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(5,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(3,e) + + r_v(8,e) = r_v(8,e) & + + M_00_M_00_M_00 * d(0,e) * u_v(8,e) & + + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(8,e) & + + L_0p_M_00_M_00 * d(1,e) * u_v(7,e) & + + L_0p_M_00_M_00 * d(2,e) * u_v(6,e) & + + L_0p_M_00_M_00 * d(3,e) * u_v(4,e) + + end do + !$omp 
end do + !$acc end loop + !$acc end parallel + +end subroutine VertexToVertex + +!=============================================================================== + +end module Transformed_Primary_Part diff --git a/Parser/test/org/tud/forty/test/FragmentTest.java b/Parser/test/org/tud/forty/test/FragmentTest.java index 5512267..30f4569 100644 --- a/Parser/test/org/tud/forty/test/FragmentTest.java +++ b/Parser/test/org/tud/forty/test/FragmentTest.java @@ -2,6 +2,8 @@ package org.tud.forty.test; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.tud.forty.ast.ExecutableConstruct; +import org.tud.forty.ast.ExecutionPartConstruct; import org.tud.forty.ast.Expr; import java.io.File; @@ -14,9 +16,24 @@ public class FragmentTest extends TestBase { return ruleProvider("test-data/fragments/Expr"); } + @DataProvider(name = "executableconstruct") + public static Iterator<Object[]> fortranExecutableConstructProvider() { + return ruleProvider("test-data/fragments/ExecutableConstruct"); + } + @Test(dataProvider = "exprs") public void testFragmentExprParser(File f) throws Exception { testParse(f, false, true, false, true, Expr.class); } + @Test(dataProvider = "executableconstruct") + public void testFragmentExecutableConstructParser(File f) throws Exception { + testParse(f, false, true, false, true, ExecutableConstruct.class, true); + } + + @Test(dataProvider = "executableconstruct") + public void testFragmentExecutionPartConstructParser(File f) throws Exception { + testParse(f, false, true, false, true, ExecutionPartConstruct.class, true); + } + } diff --git a/Parser/test/org/tud/forty/test/NASTest.java b/Parser/test/org/tud/forty/test/NASTest.java new file mode 100644 index 0000000..3349276 --- /dev/null +++ b/Parser/test/org/tud/forty/test/NASTest.java @@ -0,0 +1,37 @@ +package org.tud.forty.test; + +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.tud.forty.ast.Root; + +import java.io.File; +import java.util.Iterator; + +public class NASTest extends TestBase { + + + @DataProvider(name = "nas") + public static Iterator<Object[]> fortranNasProvider() { + return ruleProvider("test-data/nas"); + } + + @Test(dataProvider = "nas", groups = {"parser"}) + public void testNASParser(File f) throws Exception { + testParse(f, false, true, false, true, Root.class); + } + + @Test(dataProvider = "nas", groups = {"prettyprinter"}) + public void testNASPrettyPrinter(File f) throws Exception { + testParse(f, true, false, false, true, Root.class); + } + + @Test(dataProvider = "nas", groups = {"compare"}) + public void testCompareNASPrettyPrinter(File f) throws Exception { + testParse(f, false, false, true, true, Root.class); + } + + @Test(dataProvider = "nas", groups = {"compare"}) + public void testCompareNASPrettyPrinterWithSpaces(File f) throws Exception { + testParse(f, false, false, true, false, Root.class); + } +} diff --git a/Parser/test/org/tud/forty/test/SlotTest.java b/Parser/test/org/tud/forty/test/SlotTest.java index ccf9d0d..228ce73 100644 --- a/Parser/test/org/tud/forty/test/SlotTest.java +++ b/Parser/test/org/tud/forty/test/SlotTest.java @@ -38,4 +38,5 @@ public class SlotTest extends TestBase { public void testSlotsName(File f) throws Exception { testParse(f, false, false, true, false, Root.class); } + } diff --git a/Parser/test/org/tud/forty/test/SpechtTest.java b/Parser/test/org/tud/forty/test/SpechtTest.java new file mode 100644 index 0000000..1bc787b --- /dev/null +++ b/Parser/test/org/tud/forty/test/SpechtTest.java 
@@ -0,0 +1,42 @@ +package org.tud.forty.test; + +import java.io.File; +import java.util.Iterator; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.tud.forty.ast.Root; + +public class SpechtTest extends TestBase { + + + @DataProvider(name = "specht") + public static Iterator<Object[]> fortranNasProvider() { + return ruleProvider("test-data/specht"); + } + + @Test(dataProvider = "specht", groups = {"parser"}) + public void testSpechtParser(File f) throws Exception { + testParse(f, false, true, false, true, Root.class); + } + + @Test(dataProvider = "specht", groups = {"prettyprinter"}) + public void testSpechtPrettyPrinter(File f) throws Exception { + testParse(f, true, false, false, true, Root.class); + } + + @Test(dataProvider = "specht", groups = {"compare"}) + public void testCompareSpechtPrettyPrinter(File f) throws Exception { + testParse(f, false, false, true, true, Root.class); + } + + @Test(dataProvider = "specht", groups = {"compare"}) + public void testCompareSpechtPrettyPrinterWithSpaces(File f) throws Exception { + testParse(f, false, false, true, false, Root.class); + } + + @Test(dataProvider = "specht", groups = {"compare"}) + public void testParsePrintParseSpecht(File f) throws Exception { + testParsePrintParse(f); + } + +} diff --git a/Parser/test/org/tud/forty/test/TestBase.java b/Parser/test/org/tud/forty/test/TestBase.java index f9fe437..e5d96cf 100644 --- a/Parser/test/org/tud/forty/test/TestBase.java +++ b/Parser/test/org/tud/forty/test/TestBase.java @@ -1,16 +1,11 @@ package org.tud.forty.test; import org.testng.Assert; -import org.tud.forty.ast.ASTNode; -import org.tud.forty.ast.Expr; -import org.tud.forty.ast.PrettyPrinter; -import org.tud.forty.ast.Root; +import org.tud.forty.ast.*; import org.tud.forty.parser.SlottableFortranParser; import xtc.parser.Result; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; +import java.io.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -59,42 +54,120 @@ public class TestBase { System.out.println(sb.toString()); } - protected void testParse(File f, boolean pp, boolean ast, boolean compare, boolean ignore_spaces, Class clazz) throws Exception { + private String trimCode(String code) { + System.out.println(">>" + code + "<<"); + String lastLine = code.split("\n")[code.split("\n").length-1]; + code = code.replaceAll("\n" + lastLine + " ","\n"); + + code = code.replaceAll("^\\p{Blank}*\n", ""); + code = code.replaceAll("\n\\p{Blank}*$", ""); + System.out.println(">>" + code + "<<"); + code = code.trim() + "\n"; + System.out.println(">>" + code + "<<"); + return code; + } + + protected void testSanitize(File f) throws Exception { Assert.assertTrue(f.exists()); - FileReader reader = new FileReader(f); + Reader reader; + + FileReader t = new FileReader(f); + StringBuilder sw = new StringBuilder(); + int i; + while ((i = t.read()) != -1) { + sw.append((char)i); + } + Assert.assertEquals(sw.toString(),SlottableFortranParser.removeLineBreaks(sw.toString())); + + } + + protected void testParse(File f, boolean pp, boolean ast, boolean compare, boolean ignore_spaces, Class clazz) throws Exception { + testParse(f, pp, ast, compare, ignore_spaces, clazz, false); + } + + private String reprint(Reader reader) throws Exception { + SlottableFortranParser parser = SlottableFortranParser.createParser(reader, ""); + Result r = parser.proot(0); + if (!r.hasValue()) { + System.out.println(parser.format(r.parseError())); + if 
(r.parseError().index > 0) { + Assert.assertTrue(r.hasValue(), "ParseError:" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); + } else { + Assert.assertTrue(r.hasValue(), "ParseError:" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); + } + } + PrettyPrinter pp = new PrettyPrinter(" ", false); + ((ASTNode) r.semanticValue()).prettyPrint(pp); + return pp.toString(); + } + + protected void testParsePrintParse(File file) throws Exception { + FileReader firstReader = new FileReader(file); + String print1 = reprint(firstReader); + //System.out.println(print1); + StringReader secondReader = new StringReader(print1); + String print2 = reprint(secondReader); + Assert.assertEquals(print2,print1); + } + + protected void testParse(File file, boolean testPrettyPrinter, boolean printAst, boolean compareOutput, boolean compareOutputIgnoringSpaces, Class clazz, boolean trim) throws Exception { + Assert.assertTrue(file.exists()); + + Reader reader; + + if (trim) { + FileReader t = new FileReader(file); + StringWriter sw = new StringWriter(); + int i; + while ((i = t.read()) != -1) { + sw.append((char)i); + } + reader = new StringReader(trimCode(sw.toString())); + } else { + reader = new FileReader(file); + } + + + SlottableFortranParser parser = SlottableFortranParser.createParser(reader, ""); Result r; if(clazz.equals(Expr.class)) { r = parser.pexpr(0); + } else if (clazz.equals(ExecutableConstruct.class)) { + r = parser.pexecutable_construct(0); + } else if (clazz.equals(ExecutionPartConstruct.class)) { + r = parser.pexecution_part_construct(0); } else { r = parser.proot(0); } if (!r.hasValue()) { if (r.parseError().index > 0) { - Assert.assertTrue(r.hasValue(), "ParseError in " + f.getName() + ":" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); + Assert.assertTrue(r.hasValue(), "ParseError in " + file.getName() + ":" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); } else { - Assert.assertTrue(r.hasValue(), "ParseError in " + f.getName() + ":" + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); + Assert.assertTrue(r.hasValue(), "ParseError in " + file.getName() + ":" + ":(" + r.parseError().index + ") " + r.parseError().msg + "."); } - } - if (pp || compare) { + ((ASTNode) r.semanticValue()).fixTreeStructure(); + ((ASTNode) r.semanticValue()).checkTreeStructure(); + + if (testPrettyPrinter || compareOutput) { PrettyPrinter s = new PrettyPrinter(" ", false); ((ASTNode) r.semanticValue()).prettyPrint(s); int ln = 1; - if (pp) { + if (testPrettyPrinter) { System.out.println("\n==============================================="); - System.out.println("== " + f.getName()); + System.out.println("== " + file.getName()); System.out.println("==============================================="); for (String sl : s.toString().split("\n")) { System.out.println(String.format("%03d ", ln++) + sl); } } - if (compare) { + if (compareOutput) { reader.close(); - reader = new FileReader(f); + reader = new FileReader(file); BufferedReader br = new BufferedReader(reader); StringBuffer sb = new StringBuffer(); @@ -104,17 +177,21 @@ public class TestBase { sb.append(line).append("\n"); } - if (ignore_spaces) { - Assert.assertEquals(trim(s.toString()), trim(sb.toString())); + if (compareOutputIgnoringSpaces) { + Assert.assertEquals(trim(s.toString()), 
SlottableFortranParser.removeLineBreaks(trim(sb.toString()))); } else { Assert.assertEquals(s.toString(), sb.toString()); } } } - if (ast) { + if (printAst) { System.out.println("==============================================="); if(clazz.equals(Expr.class)) { System.out.println(((Expr) r.semanticValue()).getAstStringWithSource()); + } else if(clazz.equals(ExecutableConstruct.class)) { + System.out.println(((ExecutableConstruct) r.semanticValue()).getAstStringWithSource()); + } else if(clazz.equals(ExecutionPartConstruct.class)) { + System.out.println(((ExecutionPartConstruct) r.semanticValue()).getAstStringWithSource()); } else { System.out.println(((Root) r.semanticValue()).getAstStringWithSource()); } -- GitLab
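
Editor's note, not part of the patch: several comparison tests above pass the
input through SlottableFortranParser.removeLineBreaks (after trimming) before
asserting equality with the pretty-printed output, which suggests that
continued statements are normalized onto a single line before comparison. The
illustrative program below (hypothetical free-form source, arbitrary values)
shows the kind of '&' continuation such a comparison has to tolerate:

program continuation_example
  implicit none
  real :: a(2), b(2), total
  a = [1.0, 2.0]
  b = [3.0, 4.0]
  ! A statement split across lines with '&'; a pretty printer may rejoin it,
  ! so a byte-exact comparison would fail without normalizing line breaks.
  total = a(1) * b(1) &
        + a(2) * b(2)
  print *, total
end program continuation_example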