From e5486c976df88b7467befb200bbdb7c2a3dee710 Mon Sep 17 00:00:00 2001
From: Johannes Mey <johannes.mey@tu-dresden.de>
Date: Wed, 8 Feb 2017 12:31:03 +0100
Subject: [PATCH] Add NAS FT and Specht test data and extend parser tests

---
 Parser/test-data/OpenAcc/slots.f              |    7 +
 .../ExecutableConstruct1.f                    |    4 +
 .../fragments/ExecutableConstruct/Slot.f      |    2 +
 .../fragments/ExecutableConstruct/TypedSlot.f |    2 +
 Parser/test-data/nas/ft-min.f                 |  300 ++
 Parser/test-data/nas/ft.f                     | 1065 ++++++
 Parser/test-data/nas/ft2.f                    |  910 +++++
 Parser/test-data/nas/ft3.f                    | 2815 ++++++++++++++++
 Parser/test-data/nas/ft4.f                    | 1106 ++++++
 Parser/test-data/nas/ft5.f                    | 2954 +++++++++++++++++
 Parser/test-data/rules/R503a.f90              |   14 +
 Parser/test-data/rules/tmp.f90                |    4 +
 Parser/test-data/slots/block.f                |    7 +
 Parser/test-data/specht/array_operations.f    | 1166 +++++++
 .../test-data/specht/condensed_primary_part.f |  930 ++++++
 Parser/test-data/specht/matrix_operations.f   |  651 ++++
 .../specht/tiny_matrix_products_explicit.f    | 1386 ++++++++
 .../specht/transformed_condensed_part.f       |  151 +
 .../specht/transformed_primary_part.f         |  939 ++++++
 .../test/org/tud/forty/test/FragmentTest.java |   17 +
 Parser/test/org/tud/forty/test/NASTest.java   |   37 +
 Parser/test/org/tud/forty/test/SlotTest.java  |    1 +
 .../test/org/tud/forty/test/SpechtTest.java   |   42 +
 Parser/test/org/tud/forty/test/TestBase.java  |  117 +-
 24 files changed, 14607 insertions(+), 20 deletions(-)
 create mode 100644 Parser/test-data/OpenAcc/slots.f
 create mode 100644 Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f
 create mode 100644 Parser/test-data/fragments/ExecutableConstruct/Slot.f
 create mode 100644 Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f
 create mode 100644 Parser/test-data/nas/ft-min.f
 create mode 100644 Parser/test-data/nas/ft.f
 create mode 100644 Parser/test-data/nas/ft2.f
 create mode 100644 Parser/test-data/nas/ft3.f
 create mode 100644 Parser/test-data/nas/ft4.f
 create mode 100644 Parser/test-data/nas/ft5.f
 create mode 100644 Parser/test-data/rules/R503a.f90
 create mode 100644 Parser/test-data/rules/tmp.f90
 create mode 100644 Parser/test-data/slots/block.f
 create mode 100644 Parser/test-data/specht/array_operations.f
 create mode 100644 Parser/test-data/specht/condensed_primary_part.f
 create mode 100644 Parser/test-data/specht/matrix_operations.f
 create mode 100644 Parser/test-data/specht/tiny_matrix_products_explicit.f
 create mode 100644 Parser/test-data/specht/transformed_condensed_part.f
 create mode 100644 Parser/test-data/specht/transformed_primary_part.f
 create mode 100644 Parser/test/org/tud/forty/test/NASTest.java
 create mode 100644 Parser/test/org/tud/forty/test/SpechtTest.java

diff --git a/Parser/test-data/OpenAcc/slots.f b/Parser/test-data/OpenAcc/slots.f
new file mode 100644
index 0000000..f5238a7
--- /dev/null
+++ b/Parser/test-data/OpenAcc/slots.f
@@ -0,0 +1,7 @@
+
+!$acc loop private(#PVT#) collapse(#NEST#)
+do i = 0,10
+  #inner#
+end do
+!$acc end loop
+end
\ No newline at end of file
diff --git a/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f b/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f
new file mode 100644
index 0000000..34086a5
--- /dev/null
+++ b/Parser/test-data/fragments/ExecutableConstruct/ExecutableConstruct1.f
@@ -0,0 +1,4 @@
+
+    do a = b,c
+        #inner#
+    end do
diff --git a/Parser/test-data/fragments/ExecutableConstruct/Slot.f b/Parser/test-data/fragments/ExecutableConstruct/Slot.f
new file mode 100644
index 0000000..6fc897b
--- /dev/null
+++ b/Parser/test-data/fragments/ExecutableConstruct/Slot.f
@@ -0,0 +1,2 @@
+#inner#
+
diff --git a/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f b/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f
new file mode 100644
index 0000000..269ce36
--- /dev/null
+++ b/Parser/test-data/fragments/ExecutableConstruct/TypedSlot.f
@@ -0,0 +1,2 @@
+#inner:ExecutableConstruct#
+
diff --git a/Parser/test-data/nas/ft-min.f b/Parser/test-data/nas/ft-min.f
new file mode 100644
index 0000000..a282b98
--- /dev/null
+++ b/Parser/test-data/nas/ft-min.f
@@ -0,0 +1,300 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                       O p e n M P     V E R S I O N                     !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is an OpenMP version of the NPB FT code.              !
+!    It is described in NAS Technical Report 99-011.                      !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!---------------------------------------------------------------------
+
+! Authors: D. Bailey
+!          W. Saphir
+!          H. Jin
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! FT benchmark
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    program ft
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines.
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: i
+          
+!---------------------------------------------------------------------
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
+! views
+!  - u0 contains the initial (transformed) initial condition
+!  - u1 and u2 are working arrays
+!  - twiddle contains exponents for the time evolution operator.
+!---------------------------------------------------------------------
+
+    double complex   u0(ntotalp), &
+    u1(ntotalp)
+!     >                 u2(ntotalp)
+    double precision :: twiddle(ntotalp)
+!---------------------------------------------------------------------
+! Large arrays are in common so that they are allocated on the
+! heap rather than the stack. This common block is not
+! referenced directly anywhere else. Padding is to avoid accidental
+! cache problems, since all array sizes are powers of two.
+!---------------------------------------------------------------------
+
+!      double complex pad1(3), pad2(3), pad3(3)
+!      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+    double complex pad1(3), pad2(3)
+    common /bigarrays/ u0, pad1, u1, pad2, twiddle
+
+    integer :: iter
+    double precision :: total_time, mflops
+    logical :: verified
+    character class
+
+!---------------------------------------------------------------------
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+    call setup()
+    call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+    call fft_init (dims(1))
+    call fft(1, u1, u0)
+
+!---------------------------------------------------------------------
+! Start over from the beginning. Note that all operations must
+! be timed, in contrast to other benchmarks.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+
+    call timer_start(T_total)
+    if (timers_enabled) call timer_start(T_setup)
+
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+
+    call fft_init (dims(1))
+
+    if (timers_enabled) call timer_stop(T_setup)
+    if (timers_enabled) call timer_start(T_fft)
+    call fft(1, u1, u0)
+    if (timers_enabled) call timer_stop(T_fft)
+
+    do iter = 1, niter
+        if (timers_enabled) call timer_start(T_evolve)
+        call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_evolve)
+        if (timers_enabled) call timer_start(T_fft)
+    !         call fft(-1, u1, u2)
+        call fft(-1, u1, u1)
+        if (timers_enabled) call timer_stop(T_fft)
+        if (timers_enabled) call timer_start(T_checksum)
+    !         call checksum(iter, u2, dims(1), dims(2), dims(3))
+        call checksum(iter, u1, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_checksum)
+    end do
+
+    call verify(nx, ny, nz, niter, verified, class)
+
+    call timer_stop(t_total)
+    total_time = timer_read(t_total)
+
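+! (Added note: the MFLOP/s figure below comes from an analytic operation
+!  count in ntotal and niter, not from measured floating-point operations.)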
+    if( total_time /= 0. ) then
+        mflops = 1.0d-6*float(ntotal) * &
+        (14.8157+7.19641*log(float(ntotal)) &
+        +  (5.23518+7.21113*log(float(ntotal)))*niter) &
+        /total_time
+    else
+        mflops = 0.0
+    endif
+    call print_results('FT', class, nx, ny, nz, niter, &
+    total_time, mflops, '          floating point', verified, &
+    npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
diff --git a/Parser/test-data/nas/ft.f b/Parser/test-data/nas/ft.f
new file mode 100644
index 0000000..c57dcb2
--- /dev/null
+++ b/Parser/test-data/nas/ft.f
@@ -0,0 +1,1065 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                       O p e n M P     V E R S I O N                     !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is an OpenMP version of the NPB FT code.              !
+!    It is described in NAS Technical Report 99-011.                      !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!---------------------------------------------------------------------
+
+! Authors: D. Bailey
+!          W. Saphir
+!          H. Jin
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! FT benchmark
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    program ft
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+
+    implicit none
+
+!    include 'global.h'
+    integer :: i
+          
+!---------------------------------------------------------------------
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
+! views
+!  - u0 contains the initial (transformed) initial condition
+!  - u1 and u2 are working arrays
+!  - twiddle contains exponents for the time evolution operator.
+!---------------------------------------------------------------------
+
+    double complex   u0(ntotalp), u1(ntotalp)
+!     >                 u2(ntotalp)
+    double precision :: twiddle(ntotalp)
+!---------------------------------------------------------------------
+! Large arrays are in common so that they are allocated on the
+! heap rather than the stack. This common block is not
+! referenced directly anywhere else. Padding is to avoid accidental
+! cache problems, since all array sizes are powers of two.
+!---------------------------------------------------------------------
+
+!      double complex pad1(3), pad2(3), pad3(3)
+!      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+    double complex pad1(3), pad2(3)
+    common /bigarrays/ u0, pad1, u1, pad2, twiddle
+
+    integer :: iter
+    double precision :: total_time, mflops
+    logical :: verified
+    character class
+
+!---------------------------------------------------------------------
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+    call setup()
+    call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+    call fft_init (dims(1))
+    call fft(1, u1, u0)
+
+!---------------------------------------------------------------------
+! Start over from the beginning. Note that all operations must
+! be timed, in contrast to other benchmarks.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+
+    call timer_start(T_total)
+    if (timers_enabled) call timer_start(T_setup)
+
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+
+    call fft_init (dims(1))
+
+    if (timers_enabled) call timer_stop(T_setup)
+    if (timers_enabled) call timer_start(T_fft)
+    call fft(1, u1, u0)
+    if (timers_enabled) call timer_stop(T_fft)
+
+    do iter = 1, niter
+        if (timers_enabled) call timer_start(T_evolve)
+        call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_evolve)
+        if (timers_enabled) call timer_start(T_fft)
+    !         call fft(-1, u1, u2)
+        call fft(-1, u1, u1)
+        if (timers_enabled) call timer_stop(T_fft)
+        if (timers_enabled) call timer_start(T_checksum)
+    !         call checksum(iter, u2, dims(1), dims(2), dims(3))
+        call checksum(iter, u1, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_checksum)
+    end do
+
+    call verify(nx, ny, nz, niter, verified, class)
+
+    call timer_stop(t_total)
+    total_time = timer_read(t_total)
+
+    if( total_time /= 0. ) then
+        mflops = 1.0d-6*float(ntotal) * (14.8157+7.19641*log(float(ntotal)) +  (5.23518+7.21113*log(float(ntotal)))*niter) /total_time
+    else
+        mflops = 0.0
+    endif
+    call print_results('FT', class, nx, ny, nz, niter, total_time, mflops, '          floating point', verified, npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! touch all the big data
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = 0.d0
+                u1(i,j,k) = 0.d0
+                twiddle(i,j,k) = 0.d0
+            end do
+        end do
+    end do
+
+    return
+    end subroutine init_ui
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! evolve u0 -> u1 (t time steps) in fourier space
+!---------------------------------------------------------------------
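+
+! (Added note: twiddle(i,j,k) holds exp(-4*alpha*pi**2*(kx**2+ky**2+kz**2))
+! for the corresponding Fourier mode, so the single multiplication below
+! advances the solution by one time step per call.)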
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
+                u1(i,j,k) = u0(i,j,k)
+            end do
+        end do
+    end do
+
+    return
+    end subroutine evolve
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! Fill in array u0 with initial conditions from
+! random number generator
+!---------------------------------------------------------------------
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double complex u0(d1+1, d2, d3)
+    integer :: k, j
+    double precision :: x0, start, an, dummy, starts(nz)
+          
+
+    start = seed
+!---------------------------------------------------------------------
+! Jump to the starting element for our first plane.
+!---------------------------------------------------------------------
+    call ipow46(a, 0, an)
+    dummy = randlc(start, an)
+    call ipow46(a, 2*nx*ny, an)
+
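+! (Added note: each z-plane consumes 2*nx*ny random numbers, so with
+!  an = a**(2*nx*ny) a single randlc call jumps the RNG stream one full
+!  plane ahead when seeding starts(k).)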
+    starts(1) = start
+    do k = 2, dims(3)
+        dummy = randlc(start, an)
+        starts(k) = start
+    end do
+          
+!---------------------------------------------------------------------
+! Go through by z planes filling in one square at a time.
+!---------------------------------------------------------------------
+! omp parallel do default(shared) private(k,j,x0)
+    do k = 1, dims(3)
+        x0 = starts(k)
+        do j = 1, dims(2)
+            call vranlc(2*nx, x0, a, u0(1, j, k))
+        end do
+    end do
+
+    return
+    end subroutine compute_initial_conditions
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine ipow46(a, exponent, result)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute a^exponent mod 2^46
+!---------------------------------------------------------------------
+
+    implicit none
+    double precision :: a, result, dummy, q, r
+    integer :: exponent, n, n2
+    external randlc
+    double precision :: randlc
+!---------------------------------------------------------------------
+! Use
+!   a^n = a^(n/2)*a^(n/2) if n even else
+!   a^n = a*a^(n-1)       if n odd
+!---------------------------------------------------------------------
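+! (Worked example, added for clarity: exponent = 10 proceeds as
+!    n=10 even: q <- q*q = a**2,   n=5
+!    n=5  odd : r <- r*q = a**2,   n=4
+!    n=4  even: q <- q*q = a**4,   n=2
+!    n=2  even: q <- q*q = a**8,   n=1
+!  and the final r <- r*q yields a**10, all modulo 2**46 via randlc.)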
+    result = 1
+    if (exponent == 0) return
+    q = a
+    r = 1
+    n = exponent
+
+
+    do while (n > 1)
+        n2 = n/2
+        if (n2 * 2 == n) then
+            dummy = randlc(q, q)
+            n = n2
+        else
+            dummy = randlc(r, q)
+            n = n-1
+        endif
+    end do
+    dummy = randlc(r, q)
+    result = r
+    return
+    end subroutine ipow46
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine setup
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: fstatus
+!$    integer  omp_get_max_threads
+!$    external omp_get_max_threads
+    debug = .FALSE. 
+
+    open (unit=2,file='timer.flag',status='old',iostat=fstatus)
+    if (fstatus == 0) then
+        timers_enabled = .TRUE. 
+        close(2)
+    else
+        timers_enabled = .FALSE. 
+    endif
+
+    write(*, 1000)
+
+    niter = niter_default
+
+    write(*, 1001) nx, ny, nz
+    write(*, 1002) niter
+!$    write(*, 1003) omp_get_max_threads()
+    write(*, *)
+
+
+    1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', ' - FT Benchmark', /)
+    1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
+    1002 format(' Iterations                  :', i7)
+    1003 format(' Number of available threads :', i7)
+
+    dims(1) = nx
+    dims(2) = ny
+    dims(3) = nz
+
+
+!---------------------------------------------------------------------
+! Set up info for blocking of ffts and transposes.  This improves
+! performance on cache-based systems. Blocking involves
+! working on a chunk of the problem at a time, taking chunks
+! along the first, second, or third dimension.
+
+! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+! Since 1st dim is always in processor, we'll assume it's long enough
+! (default blocking factor is 16 so min size for 1st dim is 16)
+! The only case we have to worry about is cffts1 in a 2d decomposition.
+! so the blocking factor should not be larger than the 2nd dimension.
+!---------------------------------------------------------------------
+
+    fftblock = fftblock_default
+    fftblockpad = fftblockpad_default
+
+    if (fftblock /= fftblock_default) fftblockpad = fftblock+3
+
+    return
+    end subroutine setup
+
+          
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double precision :: twiddle(d1+1, d2, d3)
+    integer :: i, j, k, kk, kk2, jj, kj2, ii
+    double precision :: ap
+
+!---------------------------------------------------------------------
+! basically we want to convert the fortran indices
+!   1 2 3 4 5 6 7 8
+! to
+!   0 1 2 3 -4 -3 -2 -1
+! The following magic formula does the trick:
+! mod(i-1+n/2, n) - n/2
+!---------------------------------------------------------------------
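+! (Worked check, added for clarity: n = 8, i = 6 gives
+!  mod(6-1+4, 8) - 4 = mod(9, 8) - 4 = 1 - 4 = -3,
+!  matching position 6 of the table above.)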
+
+    ap = - 4.d0 * alpha * pi *pi
+
+! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
+    do k = 1, dims(3)
+        kk =  mod(k-1+nz/2, nz) - nz/2
+        kk2 = kk*kk
+        do j = 1, dims(2)
+            jj = mod(j-1+ny/2, ny) - ny/2
+            kj2 = jj*jj+kk2
+            do i = 1, dims(1)
+                ii = mod(i-1+nx/2, nx) - nx/2
+                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
+            end do
+        end do
+    end do
+
+    return
+    end subroutine compute_indexmap
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine print_timers()
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: i
+    include 'global.h'
+    double precision :: t, t_m
+    character(25) :: tstrings(T_max)
+    data tstrings / '          total ', '          setup ', '            fft ', '         evolve ', '       checksum ', '           fftx ', '           ffty ', '           fftz ' /
+
+    t_m = timer_read(T_total)
+    if (t_m <= 0.0d0) t_m = 1.0d0
+    do i = 1, t_max
+        t = timer_read(i)
+        write(*, 100) i, tstrings(i), t, t*100.0/t_m
+    end do
+    100 format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
+    return
+    end subroutine print_timers
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft(dir, x1, x2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: dir
+    double complex x1(ntotalp), x2(ntotalp)
+
+    double complex y1(fftblockpad_default*maxdim), y2(fftblockpad_default*maxdim)
+
+!---------------------------------------------------------------------
+! note: args x1, x2 must be different arrays
+! note: args for cfftsx are (direction, layout, xin, xout, scratch)
+!       xin/xout may be the same and it can be somewhat faster
+!       if they are
+!---------------------------------------------------------------------
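+! (Added note: dir = 1 applies the 1-D FFTs in x, y, z order; dir = -1
+!  applies the inverse transforms in the reverse order, z, y, x.)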
+
+    if (dir == 1) then
+        call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    else
+        call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    endif
+    return
+    end subroutine fft
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd1
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
+    integer :: i, j, k, jj
+
+    logd1 = ilog2(d1)
+
+    if (timers_enabled) call timer_start(T_fftx)
+! omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1)
+    do k = 1, d3
+        do jj = 0, d2 - fftblock, fftblock
+            do j = 1, fftblock
+                do i = 1, d1
+                    y1(j,i) = x(i,j+jj,k)
+                enddo
+            enddo
+                        
+            call cfftz (is, logd1, d1, y1, y2)
+
+
+            do j = 1, fftblock
+                do i = 1, d1
+                    xout(i,j+jj,k) = y1(j,i)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftx)
+
+    return
+    end subroutine cffts1
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd2
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
+    integer :: i, j, k, ii
+
+    logd2 = ilog2(d2)
+
+    if (timers_enabled) call timer_start(T_ffty)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2)
+    do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+            do j = 1, d2
+                do i = 1, fftblock
+                    y1(i,j) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd2, d2, y1, y2)
+                       
+            do j = 1, d2
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,j)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_ffty)
+
+    return
+    end subroutine cffts2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd3
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
+    integer :: i, j, k, ii
+
+    logd3 = ilog2(d3)
+
+    if (timers_enabled) call timer_start(T_fftz)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is)
+    do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+            do k = 1, d3
+                do i = 1, fftblock
+                    y1(i,k) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd3, d3, y1, y2)
+
+            do k = 1, d3
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,k)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftz)
+
+    return
+    end subroutine cffts3
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft_init (n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute the roots-of-unity array that will be used for subsequent FFTs.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: m,n,nu,ku,i,j,ln
+    double precision :: t, ti
+
+
+!---------------------------------------------------------------------
+!   Initialize the U array with sines and cosines in a manner that permits
+!   stride one access at each FFT iteration.
+!---------------------------------------------------------------------
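+! (Layout note, added for clarity: u(1) stores m; each level j = 1..m then
+!  occupies ln consecutive entries u(ku+i) = dcmplx(cos(i*pi/ln), sin(i*pi/ln)),
+!  i = 0..ln-1, with ln doubling from 1 at every level.)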
+    nu = n
+    m = ilog2(n)
+    u(1) = m
+    ku = 2
+    ln = 1
+
+    do j = 1, m
+        t = pi / ln
+                 
+        do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+        enddo
+                 
+        ku = ku + ln
+        ln = 2 * ln
+    enddo
+          
+    return
+    end subroutine fft_init
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cfftz (is, m, n, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the array U must be initialized by calling CFFTZ with IS
+!   set to 0 and M set to MX, where MX is the maximum value of M for any
+!   subsequent call.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: is,m,n,i,j,l,mx
+    double complex x, y
+
+    dimension x(fftblockpad,n), y(fftblockpad,n)
+
+!---------------------------------------------------------------------
+!   Check if input parameters are invalid.
+!---------------------------------------------------------------------
+    mx = u(1)
+    if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then
+        write (*, 1)  is, m, mx
+        1 format ('CFFTZ: Either U has not been initialized, or else'/  'one of the input parameters is invalid', 3I5)
+        stop
+    endif
+
+!---------------------------------------------------------------------
+!   Perform one variant of the Stockham FFT.
+!---------------------------------------------------------------------
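+! (Added note: each fftz2 call performs one of the m radix-2 stages,
+!  ping-ponging between x and y; the copy at label 160 covers odd m,
+!  where the last stage leaves its result in y.)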
+    do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l == m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+    enddo
+
+    goto 180
+
+!---------------------------------------------------------------------
+!   Copy Y to X.
+!---------------------------------------------------------------------
+    160 do j = 1, n
+        do i = 1, fftblock
+            x(i,j) = y(i,j)
+        enddo
+    enddo
+
+    180 continue
+
+    return
+    end subroutine cfftz
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Performs the L-th iteration of the second variant of the Stockham FFT.
+!---------------------------------------------------------------------
+
+    implicit none
+
+    integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+    double complex u,x,y,u1,x11,x21
+    dimension u(n), x(ny1,n), y(ny1,n)
+
+
+!---------------------------------------------------------------------
+!   Set initial parameters.
+!---------------------------------------------------------------------
+
+    n1 = n / 2
+    lk = 2 ** (l - 1)
+    li = 2 ** (m - l)
+    lj = 2 * lk
+    ku = li + 1
+
+    do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is >= 1) then
+            u1 = u(ku+i)
+        else
+            u1 = dconjg (u(ku+i))
+        endif
+
+    !---------------------------------------------------------------------
+    !   This loop is vectorizable.
+    !---------------------------------------------------------------------
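+    ! (Added note: this is the radix-2 butterfly; each output pair is the
+    !  sum and the twiddle-scaled difference of one input pair.)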
+        do k = 0, lk - 1
+            do j = 1, ny
+                x11 = x(j,i11+k)
+                x21 = x(j,i12+k)
+                y(j,i21+k) = x11 + x21
+                y(j,i22+k) = u1 * (x11 - x21)
+            enddo
+        enddo
+    enddo
+
+    return
+    end subroutine fftz2
+
+!---------------------------------------------------------------------
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    integer function ilog2(n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
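+! (Added note: returns ceiling(log2(n)); the grid sizes used here are
+!  powers of two, so this is the exact base-2 exponent.)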
+    implicit none
+    integer :: n, nn, lg
+    if (n == 1) then
+        ilog2=0
+        return
+    endif
+    lg = 1
+    nn = 2
+    do while (nn < n)
+        nn = nn*2
+        lg = lg+1
+    end do
+    ilog2 = lg
+    return
+    end function ilog2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine checksum(i, u1, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: i, d1, d2, d3
+    double complex u1(d1+1,d2,d3)
+    integer :: j, q,r,s
+    double complex chk
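+! (Added note: the checksum samples u1 at 1024 points along the lattice
+!  line (mod(j,nx)+1, mod(3*j,ny)+1, mod(5*j,nz)+1) and normalizes by ntotal.)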
+    chk = (0.0,0.0)
+
+! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk)
+    do j=1,1024
+        q = mod(j, nx)+1
+        r = mod(3*j,ny)+1
+        s = mod(5*j,nz)+1
+        chk=chk+u1(q,r,s)
+    end do
+
+    chk = chk/dble(ntotal)
+          
+    write (*, 30) i, chk
+    30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+    sums(i) = chk
+    return
+    end subroutine checksum
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine verify (d1, d2, d3, nt, verified, class)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3, nt
+    character class
+    logical :: verified
+    integer :: i
+    double precision :: err, epsilon
+
+!---------------------------------------------------------------------
+!   Reference checksums
+!---------------------------------------------------------------------
+    double complex csum_ref(25)
+
+
+    class = 'U'
+
+    epsilon = 1.0d-12
+    verified = .FALSE. 
+
+    if (d1 == 64 .AND.     d2 == 64 .AND.     d3 == 64 .AND.     nt == 6) then
+    !---------------------------------------------------------------------
+    !   Sample size reference checksums
+    !---------------------------------------------------------------------
+        class = 'S'
+        csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+        csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+        csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+        csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+        csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+        csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+    else if (d1 == 128 .AND.         d2 == 128 .AND.         d3 == 32 .AND.         nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class W size reference checksums
+    !---------------------------------------------------------------------
+        class = 'W'
+        csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+        csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+        csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+        csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+        csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+        csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+    else if (d1 == 256 .AND.         d2 == 256 .AND.         d3 == 128 .AND.         nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class A size reference checksums
+    !---------------------------------------------------------------------
+        class = 'A'
+        csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+        csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+        csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+        csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+        csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+        csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+              
+    else if (d1 == 512 .AND.         d2 == 256 .AND.         d3 == 256 .AND.         nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class B size reference checksums
+    !---------------------------------------------------------------------
+        class = 'B'
+        csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+        csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+        csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+        csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+        csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+        csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+        csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+        csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+        csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+        csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+        csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+        csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+        csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+        csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+        csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+        csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+        csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+        csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+        csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+        csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+    else if (d1 == 512 .AND.         d2 == 512 .AND.         d3 == 512 .AND.         nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class C size reference checksums
+    !---------------------------------------------------------------------
+        class = 'C'
+        csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+        csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+        csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+        csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+        csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+        csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+        csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+        csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+        csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+        csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+        csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+        csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+        csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+        csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+        csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+        csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+        csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+        csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+        csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+        csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+    else if (d1 == 2048 .AND.         d2 == 1024 .AND.         d3 == 1024 .AND.         nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class D size reference checksums
+    !---------------------------------------------------------------------
+        class = 'D'
+        csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+        csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+        csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+        csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+        csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+        csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+        csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+        csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+        csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+        csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+        csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+        csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+        csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+        csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+        csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+        csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+        csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+        csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+        csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+        csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+        csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+        csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+        csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+        csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+        csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+    else if (d1 == 4096 .AND.         d2 == 2048 .AND.         d3 == 2048 .AND.         nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class E size reference checksums
+    !---------------------------------------------------------------------
+        class = 'E'
+        csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+        csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+        csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+        csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+        csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+        csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+        csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+        csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+        csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+        csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+        csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+        csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+        csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+        csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+        csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+        csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+        csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+        csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+        csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+        csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+        csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+        csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+        csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+        csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+        csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+    endif
+
+
+    if (class /= 'U') then
+
+        do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if ( .NOT. (err <= epsilon)) goto 100
+        end do
+        verified = .TRUE. 
+        100 continue
+
+    endif
+
+             
+    if (class /= 'U') then
+        if (verified) then
+            write(*,2000)
+            2000 format(' Result verification successful')
+        else
+            write(*,2001)
+            2001 format(' Result verification failed')
+        endif
+    endif
+    print *, 'class = ', class
+
+    return
+    end subroutine verify
+
+
diff --git a/Parser/test-data/nas/ft2.f b/Parser/test-data/nas/ft2.f
new file mode 100644
index 0000000..63ea2cd
--- /dev/null
+++ b/Parser/test-data/nas/ft2.f
@@ -0,0 +1,910 @@
+
+    program ft
+
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! touch all the big data
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = 0.d0
+                u1(i,j,k) = 0.d0
+                twiddle(i,j,k) = 0.d0
+            end do
+        end do
+    end do
+
+    return
+    end subroutine init_ui
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! evolve u0 -> u1 (t time steps) in fourier space
+!---------------------------------------------------------------------
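+! (each call multiplies u0 by the per-mode factor held in twiddle,
+!  exp(-4*alpha*pi^2*(ii^2+jj^2+kk^2)) as set up in compute_indexmap,
+!  so t calls advance the solution by t steps; u1 keeps a copy of the
+!  result for the subsequent inverse FFT)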
+
+    implicit none
+    ! include 'global.h'
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
+                u1(i,j,k) = u0(i,j,k)
+            end do
+        end do
+    end do
+
+    return
+    end subroutine evolve
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! Fill in array u0 with initial conditions from
+! random number generator
+!---------------------------------------------------------------------
+    implicit none
+!    include 'global.h'
+    integer :: d1, d2, d3
+    double complex u0(d1+1, d2, d3)
+    integer :: k, j
+    double precision :: x0, start, an, dummy, starts(nz)
+
+
+    start = seed
+!---------------------------------------------------------------------
+! Jump to the starting element for our first plane.
+!---------------------------------------------------------------------
+    call ipow46(a, 0, an)
+    dummy = randlc(start, an)
+    call ipow46(a, 2*nx*ny, an)
+
+    starts(1) = start
+    do k = 2, dims(3)
+        dummy = randlc(start, an)
+        starts(k) = start
+    end do
+
+!---------------------------------------------------------------------
+! Go through by z planes filling in one square at a time.
+!---------------------------------------------------------------------
+! omp parallel do default(shared) private(k,j,x0)
+    do k = 1, dims(3)
+        x0 = starts(k)
+        do j = 1, dims(2)
+            call vranlc(2*nx, x0, a, u0(1, j, k))
+        end do
+    end do
+
+    return
+    end subroutine compute_initial_conditions
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine ipow46(a, exponent, result)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute a^exponent mod 2^46
+!---------------------------------------------------------------------
+
+    implicit none
+    double precision :: a, result, dummy, q, r
+    integer :: exponent, n, n2
+    external randlc
+    double precision :: randlc
+!---------------------------------------------------------------------
+! Use
+!   a^n = a^(n/2)*a^(n/2) if n even else
+!   a^n = a*a^(n-1)       if n odd
+!---------------------------------------------------------------------
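+! illustrative trace (not part of the benchmark) for exponent = 13:
+!   r=1, q=a, n=13 -> odd:  r = a,   n = 12
+!                    even:  q = a^2, n = 6
+!                    even:  q = a^4, n = 3
+!                    odd:   r = a^5, n = 2
+!                    even:  q = a^8, n = 1
+!   loop exit multiply: r = a^5 * a^8 = a^13, all mod 2^46 via randlc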
+    result = 1
+    if (exponent == 0) return
+    q = a
+    r = 1
+    n = exponent
+
+
+    do while (n > 1)
+        n2 = n/2
+        if (n2 * 2 == n) then
+            dummy = randlc(q, q)
+            n = n2
+        else
+            dummy = randlc(r, q)
+            n = n-1
+        endif
+    end do
+    dummy = randlc(r, q)
+    result = r
+    return
+    end subroutine ipow46
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine setup
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+!    include 'global.h'
+
+    integer :: fstatus
+!$    integer  omp_get_max_threads
+!$    external omp_get_max_threads
+    debug = .FALSE.
+
+    open (unit=2,file='timer.flag',status='old',iostat=fstatus)
+    if (fstatus == 0) then
+        timers_enabled = .TRUE.
+        close(2)
+    else
+        timers_enabled = .FALSE.
+    endif
+
+    write(*, 1000)
+
+    niter = niter_default
+
+    write(*, 1001) nx, ny, nz
+    write(*, 1002) niter
+!$    write(*, 1003) omp_get_max_threads()
+    write(*, *)
+
+
+    1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', ' - FT Benchmark', /)
+    1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
+    1002 format(' Iterations                  :', i7)
+    1003 format(' Number of available threads :', i7)
+
+    dims(1) = nx
+    dims(2) = ny
+    dims(3) = nz
+
+
+!---------------------------------------------------------------------
+! Set up info for blocking of ffts and transposes.  This improves
+! performance on cache-based systems. Blocking involves
+! working on a chunk of the problem at a time, taking chunks
+! along the first, second, or third dimension.
+
+! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+! Since the 1st dim is always in-processor, we'll assume it's long enough
+! (the default blocking factor is 16, so the min size for the 1st dim is 16).
+! The only case we have to worry about is cffts1 in a 2d decomposition,
+! so the blocking factor should not be larger than the 2nd dimension.
+!---------------------------------------------------------------------
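+! e.g. (assuming the default fftblock of 16): with d2 = 64, cffts1 sweeps
+! jj = 0, 16, 32, 48, copying a 16-wide slab into y1, transforming it,
+! and copying it back out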
+
+    fftblock = fftblock_default
+    fftblockpad = fftblockpad_default
+
+    if (fftblock /= fftblock_default) fftblockpad = fftblock+3
+
+    return
+    end subroutine setup
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
+!---------------------------------------------------------------------
+
+    implicit none
+!    include 'global.h'
+    integer :: d1, d2, d3
+    double precision :: twiddle(d1+1, d2, d3)
+    integer :: i, j, k, kk, kk2, jj, kj2, ii
+    double precision :: ap
+
+!---------------------------------------------------------------------
+! basically we want to convert the fortran indices
+!   1 2 3 4 5 6 7 8
+! to
+!   0 1 2 3 -4 -3 -2 -1
+! The following magic formula does the trick:
+! mod(i-1+n/2, n) - n/2
+!---------------------------------------------------------------------
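+! (check for n = 8: i = 5 gives mod(5-1+4, 8) - 4 = -4, as listed above)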
+
+    ap = -4.d0 * alpha * pi * pi
+
+! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
+    do k = 1, dims(3)
+        kk =  mod(k-1+nz/2, nz) - nz/2
+        kk2 = kk*kk
+        do j = 1, dims(2)
+            jj = mod(j-1+ny/2, ny) - ny/2
+            kj2 = jj*jj+kk2
+            do i = 1, dims(1)
+                ii = mod(i-1+nx/2, nx) - nx/2
+                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
+            end do
+        end do
+    end do
+
+    return
+    end subroutine compute_indexmap
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine print_timers()
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: i
+!    include 'global.h'
+    double precision :: t, t_m
+    character(25) :: tstrings(T_max)
+    data tstrings / '          total ', '          setup ',  &
+                    '            fft ', '         evolve ',  &
+                    '       checksum ', '           fftx ',  &
+                    '           ffty ', '           fftz ' /
+
+    t_m = timer_read(T_total)
+    if (t_m <= 0.0d0) t_m = 1.0d0
+    do i = 1, t_max
+        t = timer_read(i)
+        write(*, 100) i, tstrings(i), t, t*100.0/t_m
+    end do
+    100 format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
+    return
+    end subroutine print_timers
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft(dir, x1, x2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+!    include 'global.h'
+    integer :: dir
+    double complex x1(ntotalp), x2(ntotalp)
+
+    double complex y1(fftblockpad_default*maxdim), y2(fftblockpad_default*maxdim)
+
+!---------------------------------------------------------------------
+! note: args x1, x2 must be different arrays
+! note: args for cfftsx are (direction, layout, xin, xout, scratch)
+!       xin/xout may be the same and it can be somewhat faster
+!       if they are
+!---------------------------------------------------------------------
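+! (so below: dir = 1 transforms x, then y, then z; dir = -1 reverses the
+!  order, and in both cases only the final pass writes into x2)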
+
+    if (dir == 1) then
+        call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    else
+        call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    endif
+    return
+    end subroutine fft
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+!    include 'global.h'
+    integer :: is, d1, d2, d3, logd1
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
+    integer :: i, j, k, jj
+
+    logd1 = ilog2(d1)
+
+    if (timers_enabled) call timer_start(T_fftx)
+! omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1)
+    do k = 1, d3
+        do jj = 0, d2 - fftblock, fftblock
+            do j = 1, fftblock
+                do i = 1, d1
+                    y1(j,i) = x(i,j+jj,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd1, d1, y1, y2)
+
+
+            do j = 1, fftblock
+                do i = 1, d1
+                    xout(i,j+jj,k) = y1(j,i)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftx)
+
+    return
+    end subroutine cffts1
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd2
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
+    integer :: i, j, k, ii
+
+    logd2 = ilog2(d2)
+
+    if (timers_enabled) call timer_start(T_ffty)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2)
+    do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+            do j = 1, d2
+                do i = 1, fftblock
+                    y1(i,j) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd2, d2, y1, y2)
+
+            do j = 1, d2
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,j)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_ffty)
+
+    return
+    end subroutine cffts2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd3
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
+    integer :: i, j, k, ii
+
+    logd3 = ilog2(d3)
+
+    if (timers_enabled) call timer_start(T_fftz)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is)
+    do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+            do k = 1, d3
+                do i = 1, fftblock
+                    y1(i,k) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd3, d3, y1, y2)
+
+            do k = 1, d3
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,k)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftz)
+
+    return
+    end subroutine cffts3
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft_init (n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute the roots-of-unity array that will be used for subsequent FFTs.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: m,n,nu,ku,i,j,ln
+    double precision :: t, ti
+
+
+!---------------------------------------------------------------------
+!   Initialize the U array with sines and cosines in a manner that permits
+!   stride one access at each FFT iteration.
+!---------------------------------------------------------------------
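+! illustrative layout (not from the original source) for n = 8, m = 3:
+!   u(1) = 3, then per level ln = 1, 2, 4:
+!     u(2)   = 1
+!     u(3:4) = 1, i
+!     u(5:8) = 1, exp(i*pi/4), i, exp(i*3*pi/4)
+! i.e. level j stores the ln roots exp(i*pi*k/ln), k = 0..ln-1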
+    nu = n
+    m = ilog2(n)
+    u(1) = m
+    ku = 2
+    ln = 1
+
+    do j = 1, m
+        t = pi / ln
+
+        do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+        enddo
+
+        ku = ku + ln
+        ln = 2 * ln
+    enddo
+
+    return
+    end subroutine fft_init
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cfftz (is, m, n, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the roots-of-unity array U must be initialized by
+!   calling FFT_INIT with N set to the largest transform size, i.e. with
+!   M equal to MX, the maximum value of M for any subsequent call.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: is,m,n,i,j,l,mx
+    double complex x, y
+
+    dimension x(fftblockpad,n), y(fftblockpad,n)
+
+!---------------------------------------------------------------------
+!   Check if input parameters are invalid.
+!---------------------------------------------------------------------
+    mx = u(1)
+    if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then
+        write (*, 1)  is, m, mx
+        1 format ('CFFTZ: Either U has not been initialized, or else'/  'one of the input parameters is invalid', 3I5)
+        stop
+    endif
+
+!---------------------------------------------------------------------
+!   Perform one variant of the Stockham FFT.
+!---------------------------------------------------------------------
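+! (each fftz2 call writes into the other array, so the data ping-pongs
+!  between x and y; for odd m the last pass lands in y and the copy at
+!  label 160 moves it back to x, while for even m it is already in x)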
+    do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l == m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+    enddo
+
+    goto 180
+
+!---------------------------------------------------------------------
+!   Copy Y to X.
+!---------------------------------------------------------------------
+    160 do j = 1, n
+        do i = 1, fftblock
+            x(i,j) = y(i,j)
+        enddo
+    enddo
+
+    180 continue
+
+    return
+    end subroutine cfftz
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Performs the L-th iteration of the second variant of the Stockham FFT.
+!---------------------------------------------------------------------
+
+    implicit none
+
+    integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+    double complex u,x,y,u1,x11,x21
+    dimension u(n), x(ny1,n), y(ny1,n)
+
+
+!---------------------------------------------------------------------
+!   Set initial parameters.
+!---------------------------------------------------------------------
+
+    n1 = n / 2
+    lk = 2 ** (l - 1)
+    li = 2 ** (m - l)
+    lj = 2 * lk
+    ku = li + 1
+
+    do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is >= 1) then
+            u1 = u(ku+i)
+        else
+            u1 = dconjg (u(ku+i))
+        endif
+
+    !---------------------------------------------------------------------
+    !   This loop is vectorizable.
+    !---------------------------------------------------------------------
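+    ! (radix-2 butterfly: elements n/2 apart, x(:,i11+k) and x(:,i12+k),
+    !  combine into their sum at i21+k and a twiddled difference at i22+k)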
+        do k = 0, lk - 1
+            do j = 1, ny
+                x11 = x(j,i11+k)
+                x21 = x(j,i12+k)
+                y(j,i21+k) = x11 + x21
+                y(j,i22+k) = u1 * (x11 - x21)
+            enddo
+        enddo
+    enddo
+
+    return
+    end subroutine fftz2
+
+!---------------------------------------------------------------------
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    integer function ilog2(n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: n, nn, lg
+    if (n == 1) then
+        ilog2=0
+        return
+    endif
+    lg = 1
+    nn = 2
+    do while (nn < n)
+        nn = nn*2
+        lg = lg+1
+    end do
+    ilog2 = lg
+    return
+    end function ilog2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine checksum(i, u1, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: i, d1, d2, d3
+    double complex u1(d1+1,d2,d3)
+    integer :: j, q,r,s
+    double complex chk
+    chk = (0.0,0.0)
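+! (the loop below samples u1 at 1024 deterministic points, stepping with
+!  strides of 1, 3 and 5 through the x, y and z extents, and then
+!  normalizes the sum by ntotal)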
+
+! omp parallel do default(shared) private(j,q,r,s) reduction(+:chk)
+    do j=1,1024
+        q = mod(j, nx)+1
+        r = mod(3*j,ny)+1
+        s = mod(5*j,nz)+1
+        chk=chk+u1(q,r,s)
+    end do
+
+    chk = chk/dble(ntotal)
+
+    write (*, 30) i, chk
+    30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+    sums(i) = chk
+    return
+    end subroutine checksum
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine verify (d1, d2, d3, nt, verified, class)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3, nt
+    character class
+    logical :: verified
+    integer :: i
+    double precision :: err, epsilon
+
+!---------------------------------------------------------------------
+!   Reference checksums
+!---------------------------------------------------------------------
+    double complex csum_ref(25)
+
+
+    class = 'U'
+
+    epsilon = 1.0d-12
+    verified = .FALSE.
+
+    if (d1 == 64 .AND. d2 == 64 .AND. d3 == 64 .AND. nt == 6) then
+    !---------------------------------------------------------------------
+    !   Sample size reference checksums
+    !---------------------------------------------------------------------
+        class = 'S'
+        csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+        csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+        csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+        csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+        csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+        csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+    else if (d1 == 128 .AND. d2 == 128 .AND. d3 == 32 .AND. nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class W size reference checksums
+    !---------------------------------------------------------------------
+        class = 'W'
+        csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+        csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+        csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+        csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+        csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+        csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+    else if (d1 == 256 .AND. d2 == 256 .AND. d3 == 128 .AND. nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class A size reference checksums
+    !---------------------------------------------------------------------
+        class = 'A'
+        csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+        csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+        csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+        csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+        csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+        csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+
+    else if (d1 == 512 .AND. d2 == 256 .AND. d3 == 256 .AND. nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class B size reference checksums
+    !---------------------------------------------------------------------
+        class = 'B'
+        csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+        csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+        csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+        csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+        csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+        csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+        csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+        csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+        csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+        csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+        csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+        csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+        csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+        csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+        csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+        csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+        csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+        csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+        csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+        csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+    else if (d1 == 512 .AND. d2 == 512 .AND. d3 == 512 .AND. nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class C size reference checksums
+    !---------------------------------------------------------------------
+        class = 'C'
+        csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+        csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+        csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+        csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+        csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+        csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+        csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+        csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+        csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+        csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+        csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+        csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+        csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+        csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+        csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+        csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+        csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+        csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+        csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+        csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+    else if (d1 == 2048 .AND. d2 == 1024 .AND. d3 == 1024 .AND. nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class D size reference checksums
+    !---------------------------------------------------------------------
+        class = 'D'
+        csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+        csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+        csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+        csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+        csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+        csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+        csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+        csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+        csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+        csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+        csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+        csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+        csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+        csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+        csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+        csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+        csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+        csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+        csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+        csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+        csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+        csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+        csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+        csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+        csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+    else if (d1 == 4096 .AND. d2 == 2048 .AND. d3 == 2048 .AND. nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class E size reference checksums
+    !---------------------------------------------------------------------
+        class = 'E'
+        csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+        csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+        csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+        csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+        csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+        csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+        csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+        csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+        csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+        csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+        csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+        csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+        csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+        csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+        csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+        csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+        csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+        csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+        csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+        csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+        csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+        csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+        csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+        csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+        csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+    endif
+
+
+    if (class /= 'U') then
+
+        do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if ( .NOT. (err <= epsilon)) goto 100
+        end do
+        verified = .TRUE.
+        100 continue
+
+    endif
+
+
+    if (class /= 'U') then
+        if (verified) then
+            write(*,2000)
+            2000 format(' Result verification successful')
+        else
+            write(*,2001)
+            2001 format(' Result verification failed')
+        endif
+    endif
+    print *, 'class = ', class
+
+    return
+    end subroutine verify
+
+
diff --git a/Parser/test-data/nas/ft3.f b/Parser/test-data/nas/ft3.f
new file mode 100644
index 0000000..4dd7019
--- /dev/null
+++ b/Parser/test-data/nas/ft3.f
@@ -0,0 +1,2815 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                       O p e n M P     V E R S I O N                     !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is an OpenMP version of the NPB FT code.              !
+!    It is described in NAS Technical Report 99-011.                      !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!---------------------------------------------------------------------
+
+! Authors: D. Bailey
+!          W. Saphir
+!          H. Jin
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! FT benchmark
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    program ft
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
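+!
+! (for the Class A header above: nx = ny = 256, nz = 128, so the smallest
+!  dimension is 128; the cache-blocked default below uses fftblock = 32)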
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
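+!
+! (in this 0-D serial layout no explicit transpose routines exist; the
+!  fft driver applies cffts1, cffts2 and cffts3 directly, in reverse
+!  order for the inverse transform)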
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha = 1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: i
+          
+!---------------------------------------------------------------------
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
+! views
+!  - u0 contains the (transformed) initial condition
+!  - u1 and u2 are working arrays
+!  - twiddle contains exponents for the time evolution operator.
+!---------------------------------------------------------------------
+
+    double complex   u0(ntotalp), u1(ntotalp)
+!     >                 u2(ntotalp)
+    double precision :: twiddle(ntotalp)
+!---------------------------------------------------------------------
+! Large arrays are in common so that they are allocated on the
+! heap rather than the stack. This common block is not
+! referenced directly anywhere else. Padding is to avoid accidental
+! cache problems, since all array sizes are powers of two.
+!---------------------------------------------------------------------
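+! (pad1 and pad2 below are 3 double-complex words, i.e. 48 bytes each,
+!  staggering u0, u1 and twiddle so the power-of-two extents do not all
+!  map to the same cache sets)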
+
+!      double complex pad1(3), pad2(3), pad3(3)
+!      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+    double complex pad1(3), pad2(3)
+    common /bigarrays/ u0, pad1, u1, pad2, twiddle
+
+    integer :: iter
+    double precision :: total_time, mflops
+    logical :: verified
+    character class
+
+!---------------------------------------------------------------------
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+    call setup()
+    call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+    call fft_init (dims(1))
+    call fft(1, u1, u0)
+
+!---------------------------------------------------------------------
+! Start over from the beginning. Note that all operations must
+! be timed, in contrast to other benchmarks.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+
+    call timer_start(T_total)
+    if (timers_enabled) call timer_start(T_setup)
+
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+
+    call fft_init (dims(1))
+
+    if (timers_enabled) call timer_stop(T_setup)
+    if (timers_enabled) call timer_start(T_fft)
+    call fft(1, u1, u0)
+    if (timers_enabled) call timer_stop(T_fft)
+
+    do iter = 1, niter
+        if (timers_enabled) call timer_start(T_evolve)
+        call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_evolve)
+        if (timers_enabled) call timer_start(T_fft)
+    !         call fft(-1, u1, u2)
+        call fft(-1, u1, u1)
+        if (timers_enabled) call timer_stop(T_fft)
+        if (timers_enabled) call timer_start(T_checksum)
+    !         call checksum(iter, u2, dims(1), dims(2), dims(3))
+        call checksum(iter, u1, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_checksum)
+    end do
+
+    call verify(nx, ny, nz, niter, verified, class)
+
+    call timer_stop(t_total)
+    total_time = timer_read(t_total)
+
+    if( total_time /= 0. ) then
+        mflops = 1.0d-6*float(ntotal) * (14.8157 + 7.19641*log(float(ntotal))  &
+                 + (5.23518 + 7.21113*log(float(ntotal)))*niter) / total_time
+    else
+        mflops = 0.0
+    endif
+    call print_results('FT', class, nx, ny, nz, niter,  &
+                       total_time, mflops, '          floating point',  &
+                       verified, npbversion, compiletime,  &
+                       cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! touch all the big data
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = 0.d0
+                u1(i,j,k) = 0.d0
+                twiddle(i,j,k) = 0.d0
+            end do
+        end do
+    end do
+
+    return
+    end subroutine init_ui
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! evolve u0 -> u1 (t time steps) in fourier space
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha = 1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
+                u1(i,j,k) = u0(i,j,k)
+            end do
+        end do
+    end do
+
+    return
+    end subroutine evolve
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! Fill in array u0 with initial conditions from
+! random number generator
+!---------------------------------------------------------------------
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, T_fftx, T_ffty, T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, T_evolve = 4, T_checksum = 5, T_fftx = 6, T_ffty = 7, T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, pi = 3.141592653589793238d0, alpha = 1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double complex u0(d1+1, d2, d3)
+    integer :: k, j
+    double precision :: x0, start, an, dummy, starts(nz)
+          
+
+    start = seed
+!---------------------------------------------------------------------
+! Jump to the starting element for our first plane.
+!---------------------------------------------------------------------
+    call ipow46(a, 0, an)
+    dummy = randlc(start, an)
+    call ipow46(a, 2*nx*ny, an)
+
+    starts(1) = start
+    do k = 2, dims(3)
+        dummy = randlc(start, an)
+        starts(k) = start
+    end do
+          
+!---------------------------------------------------------------------
+! Go through by z planes filling in one square at a time.
+!---------------------------------------------------------------------
+! omp parallel do default(shared) private(k,j,x0)
+    do k = 1, dims(3)
+        x0 = starts(k)
+        do j = 1, dims(2)
+            call vranlc(2*nx, x0, a, u0(1, j, k))
+        end do
+    end do
+
+    return
+    end subroutine compute_initial_conditions
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine ipow46(a, exponent, result)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute a^exponent mod 2^46
+!---------------------------------------------------------------------
+
+    implicit none
+    double precision :: a, result, dummy, q, r
+    integer :: exponent, n, n2
+    external randlc
+    double precision :: randlc
+!---------------------------------------------------------------------
+! Use
+!   a^n = a^(n/2)*a^(n/2) if n even else
+!   a^n = a*a^(n-1)       if n odd
+!---------------------------------------------------------------------
+    result = 1
+    if (exponent == 0) return
+    q = a
+    r = 1
+    n = exponent
+
+
+    do while (n > 1)
+        n2 = n/2
+        if (n2 * 2 == n) then
+            dummy = randlc(q, q)
+            n = n2
+        else
+            dummy = randlc(r, q)
+            n = n-1
+        endif
+    end do
+    dummy = randlc(r, q)
+    result = r
+    return
+    end subroutine ipow46
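+
+!---------------------------------------------------------------------
+! Illustrative sketch, not part of the NPB source: the same
+! square-and-multiply recurrence as ipow46, but with plain integer
+! products instead of randlc's mod-2^46 multiply, so the loop logic
+! can be checked in isolation. The name intpow_demo is ours.
+!---------------------------------------------------------------------
+
+    integer function intpow_demo(base, exponent)
+
+    implicit none
+    integer :: base, exponent, n, q, r
+
+    intpow_demo = 1
+    if (exponent == 0) return
+    q = base
+    r = 1
+    n = exponent
+! loop invariant: base**exponent == r * q**n
+    do while (n > 1)
+        if (mod(n, 2) == 0) then
+            q = q*q
+            n = n/2
+        else
+            r = r*q
+            n = n-1
+        endif
+    end do
+    intpow_demo = r*q
+    return
+    end function intpow_demo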
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine setup
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: fstatus
+! !$    integer  omp_get_max_threads
+! !$    external omp_get_max_threads
+    debug = .FALSE.
+
+    open (unit=2,file='timer.flag',status='old',iostat=fstatus)
+    if (fstatus == 0) then
+        timers_enabled = .TRUE.
+        close(2)
+    else
+        timers_enabled = .FALSE.
+    endif
+
+    write(*, 1000)
+
+    niter = niter_default
+
+    write(*, 1001) nx, ny, nz
+    write(*, 1002) niter
+! !$    write(*, 1003) omp_get_max_threads()
+    write(*, *)
+
+
+    1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)',     ' - FT Benchmark', /)
+    1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
+    1002 format(' Iterations                  :', i7)
+    1003 format(' Number of available threads :', i7)
+
+    dims(1) = nx
+    dims(2) = ny
+    dims(3) = nz
+
+
+!---------------------------------------------------------------------
+! Set up info for blocking of ffts and transposes.  This improves
+! performance on cache-based systems. Blocking involves
+! working on a chunk of the problem at a time, taking chunks
+! along the first, second, or third dimension.
+
+! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+! Since 1st dim is always in processor, we'll assume it's long enough
+! (default blocking factor is 16 so min size for 1st dim is 16)
+! The only case we have to worry about is cffts1 in a 2d decomposition.
+! so the blocking factor should not be larger than the 2nd dimension.
+!---------------------------------------------------------------------
+
+    fftblock = fftblock_default
+    fftblockpad = fftblockpad_default
+
+    if (fftblock /= fftblock_default) fftblockpad = fftblock+3
+
+    return
+    end subroutine setup
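+
+!---------------------------------------------------------------------
+! Illustrative sketch, not part of the NPB source: the blocked staging
+! pattern the cffts* routines below rely on. A d2-wide dimension is
+! covered in chunks of blk pencils; each chunk is copied (transposed)
+! into a small scratch array so the transform works on cache-resident,
+! stride-one data. Assumes d2 is a multiple of blk, as in cffts1.
+!---------------------------------------------------------------------
+
+    subroutine blocked_stage_demo(d1, d2, blk, x, scratch)
+    implicit none
+    integer :: d1, d2, blk, i, j, jj
+    double complex x(d1, d2), scratch(blk, d1)
+
+    do jj = 0, d2 - blk, blk
+! stage blk pencils, transposed, into the scratch block
+        do j = 1, blk
+            do i = 1, d1
+                scratch(j, i) = x(i, j+jj)
+            enddo
+        enddo
+! ... transform scratch here, then copy it back, as cffts1 does ...
+    enddo
+    return
+    end subroutine blocked_stage_demo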
+
+          
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double precision :: twiddle(d1+1, d2, d3)
+    integer :: i, j, k, kk, kk2, jj, kj2, ii
+    double precision :: ap
+
+!---------------------------------------------------------------------
+! basically we want to convert the fortran indices
+!   1 2 3 4 5 6 7 8
+! to
+!   0 1 2 3 -4 -3 -2 -1
+! The following magic formula does the trick:
+! mod(i-1+n/2, n) - n/2
+!---------------------------------------------------------------------
+
+    ap = -4.d0 * alpha * pi * pi
+
+! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
+    do k = 1, dims(3)
+        kk =  mod(k-1+nz/2, nz) - nz/2
+        kk2 = kk*kk
+        do j = 1, dims(2)
+            jj = mod(j-1+ny/2, ny) - ny/2
+            kj2 = jj*jj+kk2
+            do i = 1, dims(1)
+                ii = mod(i-1+nx/2, nx) - nx/2
+                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
+            end do
+        end do
+    end do
+
+    return
+    end subroutine compute_indexmap
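+
+!---------------------------------------------------------------------
+! Worked check, not part of the NPB source: for n = 8 the folding
+! formula above, mod(i-1+n/2, n) - n/2, maps the Fortran indices
+! 1 2 3 4 5 6 7 8 to the signed frequencies 0 1 2 3 -4 -3 -2 -1.
+!---------------------------------------------------------------------
+
+    subroutine indexmap_demo
+    implicit none
+    integer :: i, n
+    n = 8
+    do i = 1, n
+        write (*, '(2i5)') i, mod(i-1+n/2, n) - n/2
+    end do
+    return
+    end subroutine indexmap_demo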
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine print_timers()
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: i
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    double precision :: t, t_m
+    character(25) :: tstrings(T_max)
+    data tstrings / '          total ',     '          setup ',     '            fft ',     '         evolve ',     '       checksum ',     '           fftx ',     '           ffty ',     '           fftz ' /
+
+    t_m = timer_read(T_total)
+    if (t_m <= 0.0d0) t_m = 1.0d0
+    do i = 1, t_max
+        t = timer_read(i)
+        write(*, 100) i, tstrings(i), t, t*100.0/t_m
+    end do
+    100 format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
+    return
+    end subroutine print_timers
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft(dir, x1, x2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: dir
+    double complex x1(ntotalp), x2(ntotalp)
+
+    double complex y1(fftblockpad_default*maxdim),     y2(fftblockpad_default*maxdim)
+
+!---------------------------------------------------------------------
+! note: args x1, x2 must be different arrays
+! note: args for cfftsx are (direction, layout, xin, xout, scratch)
+!       xin/xout may be the same and it can be somewhat faster
+!       if they are
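+! note: in the branches below only the final pass writes x2; the first
+!       two passes run in place on x1 (which is clobbered), which is
+!       why x1 and x2 must be distinct arrays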
+!---------------------------------------------------------------------
+
+    if (dir == 1) then
+        call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    else
+        call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    endif
+    return
+    end subroutine fft
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd1
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
+    integer :: i, j, k, jj
+
+    logd1 = ilog2(d1)
+
+    if (timers_enabled) call timer_start(T_fftx)
+! omp parallel do default(shared) private(i,j,k,jj,y1,y2) shared(is,logd1,d1)
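+! for each z-plane: stage fftblock pencils into y1, transposed so the
+! transform in cfftz runs down contiguous columns, then copy the
+! transformed block back into xout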
+    do k = 1, d3
+        do jj = 0, d2 - fftblock, fftblock
+            do j = 1, fftblock
+                do i = 1, d1
+                    y1(j,i) = x(i,j+jj,k)
+                enddo
+            enddo
+                        
+            call cfftz (is, logd1, d1, y1, y2)
+
+
+            do j = 1, fftblock
+                do i = 1, d1
+                    xout(i,j+jj,k) = y1(j,i)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftx)
+
+    return
+    end subroutine cffts1
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd2
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
+    integer :: i, j, k, ii
+
+    logd2 = ilog2(d2)
+
+    if (timers_enabled) call timer_start(T_ffty)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is,logd2,d2)
+    do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+            do j = 1, d2
+                do i = 1, fftblock
+                    y1(i,j) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd2, d2, y1, y2)
+                       
+            do j = 1, d2
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,j)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_ffty)
+
+    return
+    end subroutine cffts2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd3
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
+    integer :: i, j, k, ii
+
+    logd3 = ilog2(d3)
+
+    if (timers_enabled) call timer_start(T_fftz)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2) shared(is)
+    do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+            do k = 1, d3
+                do i = 1, fftblock
+                    y1(i,k) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd3, d3, y1, y2)
+
+            do k = 1, d3
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,k)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftz)
+
+    return
+    end subroutine cffts3
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft_init (n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute the roots-of-unity array that will be used for subsequent FFTs.
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: m,n,nu,ku,i,j,ln
+    double precision :: t, ti
+
+
+!---------------------------------------------------------------------
+!   Initialize the U array with sines and cosines in a manner that permits
+!   stride one access at each FFT iteration.
+!---------------------------------------------------------------------
+    nu = n
+    m = ilog2(n)
+    u(1) = m
+    ku = 2
+    ln = 1
+
+    do j = 1, m
+        t = pi / ln
+                 
+        do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+        enddo
+                 
+        ku = ku + ln
+        ln = 2 * ln
+    enddo
+          
+    return
+    end subroutine fft_init
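+
+!---------------------------------------------------------------------
+! Layout sketch, not part of the NPB source: after fft_init(n) with
+! m = ilog2(n), u(1) holds m and the roots for FFT level j (there are
+! 2**(j-1) of them) start at offset 2**(j-1)+1, i.e.
+! u = [ m | 1 root | 2 roots | 4 roots | ... ]. The demo below just
+! prints those per-level offsets.
+!---------------------------------------------------------------------
+
+    subroutine fft_init_layout_demo(n)
+    implicit none
+    integer :: n, m, j, ku, ln
+    external ilog2
+    integer :: ilog2
+    m = ilog2(n)
+    ku = 2
+    ln = 1
+    do j = 1, m
+        write (*, '(a,i3,a,i7,a,i7)') ' level', j, ': offset', ku, ', count', ln
+        ku = ku + ln
+        ln = 2*ln
+    end do
+    return
+    end subroutine fft_init_layout_demo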
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cfftz (is, m, n, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the array U must be initialized by calling FFT_INIT
+!   with the maximum transform length; U(1) then holds MX, the maximum
+!   value of M accepted by any subsequent call.
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: is,m,n,i,j,l,mx
+    double complex x, y
+
+    dimension x(fftblockpad,n), y(fftblockpad,n)
+
+!---------------------------------------------------------------------
+!   Check if input parameters are invalid.
+!---------------------------------------------------------------------
+    mx = u(1)
+    if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) then
+        write (*, 1)  is, m, mx
+        1 format ('CFFTZ: Either U has not been initialized, or else'/ 'one of the input parameters is invalid', 3I5)
+        stop
+    endif
+
+!---------------------------------------------------------------------
+!   Perform one variant of the Stockham FFT.
+!---------------------------------------------------------------------
+    do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l == m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+    enddo
+
+    goto 180
+
+!---------------------------------------------------------------------
+!   Copy Y to X.
+!---------------------------------------------------------------------
+    160 do j = 1, n
+        do i = 1, fftblock
+            x(i,j) = y(i,j)
+        enddo
+    enddo
+
+    180 continue
+
+    return
+    end subroutine cfftz
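+
+!---------------------------------------------------------------------
+! Usage sketch, not part of the NPB source: one batch of fftblock
+! forward n-point transforms through cfftz, assuming fft_init has
+! already been called with a length >= n. Pass -1 instead of 1 for the
+! inverse transform.
+!---------------------------------------------------------------------
+
+    subroutine cfftz_usage_demo(n, x, y)
+    implicit none
+    integer :: n
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+    double complex x(fftblockpad, n), y(fftblockpad, n)
+    external ilog2
+    integer :: ilog2
+    call cfftz(1, ilog2(n), n, x, y)
+    return
+    end subroutine cfftz_usage_demo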
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Performs the L-th iteration of the second variant of the Stockham FFT.
+!---------------------------------------------------------------------
+
+    implicit none
+
+    integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+    double complex u,x,y,u1,x11,x21
+    dimension u(n), x(ny1,n), y(ny1,n)
+
+
+!---------------------------------------------------------------------
+!   Set initial parameters.
+!---------------------------------------------------------------------
+
+    n1 = n / 2
+    lk = 2 ** (l - 1)
+    li = 2 ** (m - l)
+    lj = 2 * lk
+    ku = li + 1
+
+    do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is >= 1) then
+            u1 = u(ku+i)
+        else
+            u1 = dconjg (u(ku+i))
+        endif
+
+    !---------------------------------------------------------------------
+    !   This loop is vectorizable.
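+    !   It applies the radix-2 Stockham butterfly: with a = x(j,i11+k)
+    !   and b = x(j,i12+k) taken n/2 apart, y(j,i21+k) gets a+b and
+    !   y(j,i22+k) gets u1*(a-b), so no separate bit-reversal pass is
+    !   needed.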
+    !---------------------------------------------------------------------
+        do k = 0, lk - 1
+            do j = 1, ny
+                x11 = x(j,i11+k)
+                x21 = x(j,i12+k)
+                y(j,i21+k) = x11 + x21
+                y(j,i22+k) = u1 * (x11 - x21)
+            enddo
+        enddo
+    enddo
+
+    return
+    end subroutine fftz2
+
+!---------------------------------------------------------------------
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    integer function ilog2(n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: n, nn, lg
+    if (n == 1) then
+        ilog2=0
+        return
+    endif
+    lg = 1
+    nn = 2
+    do while (nn < n)
+        nn = nn*2
+        lg = lg+1
+    end do
+    ilog2 = lg
+    return
+    end function ilog2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine checksum(i, u1, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum,     T_fftx, T_ffty,     T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3,     T_evolve = 4, T_checksum = 5,     T_fftx = 6,     T_ffty = 7,     T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0,     pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: i, d1, d2, d3
+    double complex u1(d1+1,d2,d3)
+    integer :: j, q,r,s
+    double complex chk
+    chk = (0.0,0.0)
+
+! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk)
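+! the (q,r,s) triples below walk the grid in a fixed pseudo-random
+! pattern (strides 1, 3 and 5 modulo nx, ny, nz), so the 1024-point
+! sum is a cheap, reproducible fingerprint of u1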
+    do j=1,1024
+        q = mod(j, nx)+1
+        r = mod(3*j,ny)+1
+        s = mod(5*j,nz)+1
+        chk=chk+u1(q,r,s)
+    end do
+
+    chk = chk/dble(ntotal)
+          
+    write (*, 30) i, chk
+    30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+    sums(i) = chk
+    return
+    end subroutine checksum
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine verify (d1, d2, d3, nt, verified, class)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3, nt
+    character class
+    logical :: verified
+    integer :: i
+    double precision :: err, epsilon
+
+!---------------------------------------------------------------------
+!   Reference checksums
+!---------------------------------------------------------------------
+    double complex csum_ref(25)
+
+
+    class = 'U'
+
+    epsilon = 1.0d-12
+    verified = .FALSE. 
+
+    if (d1 == 64 .AND. &
+    d2 == 64 .AND. &
+    d3 == 64 .AND. &
+    nt == 6) then
+    !---------------------------------------------------------------------
+    !   Sample size reference checksums
+    !---------------------------------------------------------------------
+        class = 'S'
+        csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+        csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+        csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+        csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+        csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+        csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+    else if (d1 == 128 .AND. &
+        d2 == 128 .AND. &
+        d3 == 32 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class W size reference checksums
+    !---------------------------------------------------------------------
+        class = 'W'
+        csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+        csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+        csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+        csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+        csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+        csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+    else if (d1 == 256 .AND. &
+        d2 == 256 .AND. &
+        d3 == 128 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class A size reference checksums
+    !---------------------------------------------------------------------
+        class = 'A'
+        csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+        csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+        csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+        csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+        csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+        csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+              
+    else if (d1 == 512 .AND. &
+        d2 == 256 .AND. &
+        d3 == 256 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class B size reference checksums
+    !---------------------------------------------------------------------
+        class = 'B'
+        csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+        csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+        csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+        csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+        csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+        csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+        csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+        csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+        csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+        csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+        csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+        csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+        csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+        csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+        csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+        csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+        csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+        csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+        csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+        csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+    else if (d1 == 512 .AND. &
+        d2 == 512 .AND. &
+        d3 == 512 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class C size reference checksums
+    !---------------------------------------------------------------------
+        class = 'C'
+        csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+        csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+        csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+        csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+        csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+        csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+        csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+        csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+        csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+        csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+        csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+        csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+        csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+        csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+        csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+        csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+        csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+        csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+        csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+        csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+    else if (d1 == 2048 .AND. &
+        d2 == 1024 .AND. &
+        d3 == 1024 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class D size reference checksums
+    !---------------------------------------------------------------------
+        class = 'D'
+        csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+        csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+        csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+        csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+        csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+        csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+        csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+        csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+        csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+        csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+        csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+        csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+        csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+        csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+        csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+        csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+        csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+        csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+        csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+        csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+        csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+        csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+        csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+        csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+        csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+    else if (d1 == 4096 .AND. &
+        d2 == 2048 .AND. &
+        d3 == 2048 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class E size reference checksums
+    !---------------------------------------------------------------------
+        class = 'E'
+        csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+        csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+        csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+        csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+        csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+        csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+        csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+        csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+        csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+        csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+        csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+        csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+        csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+        csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+        csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+        csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+        csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+        csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+        csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+        csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+        csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+        csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+        csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+        csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+        csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+    endif
+
+
+    if (class /= 'U') then
+
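+    ! (editorial note) err below is the magnitude of the complex
+    ! relative error against the reference checksum; verification
+    ! requires err <= epsilon = 1.0d-12 for all nt recorded checksums.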
+        do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if ( .NOT. (err <= epsilon)) goto 100
+        end do
+        verified = .TRUE. 
+        100 continue
+
+    endif
+
+             
+    if (class /= 'U') then
+        if (verified) then
+            write(*,2000)
+            2000 format(' Result verification successful')
+        else
+            write(*,2001)
+            2001 format(' Result verification failed')
+        endif
+    endif
+    print *, 'class = ', class
+
+    return
+    end subroutine verify
+
+
diff --git a/Parser/test-data/nas/ft4.f b/Parser/test-data/nas/ft4.f
new file mode 100644
index 0000000..6cef1c4
--- /dev/null
+++ b/Parser/test-data/nas/ft4.f
@@ -0,0 +1,1106 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                       O p e n M P     V E R S I O N                     !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is an OpenMP version of the NPB FT code.              !
+!    It is described in NAS Technical Report 99-011.                      !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!---------------------------------------------------------------------
+
+! Authors: D. Bailey
+!          W. Saphir
+!          H. Jin
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! FT benchmark
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    program ft
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+
+    implicit none
+
+    include 'global.h'
+    integer :: i
+          
+!---------------------------------------------------------------------
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
+! views
+!  - u0 contains the (transformed) initial condition
+!  - u1 is a working array (u2 is commented out in this version)
+!  - twiddle contains exponents for the time evolution operator.
+!---------------------------------------------------------------------
+
+    double complex   u0(ntotalp), &
+    u1(ntotalp)
+!     >                 u2(ntotalp)
+    double precision :: twiddle(ntotalp)
+!---------------------------------------------------------------------
+! Large arrays are in common so that they are allocated on the
+! heap rather than the stack. This common block is not
+! referenced directly anywhere else. Padding is to avoid accidental
+! cache problems, since all array sizes are powers of two.
+!---------------------------------------------------------------------
+
+!      double complex pad1(3), pad2(3), pad3(3)
+!      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+    double complex pad1(3), pad2(3)
+    common /bigarrays/ u0, pad1, u1, pad2, twiddle
+
+    integer :: iter
+    double precision :: total_time, mflops
+    logical :: verified
+    character class
+
+!---------------------------------------------------------------------
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+    call setup()
+    call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+    call fft_init (dims(1))
+    call fft(1, u1, u0)
+
+!---------------------------------------------------------------------
+! Start over from the beginning. Note that all operations must
+! be timed, in contrast to other benchmarks.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+
+    call timer_start(T_total)
+    if (timers_enabled) call timer_start(T_setup)
+
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+
+    call fft_init (dims(1))
+
+    if (timers_enabled) call timer_stop(T_setup)
+    if (timers_enabled) call timer_start(T_fft)
+    call fft(1, u1, u0)
+    if (timers_enabled) call timer_stop(T_fft)
+
+    do iter = 1, niter
+        if (timers_enabled) call timer_start(T_evolve)
+        call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_evolve)
+        if (timers_enabled) call timer_start(T_fft)
+    !         call fft(-1, u1, u2)
+        call fft(-1, u1, u1)
+        if (timers_enabled) call timer_stop(T_fft)
+        if (timers_enabled) call timer_start(T_checksum)
+    !         call checksum(iter, u2, dims(1), dims(2), dims(3))
+        call checksum(iter, u1, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_checksum)
+    end do
+
+    call verify(nx, ny, nz, niter, verified, class)
+
+    call timer_stop(t_total)
+    total_time = timer_read(t_total)
+
+    if( total_time /= 0. ) then
+        mflops = 1.0d-6*float(ntotal) * &
+        (14.8157+7.19641*log(float(ntotal)) &
+        +  (5.23518+7.21113*log(float(ntotal)))*niter) &
+        /total_time
+    else
+        mflops = 0.0
+    endif
+    call print_results('FT', class, nx, ny, nz, niter, &
+    total_time, mflops, '          floating point', verified, &
+    npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! touch all the big data
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = 0.d0
+                u1(i,j,k) = 0.d0
+                twiddle(i,j,k) = 0.d0
+            end do
+        end do
+    end do
+
+    return
+    end subroutine init_ui
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! evolve u0 -> u1 (t time steps) in fourier space
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
+                u1(i,j,k) = u0(i,j,k)
+            end do
+        end do
+    end do
+
+    return
+    end subroutine evolve
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! Fill in array u0 with initial conditions from
+! random number generator
+!---------------------------------------------------------------------
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double complex u0(d1+1, d2, d3)
+    integer :: k, j
+    double precision :: x0, start, an, dummy, starts(nz)
+          
+
+    start = seed
+!---------------------------------------------------------------------
+! Jump to the starting element for our first plane.
+!---------------------------------------------------------------------
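+! (editorial note) each z plane holds nx*ny complex values, i.e.
+! 2*nx*ny random reals, which is why the seed for plane k+1 below is
+! obtained from plane k via multiplication by an = a^(2*nx*ny).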
+    call ipow46(a, 0, an)
+    dummy = randlc(start, an)
+    call ipow46(a, 2*nx*ny, an)
+
+    starts(1) = start
+    do k = 2, dims(3)
+        dummy = randlc(start, an)
+        starts(k) = start
+    end do
+          
+!---------------------------------------------------------------------
+! Go through the z planes, filling in one square at a time.
+!---------------------------------------------------------------------
+! omp parallel do default(shared) private(k,j,x0)
+    do k = 1, dims(3)
+        x0 = starts(k)
+        do j = 1, dims(2)
+            call vranlc(2*nx, x0, a, u0(1, j, k))
+        end do
+    end do
+
+    return
+    end subroutine compute_initial_conditions
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine ipow46(a, exponent, result)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute a^exponent mod 2^46
+!---------------------------------------------------------------------
+
+    implicit none
+    double precision :: a, result, dummy, q, r
+    integer :: exponent, n, n2
+    external randlc
+    double precision :: randlc
+!---------------------------------------------------------------------
+! Use
+!   a^n = a^(n/2)*a^(n/2) if n even else
+!   a^n = a*a^(n-1)       if n odd
+!---------------------------------------------------------------------
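+! (editorial worked example, assuming randlc(x, y) updates x to
+! x*y mod 2^46): for exponent = 6 the loop below runs
+!   n=6 even: q = a^2    n=3 odd: r = a^2    n=2 even: q = a^4
+! and the final randlc(r, q) leaves result = a^2 * a^4 = a^6.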
+    result = 1
+    if (exponent == 0) return
+    q = a
+    r = 1
+    n = exponent
+
+
+    do while (n > 1)
+        n2 = n/2
+        if (n2 * 2 == n) then
+            dummy = randlc(q, q)
+            n = n2
+        else
+            dummy = randlc(r, q)
+            n = n-1
+        endif
+    end do
+    dummy = randlc(r, q)
+    result = r
+    return
+    end subroutine ipow46
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine setup
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: fstatus
+!$    integer  omp_get_max_threads
+!$    external omp_get_max_threads
+    debug = .FALSE. 
+
+    open (unit=2,file='timer.flag',status='old',iostat=fstatus)
+    if (fstatus == 0) then
+        timers_enabled = .TRUE. 
+        close(2)
+    else
+        timers_enabled = .FALSE. 
+    endif
+
+    write(*, 1000)
+
+    niter = niter_default
+
+    write(*, 1001) nx, ny, nz
+    write(*, 1002) niter
+!$    write(*, 1003) omp_get_max_threads()
+    write(*, *)
+
+
+    1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', &
+    ' - FT Benchmark', /)
+    1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
+    1002 format(' Iterations                  :', i7)
+    1003 format(' Number of available threads :', i7)
+
+    dims(1) = nx
+    dims(2) = ny
+    dims(3) = nz
+
+
+!---------------------------------------------------------------------
+! Set up info for blocking of ffts and transposes.  This improves
+! performance on cache-based systems. Blocking involves
+! working on a chunk of the problem at a time, taking chunks
+! along the first, second, or third dimension.
+
+! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+! Since 1st dim is always in processor, we'll assume it's long enough
+! (default blocking factor is 16 so min size for 1st dim is 16)
+! The only case we have to worry about is cffts1 in a 2d decomposition,
+! so the blocking factor should not be larger than the 2nd dimension.
+!---------------------------------------------------------------------
+
+    fftblock = fftblock_default
+    fftblockpad = fftblockpad_default
+
+    if (fftblock /= fftblock_default) fftblockpad = fftblock+3
+
+    return
+    end subroutine setup
+
+          
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3
+    double precision :: twiddle(d1+1, d2, d3)
+    integer :: i, j, k, kk, kk2, jj, kj2, ii
+    double precision :: ap
+
+!---------------------------------------------------------------------
+! basically we want to convert the fortran indices
+!   1 2 3 4 5 6 7 8
+! to
+!   0 1 2 3 -4 -3 -2 -1
+! The following magic formula does the trick:
+! mod(i-1+n/2, n) - n/2
+!---------------------------------------------------------------------
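+! (editorial worked example: for n = 8, i = 5 the formula gives
+! mod(5-1+4, 8) - 4 = mod(8, 8) - 4 = -4, so the upper half of the
+! indices wraps to the negative frequencies shown above.)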
+
+    ap = - 4.d0 * alpha * pi *pi
+
+! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
+    do k = 1, dims(3)
+        kk =  mod(k-1+nz/2, nz) - nz/2
+        kk2 = kk*kk
+        do j = 1, dims(2)
+            jj = mod(j-1+ny/2, ny) - ny/2
+            kj2 = jj*jj+kk2
+            do i = 1, dims(1)
+                ii = mod(i-1+nx/2, nx) - nx/2
+                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
+            end do
+        end do
+    end do
+
+    return
+    end subroutine compute_indexmap
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine print_timers()
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: i
+    include 'global.h'
+    double precision :: t, t_m
+    character(25) :: tstrings(T_max)
+    data tstrings / '          total ', &
+    '          setup ', &
+    '            fft ', &
+    '         evolve ', &
+    '       checksum ', &
+    '           fftx ', &
+    '           ffty ', &
+    '           fftz ' /
+
+    t_m = timer_read(T_total)
+    if (t_m <= 0.0d0) t_m = 1.0d0
+    do i = 1, t_max
+        t = timer_read(i)
+        write(*, 100) i, tstrings(i), t, t*100.0/t_m
+    end do
+    100 format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
+    return
+    end subroutine print_timers
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft(dir, x1, x2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: dir
+    double complex x1(ntotalp), x2(ntotalp)
+
+    double complex y1(fftblockpad_default*maxdim), &
+    y2(fftblockpad_default*maxdim)
+
+!---------------------------------------------------------------------
+! note: args x1, x2 must be different arrays
+! note: args for cfftsx are (direction, layout, xin, xout, scratch)
+!       xin/xout may be the same and it can be somewhat faster
+!       if they are
+!---------------------------------------------------------------------
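+! (editorial note) accordingly, both branches below transform x1 in
+! place for the first two passes and write into x2 only in the final
+! pass, exploiting the xin == xout fast path mentioned above.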
+
+    if (dir == 1) then
+        call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    else
+        call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    endif
+    return
+    end subroutine fft
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd1
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
+    integer :: i, j, k, jj
+
+    logd1 = ilog2(d1)
+
+    if (timers_enabled) call timer_start(T_fftx)
+! omp parallel do default(shared) private(i,j,k,jj,y1,y2)
+! omp&  shared(is,logd1,d1)
+    do k = 1, d3
+        do jj = 0, d2 - fftblock, fftblock
+            do j = 1, fftblock
+                do i = 1, d1
+                    y1(j,i) = x(i,j+jj,k)
+                enddo
+            enddo
+                        
+            call cfftz (is, logd1, d1, y1, y2)
+
+
+            do j = 1, fftblock
+                do i = 1, d1
+                    xout(i,j+jj,k) = y1(j,i)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftx)
+
+    return
+    end subroutine cffts1
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd2
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
+    integer :: i, j, k, ii
+
+    logd2 = ilog2(d2)
+
+    if (timers_enabled) call timer_start(T_ffty)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2)
+! omp&  shared(is,logd2,d2)
+    do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+            do j = 1, d2
+                do i = 1, fftblock
+                    y1(i,j) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd2, d2, y1, y2)
+                       
+            do j = 1, d2
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,j)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_ffty)
+
+    return
+    end subroutine cffts2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+    include 'global.h'
+    integer :: is, d1, d2, d3, logd3
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
+    integer :: i, j, k, ii
+
+    logd3 = ilog2(d3)
+
+    if (timers_enabled) call timer_start(T_fftz)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2)
+! omp&  shared(is)
+    do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+            do k = 1, d3
+                do i = 1, fftblock
+                    y1(i,k) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd3, d3, y1, y2)
+
+            do k = 1, d3
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,k)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftz)
+
+    return
+    end subroutine cffts3
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft_init (n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute the roots-of-unity array that will be used for subsequent FFTs.
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: m,n,nu,ku,i,j,ln
+    double precision :: t, ti
+
+
+!---------------------------------------------------------------------
+!   Initialize the U array with sines and cosines in a manner that permits
+!   stride one access at each FFT iteration.
+!---------------------------------------------------------------------
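+! (editorial sketch) for n = 8 (m = 3) the loop below produces
+!   u(1) = 3,
+!   u(2)    = exp(i*pi*0/1),
+!   u(3..4) = exp(i*pi*k/2), k = 0..1,
+!   u(5..8) = exp(i*pi*k/4), k = 0..3,
+! one group of ln roots per FFT level.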
+    nu = n
+    m = ilog2(n)
+    u(1) = m
+    ku = 2
+    ln = 1
+
+    do j = 1, m
+        t = pi / ln
+                 
+        do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+        enddo
+                 
+        ku = ku + ln
+        ln = 2 * ln
+    enddo
+          
+    return
+    end subroutine fft_init
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cfftz (is, m, n, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the array U must be initialized by calling FFT_INIT
+!   with N set to 2^MX, where MX is the maximum value of M for any
+!   subsequent call; FFT_INIT stores MX in U(1).
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+
+    integer :: is,m,n,i,j,l,mx
+    double complex x, y
+
+    dimension x(fftblockpad,n), y(fftblockpad,n)
+
+!---------------------------------------------------------------------
+!   Check if input parameters are invalid.
+!---------------------------------------------------------------------
+    mx = u(1)
+    if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) &
+    then
+        write (*, 1)  is, m, mx
+        1 format ('CFFTZ: Either U has not been initialized, or else'/ &
+        'one of the input parameters is invalid', 3I5)
+        stop
+    endif
+
+!---------------------------------------------------------------------
+!   Perform one variant of the Stockham FFT.
+!---------------------------------------------------------------------
+    do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l == m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+    enddo
+
+    goto 180
+
+!---------------------------------------------------------------------
+!   Copy Y to X.
+!---------------------------------------------------------------------
+    160 do j = 1, n
+        do i = 1, fftblock
+            x(i,j) = y(i,j)
+        enddo
+    enddo
+
+    180 continue
+
+    return
+    end subroutine cfftz
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Performs the L-th iteration of the second variant of the Stockham FFT.
+!---------------------------------------------------------------------
+
+    implicit none
+
+    integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+    double complex u,x,y,u1,x11,x21
+    dimension u(n), x(ny1,n), y(ny1,n)
+
+
+!---------------------------------------------------------------------
+!   Set initial parameters.
+!---------------------------------------------------------------------
+
+    n1 = n / 2
+    lk = 2 ** (l - 1)
+    li = 2 ** (m - l)
+    lj = 2 * lk
+    ku = li + 1
+
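+! (editorial note) each iteration below combines elements n/2 apart
+! in x (i11, i12) into adjacent output blocks lk apart in y (i21,
+! i22), with the twiddle factor u1 conjugated for inverse transforms.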
+    do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is >= 1) then
+            u1 = u(ku+i)
+        else
+            u1 = dconjg (u(ku+i))
+        endif
+
+    !---------------------------------------------------------------------
+    !   This loop is vectorizable.
+    !---------------------------------------------------------------------
+        do k = 0, lk - 1
+            do j = 1, ny
+                x11 = x(j,i11+k)
+                x21 = x(j,i12+k)
+                y(j,i21+k) = x11 + x21
+                y(j,i22+k) = u1 * (x11 - x21)
+            enddo
+        enddo
+    enddo
+
+    return
+    end subroutine fftz2
+
+!---------------------------------------------------------------------
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    integer function ilog2(n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: n, nn, lg
+    if (n == 1) then
+        ilog2=0
+        return
+    endif
+    lg = 1
+    nn = 2
+    do while (nn < n)
+        nn = nn*2
+        lg = lg+1
+    end do
+    ilog2 = lg
+    return
+    end function ilog2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine checksum(i, u1, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: i, d1, d2, d3
+    double complex u1(d1+1,d2,d3)
+    integer :: j, q,r,s
+    double complex chk
+    chk = (0.0,0.0)
+
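+! (editorial note) the checksum samples u1 at 1024 pseudo-random
+! sites, stepping through the grid with strides 1, 3 and 5 modulo
+! nx, ny, nz, and normalizes the sum by the logical size ntotal.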
+! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk)
+    do j=1,1024
+        q = mod(j, nx)+1
+        r = mod(3*j,ny)+1
+        s = mod(5*j,nz)+1
+        chk=chk+u1(q,r,s)
+    end do
+
+    chk = chk/dble(ntotal)
+          
+    write (*, 30) i, chk
+    30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+    sums(i) = chk
+    return
+    end subroutine checksum
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine verify (d1, d2, d3, nt, verified, class)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    include 'global.h'
+    integer :: d1, d2, d3, nt
+    character class
+    logical :: verified
+    integer :: i
+    double precision :: err, epsilon
+
+!---------------------------------------------------------------------
+!   Reference checksums
+!---------------------------------------------------------------------
+    double complex csum_ref(25)
+
+
+    class = 'U'
+
+    epsilon = 1.0d-12
+    verified = .FALSE. 
+
+    if (d1 == 64 .AND. &
+    d2 == 64 .AND. &
+    d3 == 64 .AND. &
+    nt == 6) then
+    !---------------------------------------------------------------------
+    !   Sample size reference checksums
+    !---------------------------------------------------------------------
+        class = 'S'
+        csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+        csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+        csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+        csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+        csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+        csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+    else if (d1 == 128 .AND. &
+        d2 == 128 .AND. &
+        d3 == 32 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class W size reference checksums
+    !---------------------------------------------------------------------
+        class = 'W'
+        csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+        csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+        csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+        csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+        csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+        csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+    else if (d1 == 256 .AND. &
+        d2 == 256 .AND. &
+        d3 == 128 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class A size reference checksums
+    !---------------------------------------------------------------------
+        class = 'A'
+        csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+        csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+        csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+        csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+        csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+        csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+              
+    else if (d1 == 512 .AND. &
+        d2 == 256 .AND. &
+        d3 == 256 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class B size reference checksums
+    !---------------------------------------------------------------------
+        class = 'B'
+        csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+        csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+        csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+        csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+        csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+        csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+        csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+        csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+        csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+        csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+        csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+        csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+        csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+        csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+        csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+        csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+        csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+        csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+        csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+        csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+    else if (d1 == 512 .AND. &
+        d2 == 512 .AND. &
+        d3 == 512 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class C size reference checksums
+    !---------------------------------------------------------------------
+        class = 'C'
+        csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+        csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+        csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+        csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+        csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+        csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+        csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+        csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+        csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+        csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+        csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+        csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+        csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+        csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+        csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+        csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+        csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+        csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+        csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+        csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+    else if (d1 == 2048 .AND. &
+        d2 == 1024 .AND. &
+        d3 == 1024 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class D size reference checksums
+    !---------------------------------------------------------------------
+        class = 'D'
+        csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+        csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+        csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+        csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+        csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+        csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+        csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+        csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+        csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+        csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+        csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+        csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+        csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+        csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+        csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+        csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+        csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+        csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+        csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+        csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+        csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+        csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+        csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+        csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+        csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+    else if (d1 == 4096 .AND. &
+        d2 == 2048 .AND. &
+        d3 == 2048 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class E size reference checksums
+    !---------------------------------------------------------------------
+        class = 'E'
+        csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+        csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+        csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+        csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+        csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+        csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+        csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+        csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+        csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+        csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+        csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+        csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+        csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+        csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+        csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+        csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+        csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+        csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+        csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+        csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+        csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+        csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+        csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+        csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+        csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+    endif
+
+
+    if (class /= 'U') then
+
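+    ! (editorial note) err below is the magnitude of the complex
+    ! relative error against the reference checksum; verification
+    ! requires err <= epsilon = 1.0d-12 for all nt recorded checksums.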
+        do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if ( .NOT. (err <= epsilon)) goto 100
+        end do
+        verified = .TRUE. 
+        100 continue
+
+    endif
+
+             
+    if (class /= 'U') then
+        if (verified) then
+            write(*,2000)
+            2000 format(' Result verification successful')
+        else
+            write(*,2001)
+            2001 format(' Result verification failed')
+        endif
+    endif
+    print *, 'class = ', class
+
+    return
+    end subroutine verify
+
+
diff --git a/Parser/test-data/nas/ft5.f b/Parser/test-data/nas/ft5.f
new file mode 100644
index 0000000..47dafa5
--- /dev/null
+++ b/Parser/test-data/nas/ft5.f
@@ -0,0 +1,2954 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                       O p e n M P     V E R S I O N                     !
+!                                                                         !
+!                                   F T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is an OpenMP version of the NPB FT code.              !
+!    It is described in NAS Technical Report 99-011.                      !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+!---------------------------------------------------------------------
+
+! Authors: D. Bailey
+!          W. Saphir
+!          H. Jin
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! FT benchmark
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    program ft
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: i
+          
+!---------------------------------------------------------------------
+! u0, u1, u2 are the main arrays in the problem.
+! Depending on the decomposition, these arrays will have different
+! dimensions. To accommodate all possibilities, we allocate them as
+! one-dimensional arrays and pass them to subroutines for different
+! views
+!  - u0 contains the (transformed) initial condition
+!  - u1 and u2 are working arrays
+!  - twiddle contains exponents for the time evolution operator.
+!---------------------------------------------------------------------
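+! (The different views use Fortran sequence association: u0 is
+! declared u0(ntotalp) here but re-dimensioned as u0(d1+1,d2,d3)
+! inside init_ui, evolve, and the other subroutines below.)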
+
+    double complex   u0(ntotalp), &
+    u1(ntotalp)
+!     >                 u2(ntotalp)
+    double precision :: twiddle(ntotalp)
+!---------------------------------------------------------------------
+! Large arrays are in common so that they are allocated on the
+! heap rather than the stack. This common block is not
+! referenced directly anywhere else. Padding is to avoid accidental
+! cache problems, since all array sizes are powers of two.
+!---------------------------------------------------------------------
+
+!      double complex pad1(3), pad2(3), pad3(3)
+!      common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+    double complex pad1(3), pad2(3)
+    common /bigarrays/ u0, pad1, u1, pad2, twiddle
+
+    integer :: iter
+    double precision :: total_time, mflops
+    logical :: verified
+    character class
+
+!---------------------------------------------------------------------
+! Run the entire problem once to make sure all data is touched.
+! This reduces variable startup costs, which is important for such a
+! short benchmark. The other NPB 2 implementations are similar.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+    call setup()
+    call init_ui(u0, u1, twiddle, dims(1), dims(2), dims(3))
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+    call fft_init (dims(1))
+    call fft(1, u1, u0)
+
+!---------------------------------------------------------------------
+! Start over from the beginning. Note that all operations must
+! be timed, in contrast to other benchmarks.
+!---------------------------------------------------------------------
+    do i = 1, t_max
+        call timer_clear(i)
+    end do
+
+    call timer_start(T_total)
+    if (timers_enabled) call timer_start(T_setup)
+
+    call compute_indexmap(twiddle, dims(1), dims(2), dims(3))
+
+    call compute_initial_conditions(u1, dims(1), dims(2), dims(3))
+
+    call fft_init (dims(1))
+
+    if (timers_enabled) call timer_stop(T_setup)
+    if (timers_enabled) call timer_start(T_fft)
+    call fft(1, u1, u0)
+    if (timers_enabled) call timer_stop(T_fft)
+
+    do iter = 1, niter
+        if (timers_enabled) call timer_start(T_evolve)
+        call evolve(u0, u1, twiddle, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_evolve)
+        if (timers_enabled) call timer_start(T_fft)
+    !         call fft(-1, u1, u2)
+        call fft(-1, u1, u1)
+        if (timers_enabled) call timer_stop(T_fft)
+        if (timers_enabled) call timer_start(T_checksum)
+    !         call checksum(iter, u2, dims(1), dims(2), dims(3))
+        call checksum(iter, u1, dims(1), dims(2), dims(3))
+        if (timers_enabled) call timer_stop(T_checksum)
+    end do
+
+    call verify(nx, ny, nz, niter, verified, class)
+
+    call timer_stop(t_total)
+    total_time = timer_read(t_total)
+
+    if (total_time /= 0.) then
+        mflops = 1.0d-6*float(ntotal) * &
+        (14.8157+7.19641*log(float(ntotal)) &
+        +  (5.23518+7.21113*log(float(ntotal)))*niter) &
+        /total_time
+    else
+        mflops = 0.0
+    endif
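+! (The bracketed expression is the per-point flop-count model: a
+! setup term plus niter per-iteration terms, each O(log(ntotal)),
+! so mflops scales as ntotal*log(ntotal)/total_time.)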
+    call print_results('FT', class, nx, ny, nz, niter, &
+    total_time, mflops, '          floating point', verified, &
+    npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+    if (timers_enabled) call print_timers()
+
+    END PROGRAM
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine init_ui(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! touch all the big data
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = 0.d0
+                u1(i,j,k) = 0.d0
+                twiddle(i,j,k) = 0.d0
+            end do
+        end do
+    end do
+
+    return
+    end subroutine init_ui
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! evolve u0 -> u1 (t time steps) in Fourier space
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double complex   u0(d1+1,d2,d3)
+    double complex   u1(d1+1,d2,d3)
+    double precision :: twiddle(d1+1,d2,d3)
+    integer :: i, j, k
+
+! omp parallel do default(shared) private(i,j,k)
+    do k = 1, d3
+        do j = 1, d2
+            do i = 1, d1
+                u0(i,j,k) = u0(i,j,k) * twiddle(i,j,k)
+                u1(i,j,k) = u0(i,j,k)
+            end do
+        end do
+    end do
+
+    return
+    end subroutine evolve
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! Fill in array u0 with initial conditions from
+! random number generator
+!---------------------------------------------------------------------
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double complex u0(d1+1, d2, d3)
+    integer :: k, j
+    double precision :: x0, start, an, dummy, starts(nz)
+          
+
+    start = seed
+!---------------------------------------------------------------------
+! Jump to the starting element for our first plane.
+!---------------------------------------------------------------------
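+! (Each z plane consumes 2*nx*ny random values -- one real and one
+! imaginary part per grid point -- so an = a^(2*nx*ny) is the seed
+! stride between consecutive planes; starts(k) below caches the
+! seed for plane k.)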
+    call ipow46(a, 0, an)
+    dummy = randlc(start, an)
+    call ipow46(a, 2*nx*ny, an)
+
+    starts(1) = start
+    do k = 2, dims(3)
+        dummy = randlc(start, an)
+        starts(k) = start
+    end do
+          
+!---------------------------------------------------------------------
+! Go through by z planes filling in one square at a time.
+!---------------------------------------------------------------------
+! omp parallel do default(shared) private(k,j,x0)
+    do k = 1, dims(3)
+        x0 = starts(k)
+        do j = 1, dims(2)
+            call vranlc(2*nx, x0, a, u0(1, j, k))
+        end do
+    end do
+
+    return
+    end subroutine compute_initial_conditions
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine ipow46(a, exponent, result)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute a^exponent mod 2^46
+!---------------------------------------------------------------------
+
+    implicit none
+    double precision :: a, result, dummy, q, r
+    integer :: exponent, n, n2
+    external randlc
+    double precision :: randlc
+!---------------------------------------------------------------------
+! Use
+!   a^n = a^(n/2)*a^(n/2) if n even else
+!   a^n = a*a^(n-1)       if n odd
+!---------------------------------------------------------------------
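+! For example, exponent = 6: the loop squares q through a, a^2, a^4
+! and accumulates r = a^2 * a^4 = a^6, so only O(log(exponent))
+! randlc calls (each a multiplication mod 2^46) are needed.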
+    result = 1
+    if (exponent == 0) return
+    q = a
+    r = 1
+    n = exponent
+
+
+    do while (n > 1)
+        n2 = n/2
+        if (n2 * 2 == n) then
+            dummy = randlc(q, q)
+            n = n2
+        else
+            dummy = randlc(r, q)
+            n = n-1
+        endif
+    end do
+    dummy = randlc(r, q)
+    result = r
+    return
+    end subroutine ipow46
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine setup
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: fstatus
+!$    integer  omp_get_max_threads
+!$    external omp_get_max_threads
+    debug = .FALSE. 
+
+    open (unit=2,file='timer.flag',status='old',iostat=fstatus)
+    if (fstatus == 0) then
+        timers_enabled = .TRUE. 
+        close(2)
+    else
+        timers_enabled = .FALSE. 
+    endif
+
+    write(*, 1000)
+
+    niter = niter_default
+
+    write(*, 1001) nx, ny, nz
+    write(*, 1002) niter
+!$    write(*, 1003) omp_get_max_threads()
+    write(*, *)
+
+
+    1000 format(//,' NAS Parallel Benchmarks (NPB3.3-OMP)', &
+    ' - FT Benchmark', /)
+    1001 format(' Size                : ', i4, 'x', i4, 'x', i4)
+    1002 format(' Iterations                  :', i7)
+    1003 format(' Number of available threads :', i7)
+
+    dims(1) = nx
+    dims(2) = ny
+    dims(3) = nz
+
+
+!---------------------------------------------------------------------
+! Set up info for blocking of ffts and transposes.  This improves
+! performance on cache-based systems. Blocking involves
+! working on a chunk of the problem at a time, taking chunks
+! along the first, second, or third dimension.
+
+! - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+! - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+! Since the 1st dim is always in processor, we assume it is long
+! enough (the default blocking factor here is 32, so the min size
+! for the 1st dim is 32).
+! The only case we have to worry about is cffts1 in a 2D
+! decomposition, so the blocking factor should not be larger than
+! the 2nd dimension.
+!---------------------------------------------------------------------
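+! (With the class A sizes above, fftblock = 32 means e.g. cffts1
+! copies blocks of 32 lines along the first dimension into the y1
+! scratch array before calling cfftz, keeping the working set in
+! cache.)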
+
+    fftblock = fftblock_default
+    fftblockpad = fftblockpad_default
+
+    if (fftblock /= fftblock_default) fftblockpad = fftblock+3
+
+    return
+    end subroutine setup
+
+          
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+! for time evolution exponent.
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3
+    double precision :: twiddle(d1+1, d2, d3)
+    integer :: i, j, k, kk, kk2, jj, kj2, ii
+    double precision :: ap
+
+!---------------------------------------------------------------------
+! basically we want to convert the Fortran indices
+!   1 2 3 4 5 6 7 8
+! to
+!   0 1 2 3 -4 -3 -2 -1
+! The following magic formula does the trick:
+! mod(i-1+n/2, n) - n/2
+!---------------------------------------------------------------------
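+! For example, with n = 8 the indices i = 1..8 map to
+!   mod(i-1+4, 8) - 4  =  0 1 2 3 -4 -3 -2 -1
+! (e.g. i = 5: mod(5-1+4, 8) - 4 = 0 - 4 = -4).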
+
+    ap = - 4.d0 * alpha * pi *pi
+
+! omp parallel do default(shared) private(i,j,k,kk,kk2,jj,kj2,ii)
+    do k = 1, dims(3)
+        kk =  mod(k-1+nz/2, nz) - nz/2
+        kk2 = kk*kk
+        do j = 1, dims(2)
+            jj = mod(j-1+ny/2, ny) - ny/2
+            kj2 = jj*jj+kk2
+            do i = 1, dims(1)
+                ii = mod(i-1+nx/2, nx) - nx/2
+                twiddle(i,j,k) = dexp(ap*dble(ii*ii+kj2))
+            end do
+        end do
+    end do
+
+    return
+    end subroutine compute_indexmap
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine print_timers()
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: i
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    double precision :: t, t_m
+    character(25) :: tstrings(T_max)
+    data tstrings / '          total ', &
+    '          setup ', &
+    '            fft ', &
+    '         evolve ', &
+    '       checksum ', &
+    '           fftx ', &
+    '           ffty ', &
+    '           fftz ' /
+
+    t_m = timer_read(T_total)
+    if (t_m <= 0.0d0) t_m = 1.0d0
+    do i = 1, t_max
+        t = timer_read(i)
+        write(*, 100) i, tstrings(i), t, t*100.0/t_m
+    end do
+    100 format(' timer ', i2, '(', A16,  ') :', F9.4, ' (',F6.2,'%)')
+    return
+    end subroutine print_timers
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft(dir, x1, x2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: dir
+    double complex x1(ntotalp), x2(ntotalp)
+
+    double complex y1(fftblockpad_default*maxdim), &
+    y2(fftblockpad_default*maxdim)
+
+!---------------------------------------------------------------------
+! note: args x1, x2 must be different arrays
+! note: args for cfftsx are (direction, layout, xin, xout, scratch)
+!       xin/xout may be the same and it can be somewhat faster
+!       if they are
+!---------------------------------------------------------------------
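+! (So in the forward direction the x and y passes transform x1 in
+! place and only the final z pass writes x2; the inverse mirrors
+! this, with the x pass writing x2.)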
+
+    if (dir == 1) then
+        call cffts1(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts3(1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    else
+        call cffts3(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts2(-1, dims(1), dims(2), dims(3), x1, x1, y1, y2)
+        call cffts1(-1, dims(1), dims(2), dims(3), x1, x2, y1, y2)
+    endif
+    return
+    end subroutine fft
+
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts1(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd1
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d1), y2(fftblockpad, d1)
+    integer :: i, j, k, jj
+
+    logd1 = ilog2(d1)
+
+    if (timers_enabled) call timer_start(T_fftx)
+! omp parallel do default(shared) private(i,j,k,jj,y1,y2)
+! omp&  shared(is,logd1,d1)
+    do k = 1, d3
+        do jj = 0, d2 - fftblock, fftblock
+            do j = 1, fftblock
+                do i = 1, d1
+                    y1(j,i) = x(i,j+jj,k)
+                enddo
+            enddo
+                        
+            call cfftz (is, logd1, d1, y1, y2)
+
+
+            do j = 1, fftblock
+                do i = 1, d1
+                    xout(i,j+jj,k) = y1(j,i)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftx)
+
+    return
+    end subroutine cffts1
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts2(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd2
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d2), y2(fftblockpad, d2)
+    integer :: i, j, k, ii
+
+    logd2 = ilog2(d2)
+
+    if (timers_enabled) call timer_start(T_ffty)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2)
+! omp&  shared(is,logd2,d2)
+    do k = 1, d3
+        do ii = 0, d1 - fftblock, fftblock
+            do j = 1, d2
+                do i = 1, fftblock
+                    y1(i,j) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd2, d2, y1, y2)
+                       
+            do j = 1, d2
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,j)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_ffty)
+
+    return
+    end subroutine cffts2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cffts3(is, d1, d2, d3, x, xout, y1, y2)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: is, d1, d2, d3, logd3
+    double complex x(d1+1,d2,d3)
+    double complex xout(d1+1,d2,d3)
+    double complex y1(fftblockpad, d3), y2(fftblockpad, d3)
+    integer :: i, j, k, ii
+
+    logd3 = ilog2(d3)
+
+    if (timers_enabled) call timer_start(T_fftz)
+! omp parallel do default(shared) private(i,j,k,ii,y1,y2)
+! omp&  shared(is)
+    do j = 1, d2
+        do ii = 0, d1 - fftblock, fftblock
+            do k = 1, d3
+                do i = 1, fftblock
+                    y1(i,k) = x(i+ii,j,k)
+                enddo
+            enddo
+
+            call cfftz (is, logd3, d3, y1, y2)
+
+            do k = 1, d3
+                do i = 1, fftblock
+                    xout(i+ii,j,k) = y1(i,k)
+                enddo
+            enddo
+        enddo
+    enddo
+    if (timers_enabled) call timer_stop(T_fftz)
+
+    return
+    end subroutine cffts3
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fft_init (n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+! compute the roots-of-unity array that will be used for subsequent FFTs.
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines.
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: m,n,nu,ku,i,j,ln
+    double precision :: t, ti
+
+
+!---------------------------------------------------------------------
+!   Initialize the U array with sines and cosines in a manner that permits
+!   stride one access at each FFT iteration.
+!---------------------------------------------------------------------
+    nu = n
+    m = ilog2(n)
+    u(1) = m
+    ku = 2
+    ln = 1
+
+    do j = 1, m
+        t = pi / ln
+                 
+        do i = 0, ln - 1
+            ti = i * t
+            u(i+ku) = dcmplx (cos (ti), sin(ti))
+        enddo
+                 
+        ku = ku + ln
+        ln = 2 * ln
+    enddo
+          
+    return
+    end subroutine fft_init
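+
+!---------------------------------------------------------------------
+!   Illustration (not part of the benchmark): for n = 8 (m = 3) the
+!   loop above fills u with one block of 2^(j-1) twiddle factors per
+!   level j:
+!
+!     u(1)   = 3                          stored value of m
+!     u(2)   = (1,0)                      level j = 1, ln = 1
+!     u(3:4) = (1,0), (0,1)               level j = 2, ln = 2
+!     u(5:8) = exp(i*pi*k/4), k = 0..3    level j = 3, ln = 4
+!
+!   each level's factors are contiguous, giving stride-one access.
+!---------------------------------------------------------------------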
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine cfftz (is, m, n, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+!   to Swarztrauber.  X is both the input and the output array, while Y is a
+!   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
+!   perform FFTs, the array U must be initialized by calling CFFTZ with IS
+!   set to 0 and M set to MX, where MX is the maximum value of M for any
+!   subsequent call.
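+!
+!   Note: in this serial version IS = 0 is not used; u is filled once
+!   by fft_init instead.  A call sketch mirroring cffts3 (names are
+!   illustrative only):
+!
+!     call fft_init (nx)                      ! nx = largest dimension
+!     call cfftz (1, ilog2 (d3), d3, y1, y2)  ! one forward transform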
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines.
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+
+    integer :: is,m,n,i,j,l,mx
+    double complex x, y
+
+    dimension x(fftblockpad,n), y(fftblockpad,n)
+
+!---------------------------------------------------------------------
+!   Check if input parameters are invalid.
+!---------------------------------------------------------------------
+    mx = u(1)
+    if ((is /= 1 .AND. is /= -1) .OR. m < 1 .OR. m > mx) &
+    then
+        write (*, 1)  is, m, mx
+        1 format ('CFFTZ: Either U has not been initialized, or else'/ &
+        'one of the input parameters is invalid', 3I5)
+        stop
+    endif
+
+!---------------------------------------------------------------------
+!   Perform one variant of the Stockham FFT.
+!---------------------------------------------------------------------
+    do l = 1, m, 2
+        call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+        if (l == m) goto 160
+        call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+    enddo
+
+    goto 180
+
+!---------------------------------------------------------------------
+!   Copy Y to X.
+!---------------------------------------------------------------------
+    160 do j = 1, n
+        do i = 1, fftblock
+            x(i,j) = y(i,j)
+        enddo
+    enddo
+
+    180 continue
+
+    return
+    end subroutine cfftz
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+!---------------------------------------------------------------------
+!   Performs the L-th iteration of the second variant of the Stockham FFT.
+!---------------------------------------------------------------------
+
+    implicit none
+
+    integer :: is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+    double complex u,x,y,u1,x11,x21
+    dimension u(n), x(ny1,n), y(ny1,n)
+
+
+!---------------------------------------------------------------------
+!   Set initial parameters.
+!---------------------------------------------------------------------
+
+    n1 = n / 2
+    lk = 2 ** (l - 1)
+    li = 2 ** (m - l)
+    lj = 2 * lk
+    ku = li + 1
+
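+! each pass combines entries of x lying n/2 apart into adjacent blocks
+! of length lk in y (a radix-2 butterfly), applying the twiddle factor u1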
+    do i = 0, li - 1
+        i11 = i * lk + 1
+        i12 = i11 + n1
+        i21 = i * lj + 1
+        i22 = i21 + lk
+        if (is >= 1) then
+            u1 = u(ku+i)
+        else
+            u1 = dconjg (u(ku+i))
+        endif
+
+    !---------------------------------------------------------------------
+    !   This loop is vectorizable.
+    !---------------------------------------------------------------------
+        do k = 0, lk - 1
+            do j = 1, ny
+                x11 = x(j,i11+k)
+                x21 = x(j,i12+k)
+                y(j,i21+k) = x11 + x21
+                y(j,i22+k) = u1 * (x11 - x21)
+            enddo
+        enddo
+    enddo
+
+    return
+    end subroutine fftz2
+
+!---------------------------------------------------------------------
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    integer function ilog2(n)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+    integer :: n, nn, lg
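+! returns ceiling(log2(n)), e.g. ilog2(8) = 3 and ilog2(5) = 3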
+    if (n == 1) then
+        ilog2=0
+        return
+    endif
+    lg = 1
+    nn = 2
+    do while (nn < n)
+        nn = nn*2
+        lg = lg+1
+    end do
+    ilog2 = lg
+    return
+    end function ilog2
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine checksum(i, u1, d1, d2, d3)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines.
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: i, d1, d2, d3
+    double complex u1(d1+1,d2,d3)
+    integer :: j, q,r,s
+    double complex chk
+    chk = (0.0,0.0)
+
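+! sample 1024 points scattered through the array at coordinates
+! (mod(j,nx)+1, mod(3*j,ny)+1, mod(5*j,nz)+1); the sum is later
+! normalized by the total number of points, ntotal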
+! omp parallel do default(shared) private(i,q,r,s) reduction(+:chk)
+    do j=1,1024
+        q = mod(j, nx)+1
+        r = mod(3*j,ny)+1
+        s = mod(5*j,nz)+1
+        chk=chk+u1(q,r,s)
+    end do
+
+    chk = chk/dble(ntotal)
+          
+    write (*, 30) i, chk
+    30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+    sums(i) = chk
+    return
+    end subroutine checksum
+
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    subroutine verify (d1, d2, d3, nt, verified, class)
+
+!---------------------------------------------------------------------
+!---------------------------------------------------------------------
+
+    implicit none
+! CLASS = A
+!  
+!  
+!  This file is generated automatically by the setparams utility.
+!  It sets the number of processors and the class of the NPB
+!  in this directory. Do not modify it by hand.
+!  
+        integer nx, ny, nz, maxdim, niter_default
+        integer ntotal, nxp, nyp, ntotalp
+        parameter (nx=256, ny=256, nz=128, maxdim=256)
+        parameter (niter_default=6)
+        parameter (nxp=nx+1, nyp=ny)
+        parameter (ntotal=nx*nyp*nz)
+        parameter (ntotalp=nxp*nyp*nz)
+        logical  convertdouble
+        parameter (convertdouble = .false.)
+        character compiletime*11
+        parameter (compiletime='18 Oct 2016')
+        character npbversion*5
+        parameter (npbversion='3.3.1')
+        character cs1*8
+        parameter (cs1='gfortran')
+        character cs2*6
+        parameter (cs2='$(F77)')
+        character cs3*6
+        parameter (cs3='(none)')
+        character cs4*6
+        parameter (cs4='(none)')
+        character cs5*40
+        parameter (cs5='-ffree-form -O3 -fopenmp -mcmodel=medium')
+        character cs6*28
+        parameter (cs6='-O3 -fopenmp -mcmodel=medium')
+        character cs7*6
+        parameter (cs7='randi8')
+
+
+! If processor array is 1x1 -> 0D grid decomposition
+
+
+! Cache blocking params. These values are good for most
+! RISC processors.
+! FFT parameters:
+!  fftblock controls how many ffts are done at a time.
+!  The default is appropriate for most cache-based machines.
+!  On vector machines, the FFT can be vectorized with vector
+!  length equal to the block size, so the block size should
+!  be as large as possible. This is the size of the smallest
+!  dimension of the problem: 128 for class A, 256 for class B and
+!  512 for class C.
+
+    integer :: fftblock_default, fftblockpad_default
+!      parameter (fftblock_default=16, fftblockpad_default=18)
+    parameter (fftblock_default=32, fftblockpad_default=33)
+          
+    integer :: fftblock, fftblockpad
+    common /blockinfo/ fftblock, fftblockpad
+
+! we need a bunch of logic to keep track of how
+! arrays are laid out.
+
+
+! Note: this serial version is derived from the parallel 0D case
+! of the ft NPB.
+! The computation proceeds logically as
+
+! set up initial conditions
+! fftx(1)
+! transpose (1->2)
+! ffty(2)
+! transpose (2->3)
+! fftz(3)
+! time evolution
+! fftz(3)
+! transpose (3->2)
+! ffty(2)
+! transpose (2->1)
+! fftx(1)
+! compute residual(1)
+
+! for the 0D, 1D, 2D strategies, the layouts look like xxx
+
+!            0D        1D        2D
+! 1:        xyz       xyz       xyz
+
+! the array dimensions are stored in dims(coord, phase)
+    integer :: dims(3)
+    common /layout/ dims
+
+    integer :: T_total, T_setup, T_fft, T_evolve, T_checksum, &
+    T_fftx, T_ffty, &
+    T_fftz, T_max
+    parameter (T_total = 1, T_setup = 2, T_fft = 3, &
+    T_evolve = 4, T_checksum = 5, &
+    T_fftx = 6, &
+    T_ffty = 7, &
+    T_fftz = 8, T_max = 8)
+
+
+
+    logical :: timers_enabled
+
+
+    external timer_read
+    double precision :: timer_read
+    external ilog2
+    integer :: ilog2
+
+    external randlc
+    double precision :: randlc
+
+
+! other stuff
+    logical :: debug, debugsynch
+    common /dbg/ debug, debugsynch, timers_enabled
+
+    double precision :: seed, a, pi, alpha
+    parameter (seed = 314159265.d0, a = 1220703125.d0, &
+    pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+
+! roots of unity array
+! relies on x being largest dimension?
+    double complex u(nxp)
+    common /ucomm/ u
+
+
+! for checksum data
+    double complex sums(0:niter_default)
+    common /sumcomm/ sums
+
+! number of iterations
+    integer :: niter
+    common /iter/ niter
+    integer :: d1, d2, d3, nt
+    character class
+    logical :: verified
+    integer :: i
+    double precision :: err, epsilon
+
+!---------------------------------------------------------------------
+!   Reference checksums
+!---------------------------------------------------------------------
+    double complex csum_ref(25)
+
+
+    class = 'U'
+
+    epsilon = 1.0d-12
+    verified = .FALSE. 
+
+    if (d1 == 64 .AND. &
+    d2 == 64 .AND. &
+    d3 == 64 .AND. &
+    nt == 6) then
+    !---------------------------------------------------------------------
+    !   Sample size reference checksums
+    !---------------------------------------------------------------------
+        class = 'S'
+        csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+        csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+        csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+        csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+        csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+        csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+    else if (d1 == 128 .AND. &
+        d2 == 128 .AND. &
+        d3 == 32 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class W size reference checksums
+    !---------------------------------------------------------------------
+        class = 'W'
+        csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+        csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+        csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+        csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+        csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+        csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+    else if (d1 == 256 .AND. &
+        d2 == 256 .AND. &
+        d3 == 128 .AND. &
+        nt == 6) then
+    !---------------------------------------------------------------------
+    !   Class A size reference checksums
+    !---------------------------------------------------------------------
+        class = 'A'
+        csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+        csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+        csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+        csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+        csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+        csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+              
+    else if (d1 == 512 .AND. &
+        d2 == 256 .AND. &
+        d3 == 256 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class B size reference checksums
+    !---------------------------------------------------------------------
+        class = 'B'
+        csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+        csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+        csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+        csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+        csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+        csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+        csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+        csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+        csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+        csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+        csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+        csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+        csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+        csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+        csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+        csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+        csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+        csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+        csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+        csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+    else if (d1 == 512 .AND. &
+        d2 == 512 .AND. &
+        d3 == 512 .AND. &
+        nt == 20) then
+    !---------------------------------------------------------------------
+    !   Class C size reference checksums
+    !---------------------------------------------------------------------
+        class = 'C'
+        csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+        csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+        csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+        csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+        csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+        csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+        csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+        csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+        csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+        csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+        csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+        csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+        csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+        csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+        csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+        csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+        csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+        csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+        csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+        csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+    else if (d1 == 2048 .AND. &
+        d2 == 1024 .AND. &
+        d3 == 1024 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class D size reference checksums
+    !---------------------------------------------------------------------
+        class = 'D'
+        csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+        csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+        csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+        csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+        csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+        csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+        csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+        csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+        csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+        csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+        csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+        csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+        csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+        csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+        csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+        csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+        csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+        csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+        csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+        csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+        csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+        csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+        csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+        csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+        csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+    else if (d1 == 4096 .AND. &
+        d2 == 2048 .AND. &
+        d3 == 2048 .AND. &
+        nt == 25) then
+    !---------------------------------------------------------------------
+    !   Class E size reference checksums
+    !---------------------------------------------------------------------
+        class = 'E'
+        csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+        csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+        csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+        csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+        csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+        csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+        csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+        csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+        csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+        csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+        csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+        csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+        csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+        csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+        csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+        csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+        csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+        csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+        csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+        csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+        csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+        csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+        csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+        csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+        csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+    endif
+
+
+    if (class /= 'U') then
+
+        do i = 1, nt
+            err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+            if ( .NOT. (err <= epsilon)) goto 100
+        end do
+        verified = .TRUE. 
+        100 continue
+
+    endif
+
+             
+    if (class /= 'U') then
+        if (verified) then
+            write(*,2000)
+            2000 format(' Result verification successful')
+        else
+            write(*,2001)
+            2001 format(' Result verification failed')
+        endif
+    endif
+    print *, 'class = ', class
+
+    return
+    end subroutine verify
+
+
diff --git a/Parser/test-data/rules/R503a.f90 b/Parser/test-data/rules/R503a.f90
new file mode 100644
index 0000000..ea8476f
--- /dev/null
+++ b/Parser/test-data/rules/R503a.f90
@@ -0,0 +1,14 @@
+real(RDP), intent(in)    :: d(:)
+real(RDP), intent(in)    :: d(0:)
+real(RDP), intent(in)    :: d(:,0:)
+real(RDP), intent(in)    :: u_v(    :,:)
+real(RDP), intent(inout) :: r_e(  :,:,:)
+real(RDP), intent(inout) :: r_v(    :,:)
+REAL, DIMENSION (N, 10) :: W
+REAL A (:), B (0:)
+REAL, POINTER :: D (:, :)
+REAL, DIMENSION (:), POINTER :: P
+REAL, ALLOCATABLE, DIMENSION (:) :: E
+REAL, PARAMETER :: V(0:*) = [0.1, 1.1]
+
+end
diff --git a/Parser/test-data/rules/tmp.f90 b/Parser/test-data/rules/tmp.f90
new file mode 100644
index 0000000..70f6222
--- /dev/null
+++ b/Parser/test-data/rules/tmp.f90
@@ -0,0 +1,4 @@
+!$acc parallel async(2+2)
+x = #test#
+!$acc end parallel
+end
diff --git a/Parser/test-data/slots/block.f b/Parser/test-data/slots/block.f
new file mode 100644
index 0000000..52ffae9
--- /dev/null
+++ b/Parser/test-data/slots/block.f
@@ -0,0 +1,7 @@
+! BlockSlot
+
+  do x = 0,p
+    #test#
+  end do
+
+end program
diff --git a/Parser/test-data/specht/array_operations.f b/Parser/test-data/specht/array_operations.f
new file mode 100644
index 0000000..6e2be42
--- /dev/null
+++ b/Parser/test-data/specht/array_operations.f
@@ -0,0 +1,1166 @@
+!> \file      array_operations.f
+!> \brief     Standard operations with arrays
+!> \author    Immo Huismann
+!> \date      2015/01/05
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> ## Rationale ##
+!> This module provides operations on arrays that work on the targeted hardware,
+!> providing an abstraction for subsequent modules.
+!> As the PGI compiler is completely useless when compiling simple things like
+!>
+!>    !$acc kernels
+!>    x = y * z
+!>    !$acc end kernels
+!>
+!> leading to working, but very slow variants of the operation. This module
+!> provides an easy way to get things working with the compiler.
+!>
+!> ## Programming ##
+!> To enable the compilers to do some more optimization, all public routines
+!> possess an explicit shape interface, while their called background
+!> implementation uses an explicit size interface and only one loop.
+!>
+!> All routines inside this module possess a similar interface. The first array
+!> passed is the value that will be set. Every other array will have intent(in).
+!> If these vectors receive a factor, it will be passed before the specific
+!> vector.
+!>
+!> Inside this module all arrays are named x, y, or z (and w in one instance).
+!> Scalars are a, b, c, etc.
+!>
+!> \todo
+!> * Most routines have an interface now. Those that do not should be purged
+!>   or moved to a legacy module that can be removed in time.
+!> * Redo the comments on all routines whose brief does not state a formula.
+!===============================================================================
+
+module Array_Operations
+  ! HiBASE modules
+  use Kind_Parameters, only: RDP
+  use Constants,       only: ONE
+
+  ! Specht base modules
+  use ACC_Parameters,  only: ACC_EXEC_QUEUE
+  implicit none
+  private
+
+  public :: AddScaledVector
+  public :: Scale
+  public :: ScaleAndAddVector
+  public :: MultiplyWithVector
+
+  public :: SetToZero
+  public :: SetToValue
+  public :: SetToVector
+  public :: SetToScaledVector
+  public :: SetToTwoScaledVectors
+  public :: SetToThreeScaledVectors
+  public :: SetToMultipliedVectors
+
+  public :: PointWiseInversion
+  public :: PointWiseSqrt
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Sets a vector to zero: x <- 0
+  !> \author  Immo Huismann
+
+  interface SetToZero
+    module procedure SetToZero_1
+    module procedure SetToZero_2
+    module procedure SetToZero_3
+    module procedure SetToZero_4
+
+    module procedure SetToZero_432
+  end interface SetToZero
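+
+  !> Usage sketch (array names hypothetical): the generic resolves on rank;
+  !> the _432 variant clears one rank-4, rank-3, and rank-2 array at once:
+  !>
+  !>    call SetToZero(u3)           ! u3(:,:,:)      -> SetToZero_3
+  !>    call SetToZero(u4, u3, u2)   ! ranks 4, 3, 2  -> SetToZero_432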
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Sets a vector to another one: x <- y
+  !> \author  Immo Huismann
+
+  interface SetToVector
+    module procedure SetToVector_2
+    module procedure SetToVector_3
+    module procedure SetToVector_4
+  end interface SetToVector
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Scales a vector: x <- a * x
+  !> \author  Immo Huismann
+
+  interface Scale
+    module procedure Scale_2
+    module procedure Scale_3
+    module procedure Scale_4
+
+    module procedure Scale_432
+  end interface Scale
+  
+  !-----------------------------------------------------------------------------
+  !> \brief   Scales a vector and adds another to it: x <- a * x + y
+  !> \author  Immo Huismann
+
+  interface ScaleAndAddVector
+    module procedure ScaleAndAddVector_2
+    module procedure ScaleAndAddVector_3
+    module procedure ScaleAndAddVector_4
+
+    module procedure ScaleAndAddVector_432
+  end interface ScaleAndAddVector
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Adds a scaled vector to a vector: x <- x + a * y
+  !> \author  Immo Huismann
+
+  interface AddScaledVector
+    module procedure AddScaledVector_2
+    module procedure AddScaledVector_3
+    module procedure AddScaledVector_4
+
+    module procedure AddScaledVector_432
+  end interface AddScaledVector
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Sets to a scaled vector: x <- a * y
+  !> \author  Immo Huismann
+
+  interface SetToScaledVector
+    module procedure SetToScaledVector_2
+    module procedure SetToScaledVector_3
+    module procedure SetToScaledVector_4
+
+    module procedure SetToScaledVector_432
+  end interface SetToScaledVector
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Sets to the elementwise product of two vectors: x_i <- y_i * z_i
+  !> \author  Immo Huismann
+
+  interface SetToMultipliedVectors
+    module procedure SetToMultipliedVectors_2
+    module procedure SetToMultipliedVectors_3
+    module procedure SetToMultipliedVectors_4
+
+    module procedure SetToMultipliedVectors_432
+  end interface SetToMultipliedVectors
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Elementwise product of two vectors: x_i <- x_i * y_i
+  !> \author  Immo Huismann
+
+  interface MultiplyWithVector
+    module procedure MultiplyWithVector_2
+    module procedure MultiplyWithVector_3
+    module procedure MultiplyWithVector_4
+
+    module procedure MultiplyWithVector_432
+  end interface MultiplyWithVector
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Elementwise inversion of vector: x_i <- 1 / x_i
+  !> \author  Immo Huismann
+
+  interface PointWiseInversion
+    module procedure PointWiseInversion_2
+    module procedure PointWiseInversion_3
+    module procedure PointWiseInversion_4
+
+    module procedure PointWiseInversion_432
+  end interface PointWiseInversion
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Elementwise square root of vector: x_i <- sqrt(x_i)
+  !> \author  Immo Huismann
+
+  interface PointWiseSqrt
+    module procedure PointWiseSqrt_2
+    module procedure PointWiseSqrt_3
+    module procedure PointWiseSqrt_4
+
+    module procedure PointWiseSqrt_432
+  end interface PointWiseSqrt
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Sets the variable to a given value: x_i <- a
+  !> \author  Immo Huismann
+
+  interface SetToValue
+    module procedure SetToValue_2
+    module procedure SetToValue_3
+    module procedure SetToValue_4
+  end interface SetToValue
+
+contains
+
+!===============================================================================
+! Set some vector to zero
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero: x <- 0
+!> \author  Immo Huismann
+
+subroutine SetToZero_1(x)
+  real(RDP), intent(out) :: x(:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_1
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero: x <- 0
+!> \author  Immo Huismann
+
+subroutine SetToZero_2(x)
+  real(RDP), intent(out) :: x(:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero: x <- 0
+!> \author  Immo Huismann
+
+subroutine SetToZero_3(x)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero: x <- 0
+!> \author  Immo Huismann
+
+subroutine SetToZero_4(x)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x),x)
+
+end subroutine SetToZero_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero: x <- 0
+!> \author  Immo Huismann
+
+subroutine SetToZero_432(x_4,x_3,x_2)
+  real(RDP), intent(out) :: x_2(:,:)     !< x <- 0
+  real(RDP), intent(out) :: x_3(:,:,:)   !< x <- 0
+  real(RDP), intent(out) :: x_4(:,:,:,:) !< x <- 0
+
+  call SetToZero_ES(size(x_2),x_2)
+  call SetToZero_ES(size(x_3),x_3)
+  call SetToZero_ES(size(x_4),x_4)
+
+end subroutine SetToZero_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets a vector to zero, explicit size
+!> \author  Immo Huismann
+
+subroutine SetToZero_ES(n_points,x)
+  integer,   intent(in)  ::   n_points  !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< x_i <- 0
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i  = 1, n_points
+    x(i) = 0
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToZero_ES
+
+!===============================================================================
+! Add scaled vectors
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds a scaled vector to the variable x <- x + a * y
+!> \author  Immo Huismann
+
+subroutine AddScaledVector_2(x,a,y)
+  real(RDP), intent(inout) :: x(:,:) !< x <- x + a * y
+  real(RDP), intent(in)    :: a      !< scaling factor
+  real(RDP), intent(in)    :: y(:,:) !< array to scale and add to x
+
+  call AddScaledVector_ES(size(x),x,a,y)
+
+end subroutine AddScaledVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds a scaled vector to the variable x <- x + a * y
+!> \author  Immo Huismann
+
+subroutine AddScaledVector_3(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:) !< x <- x + a * y
+  real(RDP), intent(in)    :: a        !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:) !< array to scale and add to x
+
+  call AddScaledVector_ES(size(x),x,a,y)
+
+end subroutine AddScaledVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds a scaled vector to the variable x <- x + a * y
+!> \author  Immo Huismann
+
+subroutine AddScaledVector_4(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< x <- x + a * y
+  real(RDP), intent(in)    :: a          !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:,:) !< array to scale and add to x
+
+  call AddScaledVector_ES(size(x),x,a,y)
+
+end subroutine AddScaledVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds a scaled vector to the variable x <- x + a * y
+!> \author  Immo Huismann
+
+subroutine AddScaledVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- x + a * y
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- x + a * y
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- x + a * y
+  real(RDP), intent(in)    :: a            !< scaling factor
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< array to scale and add to x
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< array to scale and add to x
+  real(RDP), intent(in)    :: y_2(:,:)     !< array to scale and add to x
+
+  call AddScaledVector_ES(size(x_4),x_4,a,y_4)
+  call AddScaledVector_ES(size(x_3),x_3,a,y_3)
+  call AddScaledVector_ES(size(x_2),x_2,a,y_2)
+
+end subroutine AddScaledVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds a scaled vector to the variable
+!> \author  Immo Huismann
+
+subroutine AddScaledVector_ES(n_points,x,a,y)
+  integer,   intent(in)    ::   n_points  !< number of points to work on
+  real(RDP), intent(inout) :: x(n_points) !< x_i <- x_i + a * y_i
+  real(RDP), intent(in)    :: a           !< scaling factor
+  real(RDP), intent(in)    :: y(n_points) !< array to scale and add to x
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i  = 1, n_points
+    x(i) = x(i) + a * y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddScaledVector_ES
+
+!===============================================================================
+! Scales a vector
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector x <- a * x
+!> \author  Immo Huismann
+
+subroutine Scale_2(x,a)
+  real(RDP), intent(inout) :: x(:,:) !< vector to scale
+  real(RDP), intent(in)    :: a      !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector x <- a * x
+!> \author  Immo Huismann
+
+subroutine Scale_3(x,a)
+  real(RDP), intent(inout) :: x(:,:,:) !< vector to scale
+  real(RDP), intent(in)    :: a        !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector x <- a * x
+!> \author  Immo Huismann
+
+subroutine Scale_4(x,a)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< vector to scale
+  real(RDP), intent(in)    :: a          !< scaling factor
+
+  call Scale_ES(size(x),x,a)
+
+end subroutine Scale_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector x <- a * x
+!> \author  Immo Huismann
+
+subroutine Scale_432(x_4,x_3,x_2,a)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * x
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * x
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * x
+  real(RDP), intent(in)    :: a            !< scaling factor
+
+  call Scale_ES(size(x_4),x_4,a)
+  call Scale_ES(size(x_3),x_3,a)
+  call Scale_ES(size(x_2),x_2,a)
+
+end subroutine Scale_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector: x <- a * x - explicit size subroutine
+!> \author  Immo Huismann
+
+subroutine Scale_ES(n_points,x,a)
+  integer,   intent(in)    ::   n_points  !< number of points in the vector
+  real(RDP), intent(inout) :: x(n_points) !< vector to scale
+  real(RDP), intent(in)    :: a           !< scaling factor
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i  = 1, n_points
+    x(i) = a * x(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine Scale_ES
+
+!===============================================================================
+! Scale and add vectors
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector and adds another one to it x <- a * x + y
+!> \author  Immo Huismann
+
+subroutine ScaleAndAddVector_2(x,a,y)
+  real(RDP), intent(inout) :: x(:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a      !< scaling factor
+  real(RDP), intent(in)    :: y(:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector and adds another one to it x <- a * x + y
+!> \author  Immo Huismann
+
+subroutine ScaleAndAddVector_3(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a        !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector and adds another one to it x <- a * x + y
+!> \author  Immo Huismann
+
+subroutine ScaleAndAddVector_4(x,a,y)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< vector to scale and add to
+  real(RDP), intent(in)    :: a          !< scaling factor
+  real(RDP), intent(in)    :: y(:,:,:,:) !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x),x,a,y)
+
+end subroutine ScaleAndAddVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector and adds another one to it x <- a * x + y
+!> \author  Immo Huismann
+
+subroutine ScaleAndAddVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * x + y
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * x + y
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * x + y
+  real(RDP), intent(in)    :: a            !< scaling factor
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< vector to add to x
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< vector to add to x
+  real(RDP), intent(in)    :: y_2(:,:)     !< vector to add to x
+
+  call ScaleAndAddVector_ES(size(x_4),x_4,a,y_4)
+  call ScaleAndAddVector_ES(size(x_3),x_3,a,y_3)
+  call ScaleAndAddVector_ES(size(x_2),x_2,a,y_2)
+
+end subroutine ScaleAndAddVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Scales a vector and adds another one to it (aka daxpy)
+!> \author  Immo Huismann
+
+subroutine ScaleAndAddVector_ES(n_points,x,a,y)
+  integer,   intent(in)    ::   n_points  !< number of points in the vector
+  real(RDP), intent(inout) :: x(n_points) !< vector to scale and add to
+  real(RDP), intent(in)    :: a           !< scaling factor
+  real(RDP), intent(in)    :: y(n_points) !< vector to add to x
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i  = 1, n_points
+    x(i) = a * x(i) + y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine ScaleAndAddVector_ES
+
+!===============================================================================
+! Set to scaled vectors
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets x <- a * y
+!> \author  Immo Huismann
+
+subroutine SetToScaledVector_2(x,a,y)
+  real(RDP), intent(out) :: x(:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a      !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets \f$ x <- a * y \f$
+!> \author  Immo Huismann
+
+subroutine SetToScaledVector_3(x,a,y)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a        !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets x <- a * y
+!> \author  Immo Huismann
+
+subroutine SetToScaledVector_4(x,a,y)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- a * y
+  real(RDP), intent(in)  :: a          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x),x,a,y)
+
+end subroutine SetToScaledVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets x <- a * y
+!> \author  Immo Huismann
+
+subroutine SetToScaledVector_432(x_4,x_3,x_2,a,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x <- a * y
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x <- a * y
+  real(RDP), intent(inout) :: x_2(:,:)     !< x <- a * y
+  real(RDP), intent(in)    :: a            !< scaling factor for y
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< vector to scale and store in x
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< vector to scale and store in x
+  real(RDP), intent(in)    :: y_2(:,:)     !< vector to scale and store in x
+
+  call SetToScaledVector_ES(size(x_4),x_4,a,y_4)
+  call SetToScaledVector_ES(size(x_3),x_3,a,y_3)
+  call SetToScaledVector_ES(size(x_2),x_2,a,y_2)
+
+end subroutine SetToScaledVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets x <- a * y - explicit size subroutine
+!> \author  Immo Huismann
+
+subroutine SetToScaledVector_ES(n_points,x,a,y)
+  integer,   intent(in)  ::   n_points  !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< x_i <- a * y_i
+  real(RDP), intent(in)  :: a           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< vector to scale and store in x
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_points
+    x(i) = a * y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToScaledVector_ES
+
+
+!===============================================================================
+! Multiply two vectors, store in first
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication  x_i <- x_i * y_i
+!> \author  Immo Huismann
+
+subroutine MultiplyWithVector_2(x,y)
+  real(RDP), intent(inout) :: x(:,:) !< vector to save into
+  real(RDP), intent(in)    :: y(:,:) !< vector to multiply it with
+
+  call MultiplyWithVector_ES(size(x),x,y)
+
+end subroutine MultiplyWithVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication  x_i <- x_i * y_i
+!> \author  Immo Huismann
+
+subroutine MultiplyWithVector_3(x,y)
+  real(RDP), intent(inout) :: x(:,:,:) !< vector to save into
+  real(RDP), intent(in)    :: y(:,:,:) !< vector to multiply it with
+
+  call MultiplyWithVector_ES(size(x),x,y)
+
+end subroutine MultiplyWithVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication  x_i <- x_i * y_i
+!> \author  Immo Huismann
+
+subroutine MultiplyWithVector_4(x,y)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< vector to save into
+  real(RDP), intent(in)    :: y(:,:,:,:) !< vector to multiply it with
+
+  call MultiplyWithVector_ES(size(x),x,y)
+
+end subroutine MultiplyWithVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication  x_i <- x_i * y_i
+!> \author  Immo Huismann
+
+subroutine MultiplyWithVector_432(x_4,x_3,x_2,y_4,y_3,y_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< vector to save into
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< vector to save into
+  real(RDP), intent(inout) :: x_2(:,:)     !< vector to save into
+  real(RDP), intent(in)    :: y_4(:,:,:,:) !< vector to multiply it with
+  real(RDP), intent(in)    :: y_3(:,:,:)   !< vector to multiply it with
+  real(RDP), intent(in)    :: y_2(:,:)     !< vector to multiply it with
+
+  call MultiplyWithVector_ES(size(x_4),x_4,y_4)
+  call MultiplyWithVector_ES(size(x_3),x_3,y_3)
+  call MultiplyWithVector_ES(size(x_2),x_2,y_2)
+
+end subroutine MultiplyWithVector_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication  x_i <- x_i * y_i
+!> \author  Immo Huismann
+
+subroutine MultiplyWithVector_ES(n_points,x,y)
+  integer,   intent(in)    ::   n_points  !< number of points in the vectors
+  real(RDP), intent(inout) :: x(n_points) !< first vector, store result in it
+  real(RDP), intent(in)    :: y(n_points) !< second vector, multiply with it
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = x(i) * y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine MultiplyWithVector_ES
+
+!===============================================================================
+! Set a vector to the elementwise product of two other vectors
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication x_i <- y_i * z_i
+!> \author  Immo Huismann
+
+subroutine SetToMultipliedVectors_2(x,y,z)
+  real(RDP), intent(out) :: x(:,:) !< array to store in
+  real(RDP), intent(in)  :: y(:,:) !< first array in multiplication
+  real(RDP), intent(in)  :: z(:,:) !< second array in multiplication
+
+  call SetToMultipliedVectors_ES(size(x),x,y,z)
+
+end subroutine SetToMultipliedVectors_2
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication x_i <- y_i * z_i
+!> \author  Immo Huismann
+
+subroutine SetToMultipliedVectors_3(x,y,z)
+  real(RDP), intent(out) :: x(:,:,:) !< array to store in
+  real(RDP), intent(in)  :: y(:,:,:) !< first array in multiplication
+  real(RDP), intent(in)  :: z(:,:,:) !< second array in multiplication
+
+  call SetToMultipliedVectors_ES(size(x),x,y,z)
+
+end subroutine SetToMultipliedVectors_3
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication x_i <- y_i * z_i
+!> \author  Immo Huismann
+
+subroutine SetToMultipliedVectors_4(x,y,z)
+  real(RDP), intent(out) :: x(:,:,:,:) !< array to store in
+  real(RDP), intent(in)  :: y(:,:,:,:) !< first array in multiplication
+  real(RDP), intent(in)  :: z(:,:,:,:) !< second array in multiplication
+
+  call SetToMultipliedVectors_ES(size(x),x,y,z)
+
+end subroutine SetToMultipliedVectors_4
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication x_i <- y_i * z_i
+!> \author  Immo Huismann
+
+subroutine SetToMultipliedVectors_432(x_4,x_3,x_2, y_4,y_3,y_2, z_4,z_3,z_2)
+  real(RDP), intent(out) :: x_4(:,:,:,:) !< x_i <- y_i * z_i
+  real(RDP), intent(out) :: x_3(:,:,:)   !< x_i <- y_i * z_i
+  real(RDP), intent(out) :: x_2(:,:)     !< x_i <- y_i * z_i
+  real(RDP), intent(in)  :: y_4(:,:,:,:) !< first array in multiplication
+  real(RDP), intent(in)  :: y_3(:,:,:)   !< first array in multiplication
+  real(RDP), intent(in)  :: y_2(:,:)     !< first array in multiplication
+  real(RDP), intent(in)  :: z_4(:,:,:,:) !< second array in multiplication
+  real(RDP), intent(in)  :: z_3(:,:,:)   !< second array in multiplication
+  real(RDP), intent(in)  :: z_2(:,:)     !< second array in multiplication
+
+  call SetToMultipliedVectors_ES(size(x_4),x_4,y_4,z_4)
+  call SetToMultipliedVectors_ES(size(x_3),x_3,y_3,z_3)
+  call SetToMultipliedVectors_ES(size(x_2),x_2,y_2,z_2)
+
+end subroutine SetToMultipliedVectors_432
+
+!-------------------------------------------------------------------------------
+!> \brief   Pointwise multiplication x_i <- y_i * z_i - explicit size routine
+!> \author  Immo Huismann
+
+subroutine SetToMultipliedVectors_ES(n_points,x,y,z)
+  integer,   intent(in)  ::   n_points  !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< x_i <- y_i * z_i
+  real(RDP), intent(in)  :: y(n_points) !< First array in multiplication
+  real(RDP), intent(in)  :: z(n_points) !< Second array in multiplication
+
+  integer :: i
+
+  !$acc data present(x,y,z)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_points
+    x(i) = y(i) * z(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToMultipliedVectors_ES
+
+!===============================================================================
+! Copy a vector
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_2(x,y)
+  real(RDP), intent(out) :: x(:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_2
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_3(x,y)
+  real(RDP), intent(out) :: x(:,:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_3
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value, x <- y
+!> \author Immo Huismann
+
+subroutine SetToVector_4(x,y)
+  real(RDP), intent(out) :: x(:,:,:,:) !< x <- y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< vector to set it to
+
+  call SetToVector_ES(size(x),x,y)
+
+end subroutine SetToVector_4
+
+!-------------------------------------------------------------------------------
+!> \brief  Copies a vector containing n entries, x_i <- y_i
+!> \author Immo Huismann
+
+subroutine SetToVector_ES(n_point,x,y)
+  integer,   intent(in)  ::   n_point  !< number of points to work on
+  real(RDP), intent(out) :: x(n_point) !< x_i <- y_i
+  real(RDP), intent(in)  :: y(n_point) !< vector to set x to
+
+  integer :: i
+
+  !$acc data present(x,y)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do private(i)
+  do i = 1, n_point
+    x(i) = y(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToVector_ES
+
+!===============================================================================
+! Pointwise inversion
+
+!-------------------------------------------------------------------------------
+!> \brief  Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_2(x)
+  real(RDP), intent(inout) :: x(:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_2
+
+!-------------------------------------------------------------------------------
+!> \brief  Inverts every element of an array, x_i <- 1 / x_i
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_3(x)
+  real(RDP), intent(inout) :: x(:,:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_3
+
+!-------------------------------------------------------------------------------
+!> \brief  Inverts every element of an array.
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_4(x)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x),x)
+
+end subroutine PointWiseInversion_4
+
+!-------------------------------------------------------------------------------
+!> \brief  Inverts every element of an array.
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_432(x_4,x_3,x_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x_i <- 1 / x_i
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x_i <- 1 / x_i
+  real(RDP), intent(inout) :: x_2(:,:)     !< x_i <- 1 / x_i
+
+  call PointWiseInversion_ES(size(x_4),x_4)
+  call PointWiseInversion_ES(size(x_3),x_3)
+  call PointWiseInversion_ES(size(x_2),x_2)
+
+end subroutine PointWiseInversion_432
+
+!-------------------------------------------------------------------------------
+!> \brief  Inverts every element of an array - explicit size version
+!> \author Immo Huismann
+
+subroutine PointWiseInversion_ES(n_point,x)
+  integer,   intent(in)    ::   n_point  !< number of points to process
+  real(RDP), intent(inout) :: x(n_point) !< x_i <- 1 / x_i
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_point
+    x(i) = ONE / x(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine PointWiseInversion_ES
+
+!===============================================================================
+! Sqrt
+
+!-------------------------------------------------------------------------------
+!> \brief  Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_2(x)
+  real(RDP), intent(inout) :: x(:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_2
+
+!-------------------------------------------------------------------------------
+!> \brief  Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_3(x)
+  real(RDP), intent(inout) :: x(:,:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_3
+
+!-------------------------------------------------------------------------------
+!> \brief  Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_4(x)
+  real(RDP), intent(inout) :: x(:,:,:,:) !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x),x)
+
+end subroutine PointWiseSqrt_4
+
+!-------------------------------------------------------------------------------
+!> \brief  Square root of every element of an array, x_i <- sqrt(x_i)
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_432(x_4,x_3,x_2)
+  real(RDP), intent(inout) :: x_4(:,:,:,:) !< x_i <- sqrt(x_i)
+  real(RDP), intent(inout) :: x_3(:,:,:)   !< x_i <- sqrt(x_i)
+  real(RDP), intent(inout) :: x_2(:,:)     !< x_i <- sqrt(x_i)
+
+  call PointWiseSqrt_ES(size(x_4),x_4)
+  call PointWiseSqrt_ES(size(x_3),x_3)
+  call PointWiseSqrt_ES(size(x_2),x_2)
+
+end subroutine PointWiseSqrt_432
+
+!-------------------------------------------------------------------------------
+!> \brief  Square root of every element of an array - explicit size version
+!> \author Immo Huismann
+
+subroutine PointWiseSqrt_ES(n_point,x)
+  integer,   intent(in)    ::   n_point  !< number of points to work on
+  real(RDP), intent(inout) :: x(n_point) !< x_i <- sqrt(x_i)
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_point
+    x(i) = sqrt(x(i))
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine PointWiseSqrt_ES
+
+!===============================================================================
+! x <- a * y + b * z
+
+!-------------------------------------------------------------------------------
+!> \brief   Explicit size routine for x <- a * y + b * z
+!> \author  Immo Huismann
+
+subroutine SetToTwoScaledVectors_ES(n_points,x,a,y,b,z)
+  integer,   intent(in)  ::   n_points  !< number of points to work on
+  real(RDP), intent(out) :: x(n_points) !< array to set
+  real(RDP), intent(in)  :: a           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< array to scale and set x to
+  real(RDP), intent(in)  :: b           !< scaling factor for z
+  real(RDP), intent(in)  :: z(n_points) !< array to scale and add to x
+
+  integer :: i
+
+  !$acc data present(x,y,z)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_points
+    x(i) = a * y(i) + b * z(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToTwoScaledVectors_ES
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets the variable to w <- a * x + b * y + c * z
+!> \author  Immo Huismann
+
+subroutine SetToThreeScaledVectors(w,a,x,b,y,c,z)
+  real(RDP), intent(out) :: w(:,:,:,:) !< array to set
+  real(RDP), intent(in)  :: a          !< scaling factor for x
+  real(RDP), intent(in)  :: x(:,:,:,:) !< first array to scale and add to w
+  real(RDP), intent(in)  :: b          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< second array to scale and add to w
+  real(RDP), intent(in)  :: c          !< scaling factor for z
+  real(RDP), intent(in)  :: z(:,:,:,:) !< third array to scale and add to w
+
+  call SetToThreeScaledVectors_ES(size(x),w,a,x,b,y,c,z)
+
+end subroutine SetToThreeScaledVectors
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets the variable to w <- a * x + b * y + c * z - explicit size
+!> \author  Immo Huismann
+
+subroutine SetToThreeScaledVectors_ES(n_points,w,a,x,b,y,c,z)
+  integer,   intent(in)  ::   n_points  !< number of points to process
+  real(RDP), intent(out) :: w(n_points) !< array to set
+  real(RDP), intent(in)  :: a           !< scaling factor for x
+  real(RDP), intent(in)  :: x(n_points) !< first array to scale and add to w
+  real(RDP), intent(in)  :: b           !< scaling factor for y
+  real(RDP), intent(in)  :: y(n_points) !< second array to scale and add to w
+  real(RDP), intent(in)  :: c           !< scaling factor for z
+  real(RDP), intent(in)  :: z(n_points) !< third array to scale and add to w
+
+  integer :: i
+
+  !$acc data present(w,x,y,z)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_points
+    w(i) = a * x(i) + b * y(i) + c * z(i)
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToThreeScaledVectors_ES
+
+
+!-------------------------------------------------------------------------------
+!> \brief   Sets the variable to x <- a * y + b * z
+!> \author  Immo Huismann
+
+subroutine SetToTwoScaledVectors(x,a,y,b,z)
+  real(RDP), intent(out) :: x(:,:,:,:) !< array to set
+  real(RDP), intent(in)  :: a          !< scaling factor for y
+  real(RDP), intent(in)  :: y(:,:,:,:) !< array to scale and set x to
+  real(RDP), intent(in)  :: b          !< scaling factor for z
+  real(RDP), intent(in)  :: z(:,:,:,:) !< array to scale and add to x
+
+  call SetToTwoScaledVectors_ES(size(x),x,a,y,b,z)
+
+end subroutine SetToTwoScaledVectors
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value: x_i <- a
+!> \author Immo Huismann
+
+subroutine SetToValue_2(x,a)
+  real(RDP), intent(out) :: x(:,:) !< array to set to a
+  real(RDP), intent(in)  :: a      !< value to set the array to
+
+  call SetToValue_ES(size(x),x,a)
+
+end subroutine SetToValue_2
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value: x_i <- a
+!> \author Immo Huismann
+
+subroutine SetToValue_3(x,a)
+  real(RDP), intent(out) :: x(:,:,:) !< array to set to a
+  real(RDP), intent(in)  :: a        !< value to set the array to
+
+  call SetToValue_ES(size(x),x,a)
+
+end subroutine SetToValue_3
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets the variable to a given value: x_i <- a
+!> \author Immo Huismann
+
+subroutine SetToValue_4(x,a)
+  real(RDP), intent(out) :: x(:,:,:,:) !< array to set to a
+  real(RDP), intent(in)  :: a          !< value to set the array to
+
+  call SetToValue_ES(size(x),x,a)
+
+end subroutine SetToValue_4
+
+!-------------------------------------------------------------------------------
+!> \brief  Sets an array containing n entries to a, x_i <- a
+!> \author Immo Huismann
+
+subroutine SetToValue_ES(n_point,x,a)
+  integer,   intent(in)  ::   n_point  !< number of points in array
+  real(RDP), intent(out) :: x(n_point) !< array to set to a
+  real(RDP), intent(in)  :: a          !< value to set array to
+
+  integer :: i
+
+  !$acc data present(x)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+  !$acc loop
+  !$omp do   private(i)
+  do i = 1, n_point
+    x(i) = a
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+  !$acc end data
+
+end subroutine SetToValue_ES
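+
+!-------------------------------------------------------------------------------
+! Usage sketch (the array name work is hypothetical): zeroing a
+! device-resident work array with the explicit-size kernel above:
+!
+!   !$acc data create(work)
+!   call SetToValue_ES(size(work), work, 0.0_RDP)
+!   !$acc end data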
+
+!===============================================================================
+
+end module Array_Operations
diff --git a/Parser/test-data/specht/condensed_primary_part.f b/Parser/test-data/specht/condensed_primary_part.f
new file mode 100644
index 0000000..6b16812
--- /dev/null
+++ b/Parser/test-data/specht/condensed_primary_part.f
@@ -0,0 +1,930 @@
+!> \file      condensed_primary_part.f
+!> \brief     Primary part for the condensed operator
+!> \author    Immo Huismann
+!> \date      2015/02/17
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!===============================================================================
+
+module Condensed_Primary_Part
+  use Kind_parameters,             only: RDP
+  use Constants,                   only: HALF
+  use Standard_Operators_1D,       only: StandardOperators1D
+  use Factored_Matrix_Products_2D, only: AddFactoredMatrixProduct2D
+
+  use ACC_Parameters,              only: ACC_EXEC_QUEUE
+  use Element_Boundary_Parameters, only: N_VERTICES, N_FACES, N_EDGES
+  use Geometry_Coefficients,       only: GetHelmholtzCoefficients
+  implicit none
+  private
+
+  public :: AddHelmholtzOpCondensedPrimary
+
+  !-----------------------------------------------------------------------------
+  !> \brief   Adds the condensed primary part to the result
+  !> \author  Immo Huismann
+
+  interface AddHelmholtzOpCondensedPrimary
+    module procedure AddHelmholtzOpCondensedPrimary_ClassicInterface
+    module procedure AddHelmholtzOpCondensedPrimary_ShortInterface
+  end interface AddHelmholtzOpCondensedPrimary
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief   Calculates the primary part for the condensed system, adds it to r
+!> \author  Immo Huismann
+
+subroutine AddHelmholtzOpCondensedPrimary_ClassicInterface                     &
+    (op,h,lambda,u_f,u_e,u_v,r_f,r_e,r_v,tmp_f_1,tmp_f_2)
+  class(StandardOperators1D), intent(in) :: op     !< 1D operators
+  real(RDP),                  intent(in) :: h(:,:) !< element width
+  real(RDP),                  intent(in) :: lambda !< Helmholtz parameter
+
+  real(RDP), intent(in)    :: u_f(:,:,:,:)         !< u values on faces
+  real(RDP), intent(in)    :: u_e(  :,:,:)         !< u values on edges
+  real(RDP), intent(in)    :: u_v(    :,:)         !< u values on vertices
+
+  real(RDP), intent(inout) :: r_f(:,:,:,:)         !< r values on faces
+  real(RDP), intent(inout) :: r_e(  :,:,:)         !< r values on edges
+  real(RDP), intent(inout) :: r_v(    :,:)         !< r values on vertices
+
+  real(RDP), intent(out)   :: tmp_f_1(:,:,:,:)     !< scratch values on faces
+  real(RDP), intent(out)   :: tmp_f_2(:,:,:,:)     !< scratch values on faces
+
+  integer :: n         ! number of internal points on one edge
+  integer :: n_element ! number of elements
+
+  real(RDP), allocatable, save :: d(:,:) ! element-wise Helmholtz coefficients
+
+  !.............................................................................
+  ! Initialization
+
+  ! Get the problem size
+  n         = size(r_f,1)
+  n_element = size(r_f,4)
+
+  ! allocate and set the Helmholtz coefficients
+  call GetHelmholtzCoefficients(lambda, h, d)
+
+  !$acc data copyin(d) async(ACC_EXEC_QUEUE)
+
+  ! Add the primary parts ......................................................
+  call HelmholtzOpPrimFaceToFace    (n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2)
+  call HelmholtzOpPrimEdgeToFace    (n,n_element,op,d,u_e,r_f)
+  call HelmholtzOpPrimFaceToEdge    (n,n_element,op,d,u_f,r_e)
+  call HelmholtzOpPrimEdgeToEdge    (n,n_element,op,d,u_e,r_e)
+  call HelmholtzOpPrimVertexToEdge  (n,n_element,op,d,u_v,r_e)
+  call HelmholtzOpPrimEdgeToVertex  (n,n_element,op,d,u_e,r_v)
+  call HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u_v,r_v)
+
+  !$acc end data
+
+  ! Deallocate shared variables ................................................
+  !$omp single
+  deallocate(d)
+  !$omp end single
+
+end subroutine AddHelmholtzOpCondensedPrimary_ClassicInterface
+
+!-------------------------------------------------------------------------------
+!> \brief   Calculates the primary part for the condensed system, adds it to r
+!> \author  Immo Huismann
+!>
+!> \details
+!> This version receives the Helmholtz coefficients directly instead of
+!> recalculating them in every iteration.
+
+subroutine AddHelmholtzOpCondensedPrimary_ShortInterface                       &
+    (op,d,u_f,u_e,u_v,r_f,r_e,r_v,tmp_f_1,tmp_f_2)
+  class(StandardOperators1D), intent(in) :: op      !< 1D operators
+  real(RDP),                  intent(in) :: d(0:,:) !< Helmholtz coefficients
+
+  real(RDP), intent(in)    :: u_f(:,:,:,:)          !< u values on faces
+  real(RDP), intent(in)    :: u_e(  :,:,:)          !< u values on edges
+  real(RDP), intent(in)    :: u_v(    :,:)          !< u values on vertices
+
+  real(RDP), intent(inout) :: r_f(:,:,:,:)          !< r values on faces
+  real(RDP), intent(inout) :: r_e(  :,:,:)          !< r values on edges
+  real(RDP), intent(inout) :: r_v(    :,:)          !< r values on vertices
+
+  real(RDP), intent(out)   :: tmp_f_1(:,:,:,:)      !< scratch values on faces
+  real(RDP), intent(out)   :: tmp_f_2(:,:,:,:)      !< scratch values on faces
+
+  integer :: n         ! number of internal points on one edge
+  integer :: n_element ! number of elements
+
+  !.............................................................................
+  ! Initialization
+
+  ! Get the problem size
+  n         = size(r_f,1)
+  n_element = size(r_f,4)
+
+  ! Add the primary parts ......................................................
+  call HelmholtzOpPrimFaceToFace    (n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2)
+  call HelmholtzOpPrimEdgeToFace    (n,n_element,op,d,u_e,r_f)
+  call HelmholtzOpPrimFaceToEdge    (n,n_element,op,d,u_f,r_e)
+  call HelmholtzOpPrimEdgeToEdge    (n,n_element,op,d,u_e,r_e)
+  call HelmholtzOpPrimVertexToEdge  (n,n_element,op,d,u_v,r_e)
+  call HelmholtzOpPrimEdgeToVertex  (n,n_element,op,d,u_e,r_v)
+  call HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u_v,r_v)
+
+end subroutine AddHelmholtzOpCondensedPrimary_ShortInterface
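+
+!-------------------------------------------------------------------------------
+! Usage sketch (all names are hypothetical): the classic interface recomputes
+! the coefficients from h and lambda on every call, while the short interface
+! reuses a precomputed d, e.g. within an iterative solver:
+!
+!   call GetHelmholtzCoefficients(lambda, h, d)  ! once per solve
+!   do it = 1, n_iterations
+!     call AddHelmholtzOpCondensedPrimary &      ! resolves to short interface
+!         (op, d, u_f,u_e,u_v, r_f,r_e,r_v, tmp_f_1,tmp_f_2)
+!   end do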
+
+! Bound to Bound operations ====================================================
+
+!-------------------------------------------------------------------------------
+!> \brief   Calculates vertex to vertex interaction in the Helmholtz operator
+!> \author  Immo Huismann
+!>
+!> \details
+!> Mind that the properties
+!>
+!>    L_{0p} == L_{p0}
+!>    L_{00} == L_{pp}
+!>    w_{0}  == w_{p}
+!>
+!> are being exploited to shorten all terms.
+!> These properties hold for the GLL and GL points (they do for all symmetric
+!> node distributions), but for other basis choices they might not.
+!>
+!> Explicit size for an easier compilation with OpenACC.
+
+subroutine HelmholtzOpPrimVertexToVertex(n,n_element,op,d,u,r)
+  integer,                    intent(in) :: n         !< po - 1
+  integer,                    intent(in) :: n         !< polynomial order - 1
+  class(StandardOperators1D), intent(in) :: op        !< 1D operators
+
+  real(RDP), intent(in)    :: d(0:3,n_element)        !< element Helmholtz coeffs
+  real(RDP), intent(in)    :: u(N_VERTICES,n_element) !< variable u on vertices
+  real(RDP), intent(inout) :: r(N_VERTICES,n_element) !< r = H u, r on vertices
+
+  real(RDP) :: laplace_factor_00, laplace_factor_0p, mass_factor
+  real(RDP) :: L_00, L_0p, w_0          ! extracts from the system matrices
+  real(RDP) :: d_xy_00,d_xz_00, d_yz_00 ! geometry coefficients from 0 to 0
+  real(RDP) :: d_xy_0p,d_xz_0p, d_yz_0p ! geometry coefficients from 0 to p
+  real(RDP) :: d_xyz                    ! coefficient from mass term
+  integer   :: e                        ! element loop index
+
+  ! Calculate the factors common to each element ...............................
+  ! Mind that the properties
+  !
+  !    L_{0p} == L_{p0}
+  !    L_{00} == L_{pp}
+  !    w_{0}  == w_{p}
+  !
+  ! are being exploited to shorten all terms
+
+  L_00 = op%L(0,0  )
+  L_0p = op%L(0,n+1)
+  w_0  = op%w(0)
+
+  mass_factor       = w_0 * w_0 * w_0
+  laplace_factor_00 = w_0 * w_0 * L_00
+  laplace_factor_0p = w_0 * w_0 * L_0p
+
+  ! vertex to vertex interaction ...............................................
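+  !
+  ! For reference, the vertex numbering implied by the dependency comments
+  ! below, with coordinates in {0,p}^3:
+  !
+  !   u(1,e) = (0,0,0)   u(2,e) = (p,0,0)   u(3,e) = (0,p,0)   u(4,e) = (p,p,0)
+  !   u(5,e) = (0,0,p)   u(6,e) = (p,0,p)   u(7,e) = (0,p,p)   u(8,e) = (p,p,p)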
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u,r)
+  !$acc loop
+  !$omp do private(e,d_xyz,d_xy_00,d_xy_0p,d_xz_00,d_xz_0p,d_yz_00,d_yz_0p)
+  do e = 1, n_element
+    ! set the metric coefficients
+    d_xyz   = d(0,e) * mass_factor
+
+    d_xy_00 = d(3,e) * laplace_factor_00
+    d_xy_0p = d(3,e) * laplace_factor_0p
+
+    d_xz_00 = d(2,e) * laplace_factor_00
+    d_xz_0p = d(2,e) * laplace_factor_0p
+
+    d_yz_00 = d(1,e) * laplace_factor_00
+    d_yz_0p = d(1,e) * laplace_factor_0p
+
+    ! r(0,0,0,e)
+    ! depends on       self          u(0,0,0,e) -- u(1,e)
+    !                   + x          u(p,0,0,e) -- u(2,e)
+    !                   + y          u(0,p,0,e) -- u(3,e)
+    !                   + z          u(0,0,p,e) -- u(5,e)
+    r(1,e) =                                       r(1,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(1,e)                      &
+         + (                            d_yz_0p) * u(2,e)                      &
+         + (                 d_xz_0p           ) * u(3,e)                      &
+         + (        d_xy_0p                    ) * u(5,e)
+
+    ! r(p,0,0,e)
+    ! depends on       self          u(p,0,0,e) -- u(2,e)
+    !                   - x          u(0,0,0,e) -- u(1,e)
+    !                   + y          u(p,p,0,e) -- u(4,e)
+    !                   + z          u(p,0,p,e) -- u(6,e)
+    r(2,e) =                                       r(2,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(2,e)                      &
+         + (                            d_yz_0p) * u(1,e)                      &
+         + (                  d_xz_0p          ) * u(4,e)                      &
+         + (        d_xy_0p                    ) * u(6,e)
+
+    ! r(0,p,0,e)
+    ! depends on       self          u(0,p,0,e) -- u(3,e)
+    !                   + x          u(p,p,0,e) -- u(4,e)
+    !                   - y          u(0,0,0,e) -- u(1,e)
+    !                   + z          u(0,p,p,e) -- u(7,e)
+    r(3,e) =                                       r(3,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(3,e)                      &
+         + (                            d_yz_0p) * u(4,e)                      &
+         + (                  d_xz_0p          ) * u(1,e)                      &
+         + (        d_xy_0p                    ) * u(7,e)
+
+    ! r(p,p,0,e)
+    ! depends on       self          u(p,p,0,e) -- u(4,e)
+    !                   - x          u(0,p,0,e) -- u(3,e)
+    !                   - y          u(p,0,0,e) -- u(2,e)
+    !                   + z          u(p,p,p,e) -- u(8,e)
+    r(4,e) =                                       r(4,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(4,e)                      &
+         + (                            d_yz_0p) * u(3,e)                      &
+         + (                  d_xz_0p          ) * u(2,e)                      &
+         + (        d_xy_0p                    ) * u(8,e)
+
+    ! r(0,0,p,e)
+    ! depends on       self          u(0,0,p,e) -- u(5,e)
+    !                   + x          u(p,0,p,e) -- u(6,e)
+    !                   + y          u(0,p,p,e) -- u(7,e)
+    !                   - z          u(0,0,0,e) -- u(1,e)
+    r(5,e) =                                       r(5,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(5,e)                      &
+         + (                            d_yz_0p) * u(6,e)                      &
+         + (                  d_xz_0p          ) * u(7,e)                      &
+         + (        d_xy_0p                    ) * u(1,e)
+
+    ! r(p,0,p,e)
+    ! depends on       self          u(p,0,p,e) -- u(6,e)
+    !                   - x          u(0,0,p,e) -- u(5,e)
+    !                   + y          u(p,p,p,e) -- u(8,e)
+    !                   - z          u(p,0,0,e) -- u(2,e)
+    r(6,e) =                                       r(6,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(6,e)                      &
+         + (                            d_yz_0p) * u(5,e)                      &
+         + (                  d_xz_0p          ) * u(8,e)                      &
+         + (        d_xy_0p                    ) * u(2,e)
+
+    ! r(0,p,p,e)
+    ! depends on       self          u(0,p,p,e) -- u(7,e)
+    !                   + x          u(p,p,p,e) -- u(8,e)
+    !                   - y          u(0,0,p,e) -- u(5,e)
+    !                   - z          u(0,p,0,e) -- u(3,e)
+    r(7,e) =                                       r(7,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(7,e)                      &
+         + (                            d_yz_0p) * u(8,e)                      &
+         + (                 d_xz_0p           ) * u(5,e)                      &
+         + (        d_xy_0p                    ) * u(3,e)
+
+    ! r(p,p,p,e)
+    ! depends on       self          u(p,p,p,e) -- u(8,e)
+    !                   - x          u(0,p,p,e) -- u(7,e)
+    !                   - y          u(p,0,p,e) -- u(6,e)
+    !                   - z          u(p,p,0,e) -- u(4,e)
+    r(8,e) =                                       r(8,e)                      &
+         + (d_xyz + d_xy_00 + d_xz_00 + d_yz_00) * u(8,e)                      &
+         + (                            d_yz_0p) * u(7,e)                      &
+         + (                  d_xz_0p          ) * u(6,e)                      &
+         + (        d_xy_0p                    ) * u(4,e)
+
+  end do
+  !$omp end do
+  !$acc end parallel
+
+
+end subroutine HelmholtzOpPrimVertexToVertex
+
+!-------------------------------------------------------------------------------
+!> \brief   Vertex to edge interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+
+subroutine HelmholtzOpPrimVertexToEdge(n,n_element,op,d,u_v,r_e)
+  integer,   intent(in)    :: n                     !< points per inner line
+  integer,   intent(in)    :: n_element             !< number of elements
+  class(StandardOperators1D), intent(in) :: op      !< polynomial system
+  real(RDP), intent(in)    :: d(0:3,n_element)      !< Helmholtz factors
+  real(RDP), intent(in)    :: u_v(  N_VERTICES,n_element) !< variable u
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,   n_element) !< r = H u
+
+  real(RDP) :: mat_i0(n), mat_ip(n)                     ! matrices to use
+  real(RDP) :: d_1_0, d_1_p, d_2_0, d_2_p, d_3_0, d_3_p ! matrix entry * metric
+  integer   :: i, e                                     ! loop indices
+
+  ! Initialization .............................................................
+
+  ! Set the rows of the matrices
+  mat_i0 =  op%L(1:n,0  ) * op%w(0) * op%w(0)
+  mat_ip =  op%L(1:n,n+1) * op%w(0) * op%w(0)
+
+  ! Computation ................................................................
+
+  ! Scalar to vector -> a perfectly nested loop
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_e)
+  !$acc loop collapse(2) private(d_1_0,d_1_p,d_2_0,d_2_p,d_3_0,d_3_p)
+  !$omp do private(i,e, d_1_0,d_1_p,d_2_0,d_2_p,d_3_0,d_3_p)
+  do e = 1, n_element
+  do i = 1, n
+    ! Compute factors for point and element
+    d_1_0 = d(1,e) * mat_i0(i); d_1_p = d(1,e) * mat_ip(i)
+    d_2_0 = d(2,e) * mat_i0(i); d_2_p = d(2,e) * mat_ip(i)
+    d_3_0 = d(3,e) * mat_i0(i); d_3_p = d(3,e) * mat_ip(i)
+
+    !...........................................................................
+    ! x_1 edges
+    r_e(i, 1,e) = r_e(i, 1,e) + d_1_0 * u_v(1,e) + d_1_p * u_v(2,e)
+    r_e(i, 2,e) = r_e(i, 2,e) + d_1_0 * u_v(3,e) + d_1_p * u_v(4,e)
+    r_e(i, 3,e) = r_e(i, 3,e) + d_1_0 * u_v(5,e) + d_1_p * u_v(6,e)
+    r_e(i, 4,e) = r_e(i, 4,e) + d_1_0 * u_v(7,e) + d_1_p * u_v(8,e)
+
+    !...........................................................................
+    ! x_2 edges
+    r_e(i, 5,e) = r_e(i, 5,e) + d_2_0 * u_v(1,e) + d_2_p * u_v(3,e)
+    r_e(i, 6,e) = r_e(i, 6,e) + d_2_0 * u_v(2,e) + d_2_p * u_v(4,e)
+    r_e(i, 7,e) = r_e(i, 7,e) + d_2_0 * u_v(5,e) + d_2_p * u_v(7,e)
+    r_e(i, 8,e) = r_e(i, 8,e) + d_2_0 * u_v(6,e) + d_2_p * u_v(8,e)
+
+    !...........................................................................
+    ! x_3 edges
+    r_e(i, 9,e) = r_e(i, 9,e) + d_3_0 * u_v(1,e) + d_3_p * u_v(5,e)
+    r_e(i,10,e) = r_e(i,10,e) + d_3_0 * u_v(2,e) + d_3_p * u_v(6,e)
+    r_e(i,11,e) = r_e(i,11,e) + d_3_0 * u_v(3,e) + d_3_p * u_v(7,e)
+    r_e(i,12,e) = r_e(i,12,e) + d_3_0 * u_v(4,e) + d_3_p * u_v(8,e)
+
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimVertexToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Edge to vertex interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+!>
+!> \todo
+!> * maybe think about reimplementing it with a loop over the edges
+!>  * could have fewer loads, though that is probably wasted on this routine
+
+subroutine HelmholtzOpPrimEdgeToVertex(n,n_element,op,d,u_e,r_v)
+  integer,   intent(in)    :: n                           !< points per edge
+  integer,   intent(in)    :: n_element                   !< number of elements
+  class(StandardOperators1D), intent(in) :: op            !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)            !< Helmholtz factors
+  real(RDP), intent(in)    :: u_e(n,N_EDGES,   n_element) !< variable u on edges
+  real(RDP), intent(inout) :: r_v(  N_VERTICES,n_element) !< r = H u on vertices
+
+  real(RDP) :: mat_i0(n), mat_ip(n)  ! operators
+  integer   :: i, e
+
+  ! Initialization .............................................................
+  ! Set the rows of the matrices
+  mat_i0 =  op%L(1:n,0  ) * op%w(0) * op%w(0)
+  mat_ip =  op%L(1:n,n+1) * op%w(0) * op%w(0)
+
+  ! Computation ................................................................
+
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_v)
+  !$acc loop
+  !$omp do private(i,e)
+  do e = 1, n_element
+  do i = 1, n
+    r_v(1,e) = r_v(1,e) + mat_I0(i) * d(1,e) * u_e(i, 1,e)                     &
+                        + mat_I0(i) * d(2,e) * u_e(i, 5,e)                     &
+                        + mat_I0(i) * d(3,e) * u_e(i, 9,e)
+
+    r_v(2,e) = r_v(2,e) + mat_Ip(i) * d(1,e) * u_e(i, 1,e)                     &
+                        + mat_I0(i) * d(2,e) * u_e(i, 6,e)                     &
+                        + mat_I0(i) * d(3,e) * u_e(i,10,e)
+
+    r_v(3,e) = r_v(3,e) + mat_I0(i) * d(1,e) * u_e(i, 2,e)                     &
+                        + mat_Ip(i) * d(2,e) * u_e(i, 5,e)                     &
+                        + mat_I0(i) * d(3,e) * u_e(i,11,e)
+
+    r_v(4,e) = r_v(4,e) + mat_Ip(i) * d(1,e) * u_e(i, 2,e)                     &
+                        + mat_Ip(i) * d(2,e) * u_e(i, 6,e)                     &
+                        + mat_I0(i) * d(3,e) * u_e(i,12,e)
+
+    r_v(5,e) = r_v(5,e) + mat_I0(i) * d(1,e) * u_e(i, 3,e)                     &
+                        + mat_I0(i) * d(2,e) * u_e(i, 7,e)                     &
+                        + mat_Ip(i) * d(3,e) * u_e(i, 9,e)
+
+    r_v(6,e) = r_v(6,e) + mat_Ip(i) * d(1,e) * u_e(i, 3,e)                     &
+                        + mat_I0(i) * d(2,e) * u_e(i, 8,e)                     &
+                        + mat_Ip(i) * d(3,e) * u_e(i,10,e)
+
+    r_v(7,e) = r_v(7,e) + mat_I0(i) * d(1,e) * u_e(i, 4,e)                     &
+                        + mat_Ip(i) * d(2,e) * u_e(i, 7,e)                     &
+                        + mat_Ip(i) * d(3,e) * u_e(i,11,e)
+
+    r_v(8,e) = r_v(8,e) + mat_Ip(i) * d(1,e) * u_e(i, 4,e)                     &
+                        + mat_Ip(i) * d(2,e) * u_e(i, 8,e)                     &
+                        + mat_Ip(i) * d(3,e) * u_e(i,12,e)
+
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimEdgeToVertex
+
+!-------------------------------------------------------------------------------
+!> \brief   Edge to vertex interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+
+subroutine HelmholtzOpPrimEdgeToEdge(n,n_element,op,d,u_e,r_e)
+  integer,   intent(in)    :: n                 !< points per inner line
+  integer,   intent(in)    :: n_element         !< number of elements
+  class(StandardOperators1D), intent(in) :: op  !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)  !< element Helmholtz coeffs
+  real(RDP), intent(in)    :: u_e(n,N_EDGES,n_element) !< variable u on edges
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,n_element) !< r = H u    on edges
+
+  real(RDP) :: w_0            ! first coefficient in mass matrix
+  real(RDP) :: L_00           ! coefficient (0,0) in laplace matrix
+  real(RDP) :: L_0p           ! coefficient (0,p) in laplace matrix
+  real(RDP) :: w_I(n)         ! inner part of mass    matrix
+  real(RDP) :: L_I_w_w(n,n)   ! inner part of laplace matrix * w_0^2
+  real(RDP) :: factors(3)     ! metric factors * matrices
+
+  integer   :: i,j,e
+
+  ! initialization .............................................................
+
+  ! Set the operators
+  w_0  = op%w(0)
+  L_00 = op%L(0,0  )
+  L_0p = op%L(0,n+1)
+  w_I  = op%w(1:n)
+  L_I_w_w  = op%L(1:n,1:n) * w_0 * w_0
+
+  ! Laplace term in edge direction .............................................
+
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e)
+  !$acc loop collapse(2)
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+  do i = 1, n
+    do j = 1, n
+      r_e(i, 1,e) = r_e(i, 1,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 1,e)
+      r_e(i, 2,e) = r_e(i, 2,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 2,e)
+      r_e(i, 3,e) = r_e(i, 3,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 3,e)
+      r_e(i, 4,e) = r_e(i, 4,e) + d(1,e) * L_I_w_w(i,j) * u_e(j, 4,e)
+
+      r_e(i, 5,e) = r_e(i, 5,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 5,e)
+      r_e(i, 6,e) = r_e(i, 6,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 6,e)
+      r_e(i, 7,e) = r_e(i, 7,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 7,e)
+      r_e(i, 8,e) = r_e(i, 8,e) + d(2,e) * L_I_w_w(i,j) * u_e(j, 8,e)
+
+      r_e(i, 9,e) = r_e(i, 9,e) + d(3,e) * L_I_w_w(i,j) * u_e(j, 9,e)
+      r_e(i,10,e) = r_e(i,10,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,10,e)
+      r_e(i,11,e) = r_e(i,11,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,11,e)
+      r_e(i,12,e) = r_e(i,12,e) + d(3,e) * L_I_w_w(i,j) * u_e(j,12,e)
+    end do
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+  ! Laplace term to other edges ................................................
+
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e)
+  !$acc loop collapse(2) private(factors)
+  !$omp do private(i,e,factors)
+  do e = 1, n_element
+  do i = 1, n
+
+    !...........................................................................
+    ! x_1 edges
+    ! Calculate the metric factors, then use them
+    factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(2,e) + d(3,e)))
+    factors(2) = w_I(i) * w_0 * L_0p * d(2,e)
+    factors(3) = w_I(i) * w_0 * L_0p * d(3,e)
+
+    r_e(i, 1,e) =      r_e(i, 1,e)                                              &
+        + factors(1) * u_e(i, 1,e)                                              &
+        + factors(2) * u_e(i, 2,e)                                              &
+        + factors(3) * u_e(i, 3,e)
+
+    r_e(i, 2,e) =      r_e(i, 2,e)                                              &
+        + factors(1) * u_e(i, 2,e)                                              &
+        + factors(2) * u_e(i, 1,e)                                              &
+        + factors(3) * u_e(i, 4,e)
+
+    r_e(i, 3,e) =      r_e(i, 3,e)                                              &
+        + factors(1) * u_e(i, 3,e)                                              &
+        + factors(2) * u_e(i, 4,e)                                              &
+        + factors(3) * u_e(i, 1,e)
+
+    r_e(i, 4,e) =      r_e(i, 4,e)                                              &
+        + factors(1) * u_e(i, 4,e)                                              &
+        + factors(2) * u_e(i, 3,e)                                              &
+        + factors(3) * u_e(i, 2,e)
+
+    !...........................................................................
+    ! x_2 edges
+    ! Calculate the metric factors, then use them
+    factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(1,e) + d(3,e)))
+    factors(2) = w_I(i) * w_0 * L_0p * d(1,e)
+    factors(3) = w_I(i) * w_0 * L_0p * d(3,e)
+
+    r_e(i, 5,e) =      r_e(i, 5,e)                                              &
+        + factors(1) * u_e(i, 5,e)                                              &
+        + factors(2) * u_e(i, 6,e)                                              &
+        + factors(3) * u_e(i, 7,e)
+
+    r_e(i, 6,e) =      r_e(i, 6,e)                                              &
+        + factors(1) * u_e(i, 6,e)                                              &
+        + factors(2) * u_e(i, 5,e)                                              &
+        + factors(3) * u_e(i, 8,e)
+
+    r_e(i, 7,e) =      r_e(i, 7,e)                                              &
+        + factors(1) * u_e(i, 7,e)                                              &
+        + factors(2) * u_e(i, 8,e)                                              &
+        + factors(3) * u_e(i, 5,e)
+
+    r_e(i, 8,e) =      r_e(i, 8,e)                                              &
+        + factors(1) * u_e(i, 8,e)                                              &
+        + factors(2) * u_e(i, 7,e)                                              &
+        + factors(3) * u_e(i, 6,e)
+
+    !...........................................................................
+    ! x_3 edges
+    ! Calculate the metric factors, then use them
+    factors(1) = w_I(i) * w_0 * (d(0,e) * w_0 + L_00 * (d(1,e) + d(2,e)))
+    factors(2) = w_I(i) * w_0 * L_0p * d(1,e)
+    factors(3) = w_I(i) * w_0 * L_0p * d(2,e)
+
+    r_e(i, 9,e) =      r_e(i, 9,e)                                              &
+        + factors(1) * u_e(i, 9,e)                                              &
+        + factors(2) * u_e(i,10,e)                                              &
+        + factors(3) * u_e(i,11,e)
+
+    r_e(i,10,e) =      r_e(i,10,e)                                              &
+        + factors(1) * u_e(i,10,e)                                              &
+        + factors(2) * u_e(i, 9,e)                                              &
+        + factors(3) * u_e(i,12,e)
+
+    r_e(i,11,e) =      r_e(i,11,e)                                              &
+        + factors(1) * u_e(i,11,e)                                              &
+        + factors(2) * u_e(i,12,e)                                              &
+        + factors(3) * u_e(i, 9,e)
+
+    r_e(i,12,e) =      r_e(i,12,e)                                              &
+        + factors(1) * u_e(i,12,e)                                              &
+        + factors(2) * u_e(i,11,e)                                              &
+        + factors(3) * u_e(i,10,e)
+
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimEdgeToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Edge to face interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+
+subroutine HelmholtzOpPrimEdgeToFace(n,n_element,op,d,u_e,r_f)
+  integer,   intent(in)    :: n                     !< points per inner line
+  integer,   intent(in)    :: n_element             !< number of elements
+  class(StandardOperators1D), intent(in) :: op      !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)      !< Helmholtz coefficients
+  real(RDP), intent(in)    :: u_e(  n,N_EDGES,n_element) !< variable u on edges
+  real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< r = H u    on faces
+
+  integer   :: e,i,j                                          ! loop indices
+
+  real(RDP) :: w_I(n), L_I0(n), L_Ip(n)                       ! operators
+
+  ! initialization .............................................................
+
+  ! Set the row matrices
+  w_I  = op%w(1:n)
+  L_I0 = op%L(1:n,0  ) * op%w(0)
+  L_Ip = op%L(1:n,n+1) * op%w(0)
+
+  ! x_1 edges ..................................................................
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_f)
+  !$acc loop collapse(3)
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+  do j = 1, n
+  do i = 1, n
+
+    ! x_1 faces
+    r_f(i,j,1,e) =                        r_f(i,j, 1,e)                        &
+        + d(3,e) * w_I(i  ) * L_I0(  j) * u_e(i,   5,e)                        &
+        + d(3,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   7,e)                        &
+        + d(2,e) * w_I(  j) * L_I0(i  ) * u_e(  j, 9,e)                        &
+        + d(2,e) * w_I(  j) * L_Ip(i  ) * u_e(  j,11,e)
+
+    r_f(i,j,2,e) =                        r_f(i,j, 2,e)                        &
+        + d(3,e) * w_I(i  ) * L_I0(  j) * u_e(i,   6,e)                        &
+        + d(3,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   8,e)                        &
+        + d(2,e) * w_I(  j) * L_I0(i  ) * u_e(  j,10,e)                        &
+        + d(2,e) * w_I(  j) * L_Ip(i  ) * u_e(  j,12,e)
+
+    ! x_2 faces
+    r_f(i,j,3,e) =                        r_f(i,j, 3,e)                        &
+        + d(3,e) * w_I(i  ) * L_I0(  j) * u_e(i,   1,e)                        &
+        + d(3,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   3,e)                        &
+        + d(1,e) * w_I(  j) * L_I0(i  ) * u_e(  j, 9,e)                        &
+        + d(1,e) * w_I(  j) * L_Ip(i  ) * u_e(  j,10,e)
+
+    r_f(i,j,4,e) =                        r_f(i,j, 4,e)                        &
+        + d(3,e) * w_I(i  ) * L_I0(  j) * u_e(i,   2,e)                        &
+        + d(3,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   4,e)                        &
+        + d(1,e) * w_I(  j) * L_I0(i  ) * u_e(  j,11,e)                        &
+        + d(1,e) * w_I(  j) * L_Ip(i  ) * u_e(  j,12,e)
+
+    ! x_3 faces
+    r_f(i,j,5,e) =                        r_f(i,j, 5,e)                        &
+        + d(2,e) * w_I(i  ) * L_I0(  j) * u_e(i,   1,e)                        &
+        + d(2,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   2,e)                        &
+        + d(1,e) * w_I(  j) * L_I0(i  ) * u_e(  j, 5,e)                        &
+        + d(1,e) * w_I(  j) * L_Ip(i  ) * u_e(  j, 6,e)
+
+    r_f(i,j,6,e) =                        r_f(i,j, 6,e)                        &
+        + d(2,e) * w_I(i  ) * L_I0(  j) * u_e(i,   3,e)                        &
+        + d(2,e) * w_I(i  ) * L_Ip(  j) * u_e(i,   4,e)                        &
+        + d(1,e) * w_I(  j) * L_I0(i  ) * u_e(  j, 7,e)                        &
+        + d(1,e) * w_I(  j) * L_Ip(i  ) * u_e(  j, 8,e)
+
+  end do
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimEdgeToFace
+
+!-------------------------------------------------------------------------------
+!> \brief   Face to edge interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+!>
+!> \todo
+!> * Put the computation of d_i into an outer routine
+
+subroutine HelmholtzOpPrimFaceToEdge(n,n_element,op,d,u_f,r_e)
+  integer,   intent(in)    :: n                     !< points per inner line
+  integer,   intent(in)    :: n_element             !< number of elements
+  class(StandardOperators1D), intent(in) :: op      !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)      !< Helmholtz coefficients
+  real(RDP), intent(in)    :: u_f(n,n,N_FACES,n_element) !< variable u on faces
+  real(RDP), intent(inout) :: r_e(  n,N_EDGES,n_element) !< r = H u    on edges
+
+  integer   :: e,i,j                                          ! loop indices
+
+  real(RDP) :: w_I(n), L_0I(n), L_pI(n)                       ! operators
+
+  ! initialization .............................................................
+
+  ! Set the row matrices
+  w_I  = op%w(1:n)
+  L_0I = op%L(0  ,1:n) * op%w(0)
+  L_pI = op%L(n+1,1:n) * op%w(0)
+
+  ! One loop to rule them all ..................................................
+
+  ! A triply nested loop is utilized: one loop for the matrix multiplication,
+  ! two for the points themselves.
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_e)
+  !$acc loop collapse(2)
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+  do i = 1, n
+    do j = 1, n
+      r_e(i, 1,e) =                     r_e(i  , 1,e)                            &
+          + d(2,e) * w_I(i) * L_0I(j) * u_f(i,j, 5,e)                            &
+          + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 3,e)
+
+      r_e(i, 2,e) =                     r_e(i  , 2,e)                            &
+          + d(2,e) * w_I(i) * L_pI(j) * u_f(i,j, 5,e)                            &
+          + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 4,e)
+
+      r_e(i, 3,e) =                     r_e(i  , 3,e)                            &
+          + d(2,e) * w_I(i) * L_0I(j) * u_f(i,j, 6,e)                            &
+          + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 3,e)
+
+      r_e(i, 4,e) =                     r_e(i  , 4,e)                            &
+          + d(2,e) * w_I(i) * L_pI(j) * u_f(i,j, 6,e)                            &
+          + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 4,e)
+
+      r_e(i, 5,e) =                     r_e(i  , 5,e)                            &
+          + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 1,e)                            &
+          + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 5,e)
+
+      r_e(i, 6,e) =                     r_e(i  , 6,e)                            &
+          + d(3,e) * w_I(i) * L_0I(j) * u_f(i,j, 2,e)                            &
+          + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 5,e)
+
+      r_e(i, 7,e) =                     r_e(i  , 7,e)                            &
+          + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 1,e)                            &
+          + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 6,e)
+
+      r_e(i, 8,e) =                     r_e(i  , 8,e)                            &
+          + d(3,e) * w_I(i) * L_pI(j) * u_f(i,j, 2,e)                            &
+          + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 6,e)
+
+      r_e(i, 9,e) =                     r_e(i  , 9,e)                            &
+          + d(2,e) * w_I(i) * L_0I(j) * u_f(j,i, 1,e)                            &
+          + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 3,e)
+
+      r_e(i,10,e) =                     r_e(i  ,10,e)                            &
+          + d(2,e) * w_I(i) * L_0I(j) * u_f(j,i, 2,e)                            &
+          + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 3,e)
+
+      r_e(i,11,e) =                     r_e(i  ,11,e)                            &
+          + d(2,e) * w_I(i) * L_pI(j) * u_f(j,i, 1,e)                            &
+          + d(1,e) * w_I(i) * L_0I(j) * u_f(j,i, 4,e)
+
+      r_e(i,12,e) =                     r_e(i  ,12,e)                            &
+          + d(2,e) * w_I(i) * L_pI(j) * u_f(j,i, 2,e)                            &
+          + d(1,e) * w_I(i) * L_pI(j) * u_f(j,i, 4,e)
+
+    end do
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine HelmholtzOpPrimFaceToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Face to face interaction in the Helmholtz operator r = H u
+!> \author  Immo Huismann
+!>
+!> \todo
+!> * Put the computation of d_i into an outer routine
+!> * Put the treatment of one direction's faces into a separate subroutine
+!>   * leads to a more cache-friendly implementation (most crucial part of this
+!>     module)
+!>
+!> \details
+!> For each face direction a separate loop nest is present for the Laplace
+!> terms. This is due to a compiler bug (with -O3) in ifort.
+
+subroutine HelmholtzOpPrimFaceToFace(n,n_element,op,d,u_f,r_f,tmp_f_1,tmp_f_2)
+  integer,   intent(in)    :: n                    !< points per inner line
+  integer,   intent(in)    :: n_element            !< number of elements
+  class(StandardOperators1D), intent(in) :: op     !< 1D operators
+  real(RDP), intent(in)    :: d(0:3,n_element)     !< Helmholtz factors
+  real(RDP), intent(in)    :: u_f    (n,n,N_FACES,n_element) !< variable u on faces
+  real(RDP), intent(inout) :: r_f    (n,n,N_FACES,n_element) !< r = H u    on edges
+  real(RDP), intent(out)   :: tmp_f_1(n,n,N_FACES,n_element) !< scratch values
+  real(RDP), intent(out)   :: tmp_f_2(n,n,N_FACES,n_element) !< scratch values
+
+  integer   :: i,j ! loop indices
+
+  real(RDP) :: M_II(n,n), w_I(n), L_II(n,n), L_0p, w_0,L_00     ! operators
+
+  ! initialization .............................................................
+
+  ! Set the row matrices
+  w_0  = op%w(0)
+  w_I  = op%w(1:n)
+  L_00 = op%L(0,  0  )
+  L_II = op%L(1:n,1:n)
+  L_0p = op%L(0,  n+1)
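+  ! M_II below is the outer product of w_I with itself, i.e. the inner part
+  ! of the 2D mass matrix on a face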
+  M_II = reshape([((w_I(i) * w_I(j), i = 1, n), j = 1,n)], [n,n])
+
+  ! Work on each of the three directions separately
+  call FaceToFace&
+      (n,n_element,d,w_0,L_00,L_0p,w_I,M_II,L_II,u_f,r_f,tmp_f_1,tmp_f_2)
+
+
+end subroutine HelmholtzOpPrimFaceToFace
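+
+!-------------------------------------------------------------------------------
+! Note: FaceToFace below applies the mass term and the Laplace contributions
+! orthogonal to each face pair directly, while the in-face Laplace terms are
+! delegated to AddFactoredMatrixProduct2D.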
+
+!===============================================================================
+! Suboperators
+
+subroutine FaceToFace&
+    (n,n_element,d,w_0,L_00,L_0p,w_I,M_II,L_I,u_f,r_f,tmp_f_1,tmp_f_2)
+  integer,   intent(in) :: n              !< points per edge
+  integer,   intent(in) :: n_element      !< number of elements
+  real(RDP), intent(in) :: d(0:3,n_element) !< element Helmholtz coefficients
+  real(RDP), intent(in) :: w_0            !< coefficient (0,0) of mass matrix
+  real(RDP), intent(in) :: L_00           !< coefficient (0,0) of laplace matrix
+  real(RDP), intent(in) :: L_0p           !< coefficient (0,p) of laplace matrix
+  real(RDP), intent(in) :: w_I(n)         !< inner part of mass matrix
+  real(RDP), intent(in) :: M_II(n,n)      !< inner part of 2D mass matrix
+  real(RDP), intent(in) :: L_I (n,n)      !< inner part of laplace matrix
+
+  real(RDP), intent(in)    :: u_f    (n,n,N_FACES,n_element)
+  real(RDP), intent(inout) :: r_f    (n,n,N_FACES,n_element)
+  real(RDP), intent(out)   :: tmp_f_1(n,n,N_FACES,n_element)
+  real(RDP), intent(out)   :: tmp_f_2(n,n,N_FACES,n_element)
+
+  ! laplace factors on faces, in first and second dimension on face
+  real(RDP), allocatable, save :: d_f_1(:,:), d_f_2(:,:)
+
+  integer   :: e,i,j
+
+  !.............................................................................
+  ! Initialization of on-face laplace factors
+
+  !$omp single
+  allocate(d_f_1(N_FACES, n_element))
+  allocate(d_f_2(N_FACES, n_element))
+  !$omp end single
+
+  !$omp do private(e)
+  do e = 1, n_element
+    ! metric factor for first dimension of face
+    d_f_1(1, e) = d(2,e)
+    d_f_1(2, e) = d(2,e)
+    d_f_1(3, e) = d(1,e)
+    d_f_1(4, e) = d(1,e)
+    d_f_1(5, e) = d(1,e)
+    d_f_1(6, e) = d(1,e)
+
+    ! metric factor for second dimension of face
+    d_f_2(1, e) = d(3,e)
+    d_f_2(2, e) = d(3,e)
+    d_f_2(3, e) = d(3,e)
+    d_f_2(4, e) = d(3,e)
+    d_f_2(5, e) = d(2,e)
+    d_f_2(6, e) = d(2,e)
+  end do
+  !$omp end do
+
+  !.............................................................................
+  ! Mass term + orthogonal components
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_f)
+  !$acc loop
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+       r_f(i,j,1,e) = r_f(i,j,1,e)                                             &
+           + M_II(i,j) * (d(0,e) * w_0 + d(1,e) * L_00) * u_f(i,j,1,e)         &
+           + M_II(i,j) *                 d(1,e) * L_0p  * u_f(i,j,2,e)
+       r_f(i,j,2,e) = r_f(i,j,2,e)                                             &
+           + M_II(i,j) * (d(0,e) * w_0 + d(1,e) * L_00) * u_f(i,j,2,e)         &
+           + M_II(i,j) *                 d(1,e) * L_0p  * u_f(i,j,1,e)
+    end do
+    end do
+    !$acc end loop
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+      r_f(i,j,3,e) = r_f(i,j,3,e)                                              &
+          + M_II(i,j) * (d(0,e) * w_0 + d(2,e) * L_00) * u_f(i,j,3,e)          &
+          + M_II(i,j) *                 d(2,e) * L_0p  * u_f(i,j,4,e)
+      r_f(i,j,4,e) = r_f(i,j,4,e)                                              &
+          + M_II(i,j) * (d(0,e) * w_0 + d(2,e) * L_00) * u_f(i,j,4,e)          &
+          + M_II(i,j) *                 d(2,e) * L_0p  * u_f(i,j,3,e)
+    end do
+    end do
+    !$acc end loop
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+      r_f(i,j,5,e) = r_f(i,j,5,e)                                              &
+          + M_II(i,j) * (d(0,e) * w_0 + d(3,e) * L_00) * u_f(i,j,5,e)          &
+          + M_II(i,j) *                 d(3,e) * L_0p  * u_f(i,j,6,e)
+      r_f(i,j,6,e) = r_f(i,j,6,e)                                              &
+          + M_II(i,j) * (d(0,e) * w_0 + d(3,e) * L_00) * u_f(i,j,6,e)          &
+          + M_II(i,j) *                 d(3,e) * L_0p  * u_f(i,j,5,e)
+    end do
+    end do
+    !$acc end loop
+  end do
+  !$omp end do
+  !$acc end parallel
+
+  !.............................................................................
+  ! Laplace factors
+
+  ! Laplace in first  direction of face
+  call AddFactoredMatrixProduct2D(L_I,w_I * w_0,d_f_1,u_f,r_f,tmp_f_1,tmp_f_2)
+  ! Laplace in second direction of face
+  call AddFactoredMatrixProduct2D(w_I,L_I * w_0,d_f_2,u_f,r_f,tmp_f_1,tmp_f_2)
+
+  !.............................................................................
+  ! Epilogue
+  !$omp single
+  deallocate(d_f_1,d_f_2)
+  !$omp end single
+
+end subroutine FaceToFace
+
+!===============================================================================
+
+end module Condensed_Primary_Part
diff --git a/Parser/test-data/specht/matrix_operations.f b/Parser/test-data/specht/matrix_operations.f
new file mode 100644
index 0000000..06c9cfd
--- /dev/null
+++ b/Parser/test-data/specht/matrix_operations.f
@@ -0,0 +1,651 @@
+!> \file       matrix_operations.f
+!> \brief      Provides explicit shape routines for simple matrix operations
+!> \author     Immo Huismann
+!> \date       2014/08/14
+!> \copyright  Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> This module provides routines in explicit size for some simple matrix and
+!> array operations.
+!>
+!> Currently, the application of diagonal matrices to matrices, scalar
+!> products, and the transposition of triad dimensions are supported.
+!>
+!> \todo
+!> * Create unit tests.
+!> * OpenACC and OpenMP in unit tests
+!> * Check OpenMP parallelization of the routines
+!===============================================================================
+
+module Matrix_Operations
+  ! HiBASE modules
+  use Kind_Parameters, only: RDP
+
+  ! Specht base modules
+  use ACC_Parameters,  only: ACC_EXEC_QUEUE
+  implicit none
+  private
+
+  public ::        DiagonalMatrixProduct ! application of a diagonal matrix
+  public ::     AddDiagonalMatrixProduct ! as above, but adding to result vector
+  public :: InPlaceDiagonalMatrixProduct ! as first, but in-place operation
+
+  public ::    StackedTranspose          ! transposition of multiple matrices
+  public :: AddStackedTranspose          ! as above, but adding to the result
+
+  !-----------------------------------------------------------------------------
+  !> \brief   A matrix product with a diagonal matrix as operator
+  !> \author  Immo Huismann
+
+  interface DiagonalMatrixProduct
+    module procedure       DiagonalMatrixProduct1
+    module procedure       DiagonalMatrixProduct2
+    module procedure       DiagonalMatrixProduct3
+    module procedure ScaledDiagonalMatrixProduct2
+    module procedure ScaledDiagonalMatrixProduct22
+    module procedure ScaledDiagonalMatrixProduct3
+  end interface DiagonalMatrixProduct
+
+  !-----------------------------------------------------------------------------
+  !> \brief   A matrix product with a diagonal matrix as operator
+  !> \author  Immo Huismann
+
+  interface InPlaceDiagonalMatrixProduct
+    module procedure       InPlaceDiagonalMatrixProduct3
+    module procedure ScaledInPlaceDiagonalMatrixProduct3
+  end interface InPlaceDiagonalMatrixProduct
+
+  !-----------------------------------------------------------------------------
+  !> \brief   An additive matrix product with a diagonal matrix as operator
+  !> \author  Immo Huismann
+
+  interface AddDiagonalMatrixProduct
+    module procedure AddDiagonalMatrixProduct1
+    module procedure AddDiagonalMatrixProduct2
+    module procedure AddDiagonalMatrixProduct3
+    module procedure AddScaledDiagonalMatrixProduct2
+  end interface AddDiagonalMatrixProduct
+
+contains
+
+! Wrappers =====================================================================
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine DiagonalMatrixProduct1(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:)       !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,2),size(values,3),size(values,4)])
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct1
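+
+!-------------------------------------------------------------------------------
+!> \brief   Usage sketch for the generic interface (editor's illustration)
+!> \details
+!> Hypothetical example, not part of the original module: it shows how the
+!> generic interface DiagonalMatrixProduct resolves on the rank of diag. The
+!> rank-1 overload flattens the trailing dimensions of values into m and hands
+!> the arrays to the explicit-size kernel via sequence association. The names
+!> n, n_element, w, u and r are assumptions for illustration only.
+
+subroutine Example_DiagonalMatrixProduct_Usage(n,n_element,w,u,r)
+  integer,   intent(in)  :: n                  !< points per direction
+  integer,   intent(in)  :: n_element          !< number of elements
+  real(RDP), intent(in)  :: w(n)               !< rank-1 diagonal
+  real(RDP), intent(in)  :: u(n,n,n,n_element) !< input data
+  real(RDP), intent(out) :: r(n,n,n,n_element) !< result
+
+  ! Rank-1 diag with rank-4 arrays resolves to DiagonalMatrixProduct1, which
+  ! computes k = n and m = n * n * n_element before calling the kernel.
+  call DiagonalMatrixProduct(w,u,r)
+
+end subroutine Example_DiagonalMatrixProduct_Usage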
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine DiagonalMatrixProduct2(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)     !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,3),size(values,4)])
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine DiagonalMatrixProduct3(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine DiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values in place
+!> \author  Immo Huismann
+
+subroutine InPlaceDiagonalMatrixProduct3(diag,values)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(inout) :: values(:,:,:,:) !< Input data
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call InPlaceDiagonalMatrixProduct_ES(k,m,diag,values)
+
+end subroutine InPlaceDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a scaled diagonal matrix to the values in place
+!> \author  Immo Huismann
+
+subroutine ScaledInPlaceDiagonalMatrixProduct3(diag,scales,values)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales(      :) !< elementwise scale
+  real(RDP), intent(inout) :: values(:,:,:,:) !< Input data
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledInPlaceDiagonalMatrixProduct_ES(k,m,diag,scales,values)
+
+end subroutine ScaledInPlaceDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a scaled diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct2(diag,scales,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales    (:) !< Scaling factors
+  real(RDP), intent(in)    :: values(:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a scaled diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct22(diag,scales,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)     !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales    (:,:) !< Scaling factors
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct22
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a scaled diagonal matrix to the values
+!> \author  Immo Huismann
+
+subroutine ScaledDiagonalMatrixProduct3(diag,scales,values,res)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales      (:) !< Scaling factors
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(  out) :: res   (:,:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call ScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine ScaledDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values, adding to the result
+!> \author  Immo Huismann
+
+subroutine AddDiagonalMatrixProduct1(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:)       !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,2),size(values,3),size(values,4)])
+
+  call AddDiagonalMatrixProduct_ES (k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct1
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values, adding to the result
+!> \author  Immo Huismann
+
+subroutine AddDiagonalMatrixProduct2(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)     !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = product([size(values,3),size(values,4)])
+
+  call AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct2
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a diagonal matrix to the values, adding to the result
+!> \author  Immo Huismann
+
+subroutine AddDiagonalMatrixProduct3(diag,values,res)
+  real(RDP), intent(in)    :: diag  (:,:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(:,:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:,:) !< Matrix to store the result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(values,4)
+
+  call AddDiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+end subroutine AddDiagonalMatrixProduct3
+
+!-------------------------------------------------------------------------------
+!> \brief   Applies a scaled diagonal matrix to the values, adding to the result
+!> \author  Immo Huismann
+
+subroutine AddScaledDiagonalMatrixProduct2(diag,scales,values,res)
+  real(RDP), intent(in)    :: diag  (:,:)   !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales    (:) !< Scaling factors
+  real(RDP), intent(in)    :: values(:,:,:) !< Input data
+  real(RDP), intent(inout) :: res   (:,:,:) !< Matrix to store result in
+
+  integer :: k,m
+
+  k = size(diag)
+  m = size(scales)
+
+  call AddScaledDiagonalMatrixProduct_ES(k,m,diag,scales,values,res)
+
+end subroutine AddScaledDiagonalMatrixProduct2
+
+! Implementations ==============================================================
+
+!-------------------------------------------------------------------------------
+!> \brief   Provides a diagonal matrix multiplication
+!> \author  Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather, one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine DiagonalMatrixProduct_ES (k,m,diag,values,res)
+  integer  , intent(in)    :: k           !< size of first  dim of matrix A
+  integer  , intent(in)    :: m           !< size of second dim of B
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(k,m) !< Input data
+  real(RDP), intent(  out) :: res   (k,m) !< Matrix to store the result in
+
+  integer   :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting before the
+  ! copy has finished.
+
+  !$acc data present(values,res) copyin(diag) async(ACC_EXEC_QUEUE)
+
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop independent collapse(2) private(i,j)
+  !$omp do private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    res(i,j) = diag(i) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine DiagonalMatrixProduct_ES
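+
+!-------------------------------------------------------------------------------
+!> \brief   Synchronization sketch for the async queue (editor's illustration)
+!> \details
+!> Hypothetical example, not part of the original module: the data and parallel
+!> constructs above are queued with async(ACC_EXEC_QUEUE), so a host-side
+!> consumer has to wait on that queue before reading the result (assuming
+!> values and res are already present on the device). All names are
+!> assumptions for illustration only.
+
+subroutine Example_WaitForDiagonalProduct(k,m,diag,values,res)
+  integer,   intent(in)  :: k, m
+  real(RDP), intent(in)  :: diag  (k)
+  real(RDP), intent(in)  :: values(k,m)
+  real(RDP), intent(out) :: res   (k,m)
+
+  call DiagonalMatrixProduct_ES(k,m,diag,values,res)
+
+  ! Block until all work queued on ACC_EXEC_QUEUE has finished, so that res
+  ! can safely be read on the host afterwards.
+  !$acc wait(ACC_EXEC_QUEUE)
+
+end subroutine Example_WaitForDiagonalProduct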
+
+!-------------------------------------------------------------------------------
+!> \brief   Provides a diagonal matrix multiplication
+!> \author  Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather, one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine InPlaceDiagonalMatrixProduct_ES (k,m,diag,values)
+  integer  , intent(in)    :: k           !< size of first  dim of matrix A
+  integer  , intent(in)    :: m           !< size of second dim of B
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(inout) :: values(k,m) !< data to work on
+
+  integer   :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting before the
+  ! copy has finished.
+
+  !$acc data present(values) copyin(diag) async(ACC_EXEC_QUEUE)
+  !$acc parallel                          async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do               private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    values(i,j) = diag(i) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine InPlaceDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief   Provides a diagonal matrix multiplication
+!> \author  Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather, one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine ScaledDiagonalMatrixProduct_ES (k,m,diag,factors,values,res)
+  integer  , intent(in)    :: k            !< size of first  dim of matrix A
+  integer  , intent(in)    :: m            !< size of second dim of B
+  real(RDP), intent(in)    :: diag   (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: factors(  m) !< Scaling factors
+  real(RDP), intent(in)    :: values (k,m) !< Input data
+  real(RDP), intent(  out) :: res    (k,m) !< Matrix to store the result in
+
+  integer   :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting before the
+  ! copy has finished.
+
+  !$acc data present(values,res) copyin(diag, factors) async(ACC_EXEC_QUEUE)
+  !$acc parallel                                       async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do               private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    res(i,j) = diag(i) * factors(j) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine ScaledDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief   Provides a diagonal matrix multiplication
+!> \author  Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather, one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine AddScaledDiagonalMatrixProduct_ES (k,m,diag,factors,values,res)
+  integer  , intent(in)    :: k            !< size of first  dim of matrix A
+  integer  , intent(in)    :: m            !< size of second dim of B
+  real(RDP), intent(in)    :: diag   (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: factors(  m) !< Scaling factors
+  real(RDP), intent(in)    :: values (k,m) !< Input data
+  real(RDP), intent(inout) :: res    (k,m) !< Matrix to store the result in
+
+  integer   :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting before the
+  ! copy has finished.
+
+  !$acc data present(values,res) copyin(diag, factors) async(ACC_EXEC_QUEUE)
+  !$acc parallel                                       async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do               private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    res(i,j) = res(i,j) + diag(i) * factors(j) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddScaledDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief   Provides a diagonal matrix multiplication
+!> \author  Immo Huismann
+!>
+!> \details
+!> Explicit size application of a diagonal matrix on a vector.
+!> As it is implemented in explicit size with only two array dimensions instead
+!> of four or six, it is easily vectorized by the compiler.
+!>
+!> This routine is not designed to be used directly. Rather, one of the wrappers
+!> with explicit shape should be used. Mind that this routine is not
+!> thread-safe, as it uses OpenACC and OpenMP to parallelize the operations.
+
+subroutine ScaledInPlaceDiagonalMatrixProduct_ES (k,m,diag,scales,values)
+  integer  , intent(in)    :: k           !< size of first  dim of matrix A
+  integer  , intent(in)    :: m           !< size of second dim of B
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: scales(  m) !< elementwise scale
+  real(RDP), intent(inout) :: values(k,m) !< data to work on
+
+  integer   :: i,j ! loop indices
+
+  ! Mind that the copy of the diag matrix is not allowed in the data statement,
+  ! unless the data statement receives the 'async' keyword.
+  ! Omitting the async might lead to the parallel region starting before the
+  ! copy has finished.
+
+  !$acc data present(values) copyin(diag,scales) async(ACC_EXEC_QUEUE)
+  !$acc parallel                                 async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do               private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    values(i,j) = diag(i) * scales(j) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+  !$acc end data
+
+end subroutine ScaledInPlaceDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> Same as DiagonalMatrixProduct_ES, except that the product of diag and values
+!> is added to res instead of overwriting it.
+
+subroutine AddDiagonalMatrixProduct_ES (k,m,diag,values,res)
+  integer  , intent(in)    :: k           !< size of first  dim of matrix A
+  integer  , intent(in)    :: m           !< size of second dim of B
+  real(RDP), intent(in)    :: diag  (k  ) !< Diagonal matrix to apply to the values
+  real(RDP), intent(in)    :: values(k,m) !< Input data
+  real(RDP), intent(inout) :: res   (k,m) !< Matrix to store the result in
+
+  integer   :: i,j ! loop indices
+
+  ! copying in the asynchronous queue ACC_EXEC_QUEUE, so that the computation
+  ! is not hindered
+  !$acc data present(values,res) copyin(diag) async(ACC_EXEC_QUEUE)
+  !$acc parallel                              async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(2) private(i,j)
+  !$omp do               private(i,j)
+  do j = 1, m
+  do i = 1, k
+    ! point-wise multiplication
+    res(i,j) = res(i,j) + diag(i) * values(i,j)
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddDiagonalMatrixProduct_ES
+
+!-------------------------------------------------------------------------------
+!> \brief   A transpose operation working on the first two dimensions of a triad
+!> \author  Immo Huismann
+!>
+!> \details
+!> By itself, this routine is useless. Its goal lies in providing a means of
+!> directly working on different dimensions of a data set by using the first
+!> dimension. This in turn enables better performance, as the cache lines are
+!> fully utilized, or on a GPGPU the memory accesses are coalesced.
+
+subroutine StackedTranspose(n_1,n_2,n_stack,in,out)
+  integer  , intent(in)  :: n_1     !< first  dimension of in, second of out
+  integer  , intent(in)  :: n_2     !< second dimension of in, first  of out
+  integer  , intent(in)  :: n_stack !< stacked dimension of the transpose
+
+  real(RDP), intent(in)  ::  in(n_1, n_2, n_stack) !< input vector
+  real(RDP), intent(out) :: out(n_2, n_1, n_stack) !< its transpose
+
+  integer :: i,j,k ! loop indices
+
+
+  ! Transposing the first two dimensions. As typically storing data is more
+  ! costly than loading data, the j loop is chosen as the inner loop. This
+  ! delivers slightly better performance, as the memory writes are aligned.
+
+  !$acc data present(in,out)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(3) private(i,j,k)
+  !$omp do               private(i,j,k)
+  do k = 1,n_stack
+    do i = 1, n_1
+    do j = 1, n_2
+      out(j,i,k) = in(i,j,k)
+    end do
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine StackedTranspose
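+
+!-------------------------------------------------------------------------------
+!> \brief   Usage sketch for StackedTranspose (editor's illustration)
+!> \details
+!> Hypothetical example, not part of the original module: transposing the first
+!> two dimensions of a stack of matrices exposes the second dimension as the
+!> contiguous one, e.g. before applying a one-dimensional operator to it. All
+!> names are assumptions for illustration only.
+
+subroutine Example_StackedTranspose_Usage(n_1,n_2,n_stack,u,u_T)
+  integer,   intent(in)  :: n_1, n_2, n_stack
+  real(RDP), intent(in)  :: u  (n_1,n_2,n_stack)
+  real(RDP), intent(out) :: u_T(n_2,n_1,n_stack)
+
+  ! u_T(j,i,k) = u(i,j,k) for every stacked matrix k
+  call StackedTranspose(n_1,n_2,n_stack,u,u_T)
+
+end subroutine Example_StackedTranspose_Usage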
+
+!-------------------------------------------------------------------------------
+!> \brief   A transpose operation working on the first two dimensions of a triad.
+!> \author  Immo Huismann
+!>
+!> \details
+!> By itself, this routine is useless. Its goal lies in providing a means of
+!> directly working on different dimensions of a data set by using the first
+!> dimension. This in turn enables better performance, as the cache lines are
+!> fully utilized, or on a GPGPU the memory accesses are coalesced.
+
+subroutine AddStackedTranspose(n_1,n_2,n_stack,in,res)
+  integer  , intent(in ) :: n_1     !< first  dimension of in, second of out
+  integer  , intent(in ) :: n_2     !< second dimension of in, first  of out
+  integer  , intent(in ) :: n_stack !< stacked dimension of the transpose
+
+  real(RDP), intent(in )   ::  in(n_1, n_2, n_stack) !< input vector
+  real(RDP), intent(inout) :: res(n_2, n_1, n_stack) !< its transpose
+
+  integer :: i,j,k ! loop indices
+
+  ! Transposing the first two dimensions. As typically storing data is more
+  ! costly than loading data, the j loop is chosen as the inner loop. This
+  ! delivers slightly better performance, as the memory writes are aligned.
+
+  !$acc data present(in,res)
+  !$acc parallel async(ACC_EXEC_QUEUE)
+
+  !$acc loop collapse(3) private(i,j,k)
+  !$omp do               private(i,j,k)
+  do k = 1,n_stack
+    do i = 1, n_1
+    do j = 1, n_2
+      res(j,i,k) = res(j,i,k) + in(i,j,k)
+    end do
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+
+  !$acc end parallel
+  !$acc end data
+
+end subroutine AddStackedTranspose
+
+!===============================================================================
+
+end module Matrix_Operations
diff --git a/Parser/test-data/specht/tiny_matrix_products_explicit.f b/Parser/test-data/specht/tiny_matrix_products_explicit.f
new file mode 100644
index 0000000..5f66425
--- /dev/null
+++ b/Parser/test-data/specht/tiny_matrix_products_explicit.f
@@ -0,0 +1,1386 @@
+!> \file       tiny_matrix_products_explicit.f
+!> \brief      Matrix products for known matrix sizes
+!> \author     Immo Huismann
+!> \date       2015/04/20
+!> \copyright  Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!===============================================================================
+
+module Tiny_Matrix_Products_Explicit
+  use Kind_Parameters, only: RDP
+  use ACC_Parameters,  only: ACC_EXEC_QUEUE
+  implicit none
+  private
+
+  public ::    SquareTransposeMatrixProduct_Generated
+  public :: AddSquareTransposeMatrixProduct_Generated
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C, generated version choosing different implementations
+!> \author  Immo Huismann
+
+subroutine AddSquareTransposeMatrixProduct_Generated(k,m,A,B,C)
+  integer,   intent(in)    :: k      !< Matrix dim.
+  integer,   intent(in)    :: m      !< number of vectors
+  real(RDP), intent(in)    :: A(k,k) !< matrix to multiply with
+  real(RDP), intent(in)    :: B(k,m) !< matrix to multiply
+  real(RDP), intent(inout) :: C(k,m) !< result
+
+  select case(k)
+  case( 1)
+    call AddSquareTransposeMatrixProduct_01(m,A,B,C)
+  case( 2)
+    call AddSquareTransposeMatrixProduct_02(m,A,B,C)
+  case( 3)
+    call AddSquareTransposeMatrixProduct_03(m,A,B,C)
+  case( 4)
+    call AddSquareTransposeMatrixProduct_04(m,A,B,C)
+  case( 5)
+    call AddSquareTransposeMatrixProduct_05(m,A,B,C)
+  case( 6)
+    call AddSquareTransposeMatrixProduct_06(m,A,B,C)
+  case( 7)
+    call AddSquareTransposeMatrixProduct_07(m,A,B,C)
+  case( 8)
+    call AddSquareTransposeMatrixProduct_08(m,A,B,C)
+  case( 9)
+    call AddSquareTransposeMatrixProduct_09(m,A,B,C)
+  case(10)
+    call AddSquareTransposeMatrixProduct_10(m,A,B,C)
+  case(11)
+    call AddSquareTransposeMatrixProduct_11(m,A,B,C)
+  case(12)
+    call AddSquareTransposeMatrixProduct_12(m,A,B,C)
+  case(13)
+    call AddSquareTransposeMatrixProduct_13(m,A,B,C)
+  case(14)
+    call AddSquareTransposeMatrixProduct_14(m,A,B,C)
+  case(15)
+    call AddSquareTransposeMatrixProduct_15(m,A,B,C)
+  case(16)
+    call AddSquareTransposeMatrixProduct_16(m,A,B,C)
+  case default
+    call AddSquareTransposeMatrixProduct_XX(k,m,A,B,C)
+  end select
+
+end subroutine AddSquareTransposeMatrixProduct_Generated
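+
+!-------------------------------------------------------------------------------
+!> \brief   Usage sketch for the generated dispatcher (editor's illustration)
+!> \details
+!> Hypothetical example, not part of the original module: for k <= 16 the
+!> dispatcher selects a fully unrolled kernel whose loop bound is a
+!> compile-time constant; larger sizes fall back to the generic _XX kernel.
+!> All names are assumptions for illustration only.
+
+subroutine Example_AddSquareTransposeMatrixProduct_Usage(k,m,A,B,C)
+  integer,   intent(in)    :: k      !< matrix dimension
+  integer,   intent(in)    :: m      !< number of vectors
+  real(RDP), intent(in)    :: A(k,k) !< operator matrix
+  real(RDP), intent(in)    :: B(k,m) !< input vectors
+  real(RDP), intent(inout) :: C(k,m) !< accumulated result
+
+  ! e.g. k = 8 dispatches to the unrolled AddSquareTransposeMatrixProduct_08
+  call AddSquareTransposeMatrixProduct_Generated(k,m,A,B,C)
+
+end subroutine Example_AddSquareTransposeMatrixProduct_Usage
+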
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size k x k
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_XX(k,m,A_T,B,C)
+  integer,   intent(in)    :: k
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(k,k)
+  real(RDP), intent(in)    :: B  (k,m)
+  real(RDP), intent(inout) :: C  (k,m)
+
+  integer   :: i, j, l ! loop indices
+  real(RDP) :: tmp     ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,l,tmp) gang worker vector
+  !$omp do private(i,j,l,tmp)
+  do j = 1, m
+  do i = 1, k
+    tmp = 0
+    do l = 1, k
+      tmp = tmp + A_T(l,i) * B(l,j)
+    end do
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_XX
+
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 1 x 1
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_01(m,A_T,B,C)
+  integer,   parameter   :: N = 1
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_01
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 2 x 2
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_02(m,A_T,B,C)
+  integer,   parameter   :: N = 2
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_02
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 3 x 3
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_03(m,A_T,B,C)
+  integer,   parameter   :: N = 3
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_03
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 4 x 4
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_04(m,A_T,B,C)
+  integer,   parameter   :: N = 4
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_04
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 5 x 5
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_05(m,A_T,B,C)
+  integer,   parameter   :: N = 5
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_05
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 6 x 6
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_06(m,A_T,B,C)
+  integer,   parameter   :: N = 6
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_06
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 7 x 7
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_07(m,A_T,B,C)
+  integer,   parameter   :: N = 7
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_07
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 8 x 8
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_08(m,A_T,B,C)
+  integer,   parameter   :: N = 8
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_08
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 9 x 9
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_09(m,A_T,B,C)
+  integer,   parameter   :: N = 9
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_09
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 10 x 10
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_10(m,A_T,B,C)
+  integer,   parameter   :: N = 10
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_10
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 11 x 11
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_11(m,A_T,B,C)
+  integer,   parameter   :: N = 11
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_11
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 12 x 12
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_12(m,A_T,B,C)
+  integer,   parameter   :: N = 12
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_12
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 13 x 13
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_13(m,A_T,B,C)
+  integer,   parameter   :: N = 13
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_13
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 14 x 14
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_14(m,A_T,B,C)
+  integer,   parameter   :: N = 14
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_14
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 15 x 15
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_15(m,A_T,B,C)
+  integer,   parameter   :: N = 15
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j) &
+           + A_T(15,i) * B(15,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_15
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B + C for small square matrix A of size 16 x 16
+!> \author  Immo Huismann
+subroutine AddSquareTransposeMatrixProduct_16(m,A_T,B,C)
+  integer,   parameter   :: N = 16
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(inout) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j) &
+           + A_T(15,i) * B(15,j) &
+           + A_T(16,i) * B(16,j)
+
+    C(i,j) = C(i,j) + tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine AddSquareTransposeMatrixProduct_16
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B, generated version choosing different implementations
+!> \author  Immo Huismann
+
+subroutine SquareTransposeMatrixProduct_Generated(k,m,A,B,C)
+  integer,   intent(in)    :: k      !< Matrix dim.
+  integer,   intent(in)    :: m      !< number of vectors
+  real(RDP), intent(in)    :: A(k,k) !< matrix to multiply with
+  real(RDP), intent(in)    :: B(k,m) !< matrix to multiply
+  real(RDP), intent(out)   :: C(k,m) !< result
+
+  select case(k)
+  case( 1)
+    call SquareTransposeMatrixProduct_01(m,A,B,C)
+  case( 2)
+    call SquareTransposeMatrixProduct_02(m,A,B,C)
+  case( 3)
+    call SquareTransposeMatrixProduct_03(m,A,B,C)
+  case( 4)
+    call SquareTransposeMatrixProduct_04(m,A,B,C)
+  case( 5)
+    call SquareTransposeMatrixProduct_05(m,A,B,C)
+  case( 6)
+    call SquareTransposeMatrixProduct_06(m,A,B,C)
+  case( 7)
+    call SquareTransposeMatrixProduct_07(m,A,B,C)
+  case( 8)
+    call SquareTransposeMatrixProduct_08(m,A,B,C)
+  case( 9)
+    call SquareTransposeMatrixProduct_09(m,A,B,C)
+  case(10)
+    call SquareTransposeMatrixProduct_10(m,A,B,C)
+  case(11)
+    call SquareTransposeMatrixProduct_11(m,A,B,C)
+  case(12)
+    call SquareTransposeMatrixProduct_12(m,A,B,C)
+  case(13)
+    call SquareTransposeMatrixProduct_13(m,A,B,C)
+  case(14)
+    call SquareTransposeMatrixProduct_14(m,A,B,C)
+  case(15)
+    call SquareTransposeMatrixProduct_15(m,A,B,C)
+  case(16)
+    call SquareTransposeMatrixProduct_16(m,A,B,C)
+  case default
+    call SquareTransposeMatrixProduct_XX(k,m,A,B,C)
+  end select
+
+end subroutine SquareTransposeMatrixProduct_Generated
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size k x k
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_XX(k,m,A_T,B,C)
+  integer,   intent(in)    :: k
+  integer,   intent(in)    :: m
+  real(RDP), intent(in)    :: A_T(k,k)
+  real(RDP), intent(in)    :: B  (k,m)
+  real(RDP), intent(out)   :: C  (k,m)
+
+  integer   :: i, j, l ! loop indices
+  real(RDP) :: tmp     ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,l,tmp) gang worker vector
+  !$omp do private(i,j,l,tmp)
+  do j = 1, m
+  do i = 1, k
+    tmp = 0
+    do l = 1, k
+      tmp = tmp + A_T(l,i) * B(l,j)
+    end do
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_XX
+
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 1 x 1
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_01(m,A_T,B,C)
+  integer,   parameter   :: N = 1
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_01
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 2 x 2
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_02(m,A_T,B,C)
+  integer,   parameter   :: N = 2
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_02
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 3 x 3
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_03(m,A_T,B,C)
+  integer,   parameter   :: N = 3
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_03
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 4 x 4
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_04(m,A_T,B,C)
+  integer,   parameter   :: N = 4
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_04
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 5 x 5
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_05(m,A_T,B,C)
+  integer,   parameter   :: N = 5
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_05
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 6 x 6
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_06(m,A_T,B,C)
+  integer,   parameter   :: N = 6
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_06
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 7 x 7
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_07(m,A_T,B,C)
+  integer,   parameter   :: N = 7
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_07
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 8 x 8
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_08(m,A_T,B,C)
+  integer,   parameter   :: N = 8
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_08
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 9 x 9
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_09(m,A_T,B,C)
+  integer,   parameter   :: N = 9
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_09
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 10 x 10
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_10(m,A_T,B,C)
+  integer,   parameter   :: N = 10
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_10
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 11 x 11
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_11(m,A_T,B,C)
+  integer,   parameter   :: N = 11
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_11
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 12 x 12
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_12(m,A_T,B,C)
+  integer,   parameter   :: N = 12
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_12
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 13 x 13
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_13(m,A_T,B,C)
+  integer,   parameter   :: N = 13
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_13
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 14 x 14
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_14(m,A_T,B,C)
+  integer,   parameter   :: N = 14
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_14
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 15 x 15
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_15(m,A_T,B,C)
+  integer,   parameter   :: N = 15
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j) &
+           + A_T(15,i) * B(15,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_15
+
+!-------------------------------------------------------------------------------
+!> \brief   C <- A B for small square matrix A of size 16 x 16
+!> \author  Immo Huismann
+subroutine SquareTransposeMatrixProduct_16(m,A_T,B,C)
+  integer,   parameter   :: N = 16
+  integer,   intent(in)  :: m
+  real(RDP), intent(in)  :: A_T(N,N)
+  real(RDP), intent(in)  :: B  (N,m)
+  real(RDP), intent(out) :: C  (N,m)
+
+  integer   :: i, j ! loop indices
+  real(RDP) :: tmp  ! reduction variable
+
+  !$acc parallel present(B,C) copyin(A_T) async(ACC_EXEC_QUEUE)
+  !$acc loop collapse(2) private(i,j,tmp)
+  !$omp do private(i,j,tmp)
+  do j = 1, m
+  do i = 1, N
+    tmp    =                     &
+           + A_T( 1,i) * B( 1,j) &
+           + A_T( 2,i) * B( 2,j) &
+           + A_T( 3,i) * B( 3,j) &
+           + A_T( 4,i) * B( 4,j) &
+           + A_T( 5,i) * B( 5,j) &
+           + A_T( 6,i) * B( 6,j) &
+           + A_T( 7,i) * B( 7,j) &
+           + A_T( 8,i) * B( 8,j) &
+           + A_T( 9,i) * B( 9,j) &
+           + A_T(10,i) * B(10,j) &
+           + A_T(11,i) * B(11,j) &
+           + A_T(12,i) * B(12,j) &
+           + A_T(13,i) * B(13,j) &
+           + A_T(14,i) * B(14,j) &
+           + A_T(15,i) * B(15,j) &
+           + A_T(16,i) * B(16,j)
+
+    C(i,j) = tmp
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine SquareTransposeMatrixProduct_16
+
+!===============================================================================
+
+end module Tiny_Matrix_Products_Explicit
diff --git a/Parser/test-data/specht/transformed_condensed_part.f b/Parser/test-data/specht/transformed_condensed_part.f
new file mode 100644
index 0000000..e6d4828
--- /dev/null
+++ b/Parser/test-data/specht/transformed_condensed_part.f
@@ -0,0 +1,151 @@
+!> \file      transformed_condensed_part.f
+!> \brief     Condensed part for condensed face eigenspace solver
+!> \author    Immo Huismann
+!> \date      2015/05/23
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> This module provides the condensed part for the condensed system eigenspace
+!> solver.
+!===============================================================================
+
+module Transformed_Condensed_Part
+  use Kind_Parameters,             only: RDP
+  use Constants,                   only: HALF
+
+  use ACC_Parameters,              only: ACC_EXEC_QUEUE
+  use Condensed_SEM_Operators,     only: CondensedOperators1D
+  use Geometry_Coefficients,       only: GetLaplaceCoefficients
+  use Element_Boundary_Parameters, only: N_FACES
+  implicit none
+  private
+
+  public :: SubtractTransformedCondensedPart
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief   Subtracts the condensed part of the eigenspace system from r.
+!> \author  Immo Huismann
+!>
+!> \details
+!> Serves as a wrapper around the worker routine, with far fewer arguments.
+
+subroutine SubtractTransformedCondensedPart(cop,d,D_inv,u_f,r_f)
+  class(CondensedOperators1D), intent(in) :: cop !< condensed operators
+  real(RDP), intent(in)    :: d(0:,:)            !< Helmholtz parameters
+  real(RDP), intent(in)    :: D_inv(:,:,:,:)     !< eigenvalues of iHii
+  real(RDP), intent(in)    :: u_f(:,:,:,:)       !< variable u on faces
+  real(RDP), intent(inout) :: r_f(:,:,:,:)       !< result on faces
+
+  integer :: n, n_element         ! points per edge, number of elements
+
+  ! Prologue ...................................................................
+  n         = size(D_inv,1)
+  n_element = size(D_inv,4)
+
+  ! Computation ................................................................
+
+  ! Map from the faces to the inner element eigenspace. This is done with the
+  ! transpose of the transformation matrix and the Helmholtz suboperator
+  ! corresponding to the specific face.
+
+  call Operator(n,n_element,cop%S_T_L_I0,cop%S_T_L_Ip,d,u_f,D_inv,r_f)
+
+end subroutine SubtractTransformedCondensedPart
+
+!-------------------------------------------------------------------------------
+!> \brief   Explicit size implementation of the transformed condensed part
+!> \author  Immo Huismann
+!>
+!> \details
+!> This implementation uses a small working set (one temporary), leading to few
+!> load and store operations from memory.
+
+subroutine Operator(n,n_element,S_T_L_I0,S_T_L_Ip,d,u_f,D_inv,r_f)
+  integer,   intent(in)    :: n                !< points per edge
+  integer,   intent(in)    :: n_element        !< number of elements
+  real(RDP), intent(in)    :: S_T_L_I0(n)      !< \f$S^{T} L_{I0}\f$
+  real(RDP), intent(in)    :: S_T_L_Ip(n)      !< \f$S^{T} L_{Ip}\f$
+  real(RDP), intent(in)    :: d(0:3,n_element) !< Helmholtz coefficients
+  real(RDP), intent(in)    :: u_f  (  n,n,N_FACES,n_element) !< u face values
+  real(RDP), intent(in)    :: D_inv(n,n,n,n_element)         !< D^{-1}
+  real(RDP), intent(inout) :: r_f  (  n,n,N_FACES,n_element) !< r face values
+
+  real(RDP) :: v(n,n,n)
+  real(RDP) :: L_0I_S(n), L_pI_S(n)
+  integer   :: i,j,k,e
+
+  ! Small structure exploitation
+  L_0I_S = S_T_L_I0
+  L_pI_S = S_T_L_Ip
+
+  ! Loop over all elements
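+  ! Per element: gather the six face contributions into the eigenspace
+  ! temporary v, scale by D^{-1}, then scatter v back onto the six faces,
+  ! one coordinate direction at a time.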
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,D_inv,u_f,r_f)
+  !$acc loop private(v)
+  !$omp do private(i,j,k,e,v)
+  do e = 1, n_element
+    ! Gather contributions .....................................................
+    !$acc loop collapse(3)
+    do k = 1, n
+    do j = 1, n
+    do i = 1, n
+      v(i,j,k) = d(1,e) * S_T_L_I0(i) * u_f(  j,k,1,e)                         &
+               + d(1,e) * S_T_L_Ip(i) * u_f(  j,k,2,e)                         &
+               + d(2,e) * S_T_L_I0(j) * u_f(i,  k,3,e)                         &
+               + d(2,e) * S_T_L_Ip(j) * u_f(i,  k,4,e)                         &
+               + d(3,e) * S_T_L_I0(k) * u_f(i,j,  5,e)                         &
+               + d(3,e) * S_T_L_Ip(k) * u_f(i,j,  6,e)
+
+      v(i,j,k) = D_inv(i,j,k,e) * v(i,j,k)
+    end do
+    end do
+    end do
+    !$acc end loop
+    
+    ! Scatter contributions ....................................................
+    ! First direction
+    !$acc loop collapse(2)
+    do k = 1, n
+    do j = 1, n
+      do i = 1, n
+        r_f(  j,k,1,e) = r_f(  j,k,1,e) - d(1,e) * L_0I_S(i) * v(i,j,k)
+        r_f(  j,k,2,e) = r_f(  j,k,2,e) - d(1,e) * L_pI_S(i) * v(i,j,k)
+      end do
+    end do
+    end do
+
+    ! Second direction
+    !$acc loop collapse(2)
+    do k = 1, n
+    do i = 1, n
+      do j = 1, n
+        r_f(i,  k,3,e) = r_f(i,  k,3,e) - d(2,e) * L_0I_S(j) * v(i,j,k)
+        r_f(i,  k,4,e) = r_f(i,  k,4,e) - d(2,e) * L_pI_S(j) * v(i,j,k)
+      end do
+    end do
+    end do
+    !$acc end loop
+    
+    ! Third direction
+    do k = 1, n
+      !$acc loop collapse(2)
+      do j = 1, n
+      do i = 1, n
+        r_f(i,j,  5,e) = r_f(i,j,  5,e) - d(3,e) * L_0I_S(k) * v(i,j,k)
+        r_f(i,j,  6,e) = r_f(i,j,  6,e) - d(3,e) * L_pI_S(k) * v(i,j,k)
+      end do
+      end do
+      !$acc end loop
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine Operator
+
+!===============================================================================
+
+end module Transformed_Condensed_Part
diff --git a/Parser/test-data/specht/transformed_primary_part.f b/Parser/test-data/specht/transformed_primary_part.f
new file mode 100644
index 0000000..626ce4b
--- /dev/null
+++ b/Parser/test-data/specht/transformed_primary_part.f
@@ -0,0 +1,939 @@
+!> \file      transformed_primary_part.f
+!> \brief     Primary part for condensed face eigenspace solver
+!> \author    Immo Huismann
+!> \date      2015/05/20
+!> \copyright Institute of Fluid Mechanics, TU Dresden, 01062 Dresden, Germany
+!>
+!> \details
+!> This module provides the primary part for the condensed system eigenspace
+!> solver.
+!>
+!> \todo
+!> * The OpenACC queues could be used for latency hiding via task parallelism
+!===============================================================================
+
+module Transformed_Primary_Part
+  ! HiBASE
+  use Kind_Parameters,             only: RDP
+
+  ! Specht
+  use ACC_Parameters,              only: ACC_EXEC_QUEUE
+  use Condensed_SEM_Operators,     only: CondensedOperators1D
+  use Element_Boundary_Parameters, only: N_FACES, N_EDGES, N_VERTICES
+  implicit none
+  private
+
+  public :: AddTransformedPrimaryPart
+
+contains
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds the primary part to the result vector.
+!> \author  Immo Huismann
+!>
+!> \details
+!> * Assumes that d,u_f,u_e,u_v,r_f,r_e,r_v are on the accelerator
+
+subroutine AddTransformedPrimaryPart(cop,d,u_f,u_e,u_v,r_f,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop     !< 1D condensed operators
+  real(RDP),                   intent(in) :: d(0:,:) !< Helmholtz coefficients
+  real(RDP), intent(in)    :: u_f(:,:,:,:)           !< u values on faces
+  real(RDP), intent(in)    :: u_e(  :,:,:)           !< u values on edges
+  real(RDP), intent(in)    :: u_v(    :,:)           !< u values on vertices
+  real(RDP), intent(inout) :: r_f(:,:,:,:)           !< r values on faces
+  real(RDP), intent(inout) :: r_e(  :,:,:)           !< r values on edges
+  real(RDP), intent(inout) :: r_v(    :,:)           !< r values on vertices
+
+  ! Add the individual contributions
+  
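+  ! The argument spacing mirrors the coupling: faces feed faces and edges,
+  ! edges feed faces, edges, and vertices, vertices feed edges and vertices.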
+  call AddFaceContribution  (cop,d,u_f,        r_f,r_e    )
+  call AddEdgeContribution  (cop,d,    u_e,    r_f,r_e,r_v)
+  call AddVertexContribution(cop,d,        u_v,    r_e,r_v)
+
+end subroutine AddTransformedPrimaryPart
+
+!===============================================================================
+! Internals
+
+!-------------------------------------------------------------------------------
+!> \brief   Adds the contribution of the faces to the residual
+!> \author  Immo Huismann
+
+subroutine AddFaceContribution(cop,d,u_f,r_f,r_e)
+  class(CondensedOperators1D), intent(in) :: cop
+  real(RDP), intent(in)    :: d(0:,:)      !< metric coefficients  (0:3,   n_e)
+  real(RDP), intent(in)    :: u_f(:,:,:,:) !< values   on faces    (n,n,6, n_e)
+  real(RDP), intent(inout) :: r_f(:,:,:,:) !< residual on faces    (n,n,6, n_e)
+  real(RDP), intent(inout) :: r_e(  :,:,:) !< residual on edges    (  n,12,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+
+  n         = size(u_f,1)
+  n_element = size(u_f,4)
+
+  M_00 = cop%w(0)
+  L_00 = cop%L(0,0)
+  L_0p = cop%L(0,1+n)
+
+  call FaceToEdge(n,n_element,M_00,          cop%L_0I_S,cop%L_pI_S,d,u_f,r_e)
+  call FaceToFace(n,n_element,M_00,L_00,L_0p,cop%eig,              d,u_f,r_f)
+
+end subroutine AddFaceContribution
+
+!-------------------------------------------------------------------------------
+!> \brief   Performs edge to edge operations of primary part
+!> \author  Immo Huismann
+
+subroutine AddEdgeContribution(cop,d,u_e,r_f,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop
+  real(RDP), intent(in)    :: d(0:,:)      !< metric coefficients  (0:3,   n_e)
+  real(RDP), intent(in)    :: u_e(  :,:,:) !< values on the edges  (  n,12,n_e)
+  real(RDP), intent(inout) :: r_f(:,:,:,:) !< residual on faces    (n,n, 6,n_e)
+  real(RDP), intent(inout) :: r_e(  :,:,:) !< residual on edges    (  n,12,n_e)
+  real(RDP), intent(inout) :: r_v(    :,:) !< residual on vertices (     8,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+  real(RDP) :: L_0I   (size(cop%S,1)), L_pI   (size(cop%S,1))
+  real(RDP) :: L_0I_S (size(cop%S,1)), L_pI_S (size(cop%S,1))
+  real(RDP) :: L_I0   (size(cop%S,1)), L_Ip   (size(cop%S,1))
+  real(RDP) :: S_T_LI0(size(cop%S,1)), S_T_LIp(size(cop%S,1))
+  real(RDP) :: eig(size(cop%eig))
+
+  n         = size(u_e,1)
+  n_element = size(u_e,3)
+
+  M_00 = cop%w(0)
+  L_00 = cop%L(0,0)
+  L_0p = cop%L(0,1+n)
+  L_pI = cop%L(1+n,1:n)
+  L_0I = cop%L(0,  1:n)
+  L_Ip = cop%L(1:n,1+n)
+  L_I0 = cop%L(1:n,0  )
+
+  L_0I_S  = cop%L_0I_S
+  L_pI_S  = cop%L_pI_S
+  S_T_LI0 = cop%S_T_L_I0
+  S_T_LIp = cop%S_T_L_Ip
+
+  eig     = cop%eig
+
+  call EdgeToVertex(n,n_element,M_00,L_0I_S,L_pI_S,  d,u_e,r_v)
+  call EdgeToEdge  (n,n_element,M_00,L_00,L_0p,eig,  d,u_e,    r_e)
+  call EdgeToFace  (n,n_element,M_00,S_T_LI0,S_T_LIp,d,u_e,        r_f)
+
+end subroutine AddEdgeContribution
+
+!-------------------------------------------------------------------------------
+!> \brief   Add the vertices' contribution to the primary part
+!> \author  Immo Huismann
+
+subroutine AddVertexContribution(cop,d,u_v,r_e,r_v)
+  class(CondensedOperators1D), intent(in) :: cop !< condensed 1D operators
+  real(RDP), intent(in)    :: d(0:,:)      !< metric coefficients  (0:3,   n_e)
+  real(RDP), intent(in)    :: u_v(    :,:) !< values on vertices   (     8,n_e)
+  real(RDP), intent(inout) :: r_e(  :,:,:) !< residual on edges    (  n,12,n_e)
+  real(RDP), intent(inout) :: r_v(    :,:) !< residual on vertices (     8,n_e)
+
+  integer   :: n, n_element
+  real(RDP) :: M_00, L_00, L_0p
+  real(RDP) :: S_T_L_I0(size(cop%S,1))
+  real(RDP) :: S_T_L_Ip(size(cop%S,1))
+
+  n         = size(r_e,1)
+  n_element = size(r_e,3)
+
+  M_00     = cop%w(0)
+  L_00     = cop%L(0,0)
+  L_0p     = cop%L(0,1+n)
+
+  S_T_L_I0 = cop%S_T_L_I0
+  S_T_L_Ip = cop%S_T_L_Ip
+
+  call VertexToVertex(n_element,M_00,L_00,L_0p,        d,u_v,r_v)
+  call VertexToEdge(n,n_element,M_00,S_T_L_I0,S_T_L_Ip,d,u_v,    r_e)
+
+end subroutine AddVertexContribution
+
+!===============================================================================
+! Particular terms
+
+!-------------------------------------------------------------------------------
+!> \brief   Face to face term
+!> \author  Immo Huismann
+
+subroutine FaceToFace(n,n_element,M_00,L_00,L_0p,eig,d,u_f,r_f)
+  integer,   intent(in) :: n                !< points per edge
+  integer,   intent(in) :: n_element        !< number of elements
+  real(RDP), intent(in) :: M_00             !< M_{00}
+  real(RDP), intent(in) :: L_00             !< L_{00}
+  real(RDP), intent(in) :: L_0p             !< L_{0p}
+  real(RDP), intent(in) :: eig(n)           !< gen. eigenvalues of L_{II}
+  real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients
+
+  real(RDP), intent(in)    :: u_f(n,n,N_FACES,n_element) !< values on faces
+  real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< result on faces
+
+  integer :: i,j,e
+
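+  ! Face numbering convention, as in the loop labels below: faces 1/2 are
+  ! normal to x_1, faces 3/4 to x_2, faces 5/6 to x_3; each block couples
+  ! one such face pair via L_0p.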
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_f)
+  !$acc loop
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+    !...........................................................................
+    ! x_1 faces
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+      r_f(i,j,1,e) =                                    r_f(i,j,1,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,1,e)           &
+          +           d(1,e) * L_00                   * u_f(i,j,1,e)           &
+          +           d(2,e) * M_00 * eig(i)          * u_f(i,j,1,e)           &
+          +           d(3,e) * M_00 *          eig(j) * u_f(i,j,1,e)           &
+          +           d(1,e) * L_0p                   * u_f(i,j,2,e)
+
+      r_f(i,j,2,e) =                                    r_f(i,j,2,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,2,e)           &
+          +           d(1,e) * L_00                   * u_f(i,j,2,e)           &
+          +           d(2,e) * M_00 * eig(i)          * u_f(i,j,2,e)           &
+          +           d(3,e) * M_00 *          eig(j) * u_f(i,j,2,e)           &
+          +           d(1,e) * L_0p                   * u_f(i,j,1,e)
+    end do
+    end do
+    !$acc end loop
+    
+    !...........................................................................
+    ! x_2 faces
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+      r_f(i,j,3,e) =                                    r_f(i,j,3,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,3,e)           &
+          +           d(2,e) * L_00                   * u_f(i,j,3,e)           &
+          +           d(1,e) * M_00 * eig(i)          * u_f(i,j,3,e)           &
+          +           d(3,e) * M_00 *          eig(j) * u_f(i,j,3,e)           &
+          +           d(2,e) * L_0p                   * u_f(i,j,4,e)
+
+      r_f(i,j,4,e) =                                    r_f(i,j,4,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,4,e)           &
+          +           d(2,e) * L_00                   * u_f(i,j,4,e)           &
+          +           d(1,e) * M_00 * eig(i)          * u_f(i,j,4,e)           &
+          +           d(3,e) * M_00 *          eig(j) * u_f(i,j,4,e)           &
+          +           d(2,e) * L_0p                   * u_f(i,j,3,e)
+    end do
+    end do
+    !$acc end loop
+    
+    !...........................................................................
+    ! x_3 faces
+    !$acc loop collapse(2)
+    do j = 1, n
+    do i = 1, n
+      r_f(i,j,5,e) =                                    r_f(i,j,5,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,5,e)           &
+          +           d(3,e) * L_00                   * u_f(i,j,5,e)           &
+          +           d(1,e) * M_00 * eig(i)          * u_f(i,j,5,e)           &
+          +           d(2,e) * M_00 *          eig(j) * u_f(i,j,5,e)           &
+          +           d(3,e) * L_0p                   * u_f(i,j,6,e)
+
+      r_f(i,j,6,e) =                                    r_f(i,j,6,e)           &
+          +           d(0,e) * M_00                   * u_f(i,j,6,e)           &
+          +           d(3,e) * L_00                   * u_f(i,j,6,e)           &
+          +           d(1,e) * M_00 * eig(i)          * u_f(i,j,6,e)           &
+          +           d(2,e) * M_00 *          eig(j) * u_f(i,j,6,e)           &
+          +           d(3,e) * L_0p                   * u_f(i,j,5,e)
+    end do
+    end do
+    !$acc end loop
+    
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine FaceToFace
+
+!-------------------------------------------------------------------------------
+!> \brief   Face to edge term
+!> \author  Immo Huismann
+
+subroutine FaceToEdge(n,n_element,M_00,L_0I_S,L_pI_S,d,u_f,r_e)
+  integer,   intent(in) :: n                !< points per edge
+  integer,   intent(in) :: n_element        !< number of elements
+  real(RDP), intent(in) :: M_00             !< M_{00}
+  real(RDP), intent(in) :: L_0I_S(n)        !< S^T L_{I0}
+  real(RDP), intent(in) :: L_pI_S(n)        !< S^T L_{Ip}
+  real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients
+
+  real(RDP), intent(in)    :: u_f(n,n,N_FACES,n_element) !< values on faces
+  real(RDP), intent(inout) :: r_e(  n,N_EDGES,n_element) !< result on edges
+
+  real(RDP) :: M_00_L_0I_S(n) !< M_00 S^T L_{I0}
+  real(RDP) :: M_00_L_pI_S(n) !< M_00 S^T L_{Ip}
+
+  integer :: i,j,e
+
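+  ! Fold the constant face mass factor M_00 into the 1D operators once,
+  ! saving one multiplication per point in the loops below.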
+  M_00_L_0I_S = M_00 * L_0I_S
+  M_00_L_pI_S = M_00 * L_pI_S
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_f,r_e)
+  !$acc loop collapse(2)
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+  do i = 1, n
+    do j = 1, n
+  
+      r_e(i, 1,e) =                             r_e(i  , 1,e)                    &
+          +           d(2,e) * M_00_L_0I_S(j) * u_f(i,j, 5,e)                    &
+          +           d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 3,e)
+  
+      r_e(i, 2,e) =                             r_e(i  , 2,e)                    &
+          +           d(2,e) * M_00_L_pI_S(j) * u_f(i,j, 5,e)                    &
+          +           d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 4,e)
+  
+      r_e(i, 3,e) =                             r_e(i  , 3,e)                    &
+          +           d(2,e) * M_00_L_0I_S(j) * u_f(i,j, 6,e)                    &
+          +           d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 3,e)
+  
+      r_e(i, 4,e) =                             r_e(i  , 4,e)                    &
+          +           d(2,e) * M_00_L_pI_S(j) * u_f(i,j, 6,e)                    &
+          +           d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 4,e)
+  
+      r_e(i, 5,e) =                             r_e(i  , 5,e)                    &
+          +           d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 1,e)                    &
+          +           d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 5,e)
+  
+      r_e(i, 6,e) =                             r_e(i  , 6,e)                    &
+          +           d(3,e) * M_00_L_0I_S(j) * u_f(i,j, 2,e)                    &
+          +           d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 5,e)
+  
+      r_e(i, 7,e) =                             r_e(i  , 7,e)                    &
+          +           d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 1,e)                    &
+          +           d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 6,e)
+  
+      r_e(i, 8,e) =                             r_e(i  , 8,e)                    &
+          +           d(3,e) * M_00_L_pI_S(j) * u_f(i,j, 2,e)                    &
+          +           d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 6,e)
+  
+      r_e(i, 9,e) =                             r_e(i  , 9,e)                    &
+          +           d(2,e) * M_00_L_0I_S(j) * u_f(j,i, 1,e)                    &
+          +           d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 3,e)
+  
+      r_e(i,10,e) =                             r_e(i  ,10,e)                    &
+          +           d(2,e) * M_00_L_0I_S(j) * u_f(j,i, 2,e)                    &
+          +           d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 3,e)
+  
+      r_e(i,11,e) =                             r_e(i  ,11,e)                    &
+          +           d(2,e) * M_00_L_pI_S(j) * u_f(j,i, 1,e)                    &
+          +           d(1,e) * M_00_L_0I_S(j) * u_f(j,i, 4,e)
+  
+      r_e(i,12,e) =                             r_e(i  ,12,e)                    &
+          +           d(2,e) * M_00_L_pI_S(j) * u_f(j,i, 2,e)                    &
+          +           d(1,e) * M_00_L_pI_S(j) * u_f(j,i, 4,e)
+  
+    end do
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine FaceToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Primary part for edge -> face
+!> \author  Immo Huismann
+
+subroutine EdgeToFace(n,n_element,M_00,S_T_LI0,S_T_LIp,d,u_e,r_f)
+  integer,   intent(in) :: n                !< points per edge
+  integer,   intent(in) :: n_element        !< number of elements
+  real(RDP), intent(in) :: M_00             !< M_{00}
+  real(RDP), intent(in) :: S_T_LI0(n)       !< S^T L_{I0}
+  real(RDP), intent(in) :: S_T_LIp(n)       !< S^T L_{Ip}
+  real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficients
+
+  real(RDP), intent(in)    :: u_e(  n,N_EDGES,n_element) !< values on edges
+  real(RDP), intent(inout) :: r_f(n,n,N_FACES,n_element) !< result on faces
+
+  real(RDP) :: M_00_S_T_LI0(n) !< M_00 S^T L_{I0}
+  real(RDP) :: M_00_S_T_LIp(n) !< M_00 S^T L_{Ip}
+
+  integer :: i,j,e
+
+  M_00_S_T_LI0 = M_00 * S_T_LI0
+  M_00_S_T_LIp = M_00 * S_T_LIp
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_f)
+  !$acc loop collapse(3)
+  !$omp do private(i,j,e)
+  do e = 1, n_element
+  do j = 1, n
+  do i = 1, n
+
+    r_f(i,j,1,e) =                               r_f(i,j, 1,e)                 &
+        +           d(3,e) * M_00_S_T_LI0(  j) * u_e(i,   5,e)                 &
+        +           d(3,e) * M_00_S_T_LIp(  j) * u_e(i,   7,e)                 &
+        +           d(2,e) * M_00_S_T_LI0(i  ) * u_e(  j, 9,e)                 &
+        +           d(2,e) * M_00_S_T_LIp(i  ) * u_e(  j,11,e)
+
+    r_f(i,j,2,e) =                               r_f(i,j, 2,e)                 &
+        +           d(3,e) * M_00_S_T_LI0(  j) * u_e(i,   6,e)                 &
+        +           d(3,e) * M_00_S_T_LIp(  j) * u_e(i,   8,e)                 &
+        +           d(2,e) * M_00_S_T_LI0(i  ) * u_e(  j,10,e)                 &
+        +           d(2,e) * M_00_S_T_LIp(i  ) * u_e(  j,12,e)
+
+    r_f(i,j,3,e) =                               r_f(i,j, 3,e)                 &
+        +           d(3,e) * M_00_S_T_LI0(  j) * u_e(i,   1,e)                 &
+        +           d(3,e) * M_00_S_T_LIp(  j) * u_e(i,   3,e)                 &
+        +           d(1,e) * M_00_S_T_LI0(i  ) * u_e(  j, 9,e)                 &
+        +           d(1,e) * M_00_S_T_LIp(i  ) * u_e(  j,10,e)
+
+    r_f(i,j,4,e) =                               r_f(i,j, 4,e)                 &
+        +           d(3,e) * M_00_S_T_LI0(  j) * u_e(i,   2,e)                 &
+        +           d(3,e) * M_00_S_T_LIp(  j) * u_e(i,   4,e)                 &
+        +           d(1,e) * M_00_S_T_LI0(i  ) * u_e(  j,11,e)                 &
+        +           d(1,e) * M_00_S_T_LIp(i  ) * u_e(  j,12,e)
+
+    r_f(i,j,5,e) =                               r_f(i,j, 5,e)                 &
+        +           d(2,e) * M_00_S_T_LI0(  j) * u_e(i,   1,e)                 &
+        +           d(2,e) * M_00_S_T_LIp(  j) * u_e(i,   2,e)                 &
+        +           d(1,e) * M_00_S_T_LI0(i  ) * u_e(  j, 5,e)                 &
+        +           d(1,e) * M_00_S_T_LIp(i  ) * u_e(  j, 6,e)
+
+    r_f(i,j,6,e) =                               r_f(i,j, 6,e)                 &
+        +           d(2,e) * M_00_S_T_LI0(  j) * u_e(i,   3,e)                 &
+        +           d(2,e) * M_00_S_T_LIp(  j) * u_e(i,   4,e)                 &
+        +           d(1,e) * M_00_S_T_LI0(i  ) * u_e(  j, 7,e)                 &
+        +           d(1,e) * M_00_S_T_LIp(i  ) * u_e(  j, 8,e)
+
+  end do
+  end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine EdgeToFace
+
+!-------------------------------------------------------------------------------
+!> \brief   Edge to edge interaction
+!> \author  Immo Huismann
+
+subroutine EdgeToEdge(n,n_element,M_00,L_00,L_0p,eig,d,u_e,r_e)
+  integer,   intent(in) :: n              !< points per edge
+  integer,   intent(in) :: n_element      !< number of elements
+  real(RDP), intent(in) :: d(0:3,n_element) !< metric coefficient * lambda
+  real(RDP), intent(in) :: M_00           !< M_{00}
+  real(RDP), intent(in) :: L_00           !< L_{00}
+  real(RDP), intent(in) :: L_0p           !< L_{0p}
+  real(RDP), intent(in) :: eig(n)         !< eigenvalues of L_II
+
+  real(RDP), intent(in)    :: u_e(n,N_EDGES,n_element) !< values on edges
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,n_element) !< result on edges
+
+  integer :: i, e
+
+  real(RDP) :: f_0, f_1, f_2
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_e)
+  !$acc loop gang worker vector
+  !$omp do private(i,e,f_0,f_1,f_2)
+  do e = 1, n_element
+    !...........................................................................
+    ! x_1 edges
+    !$acc loop vector
+    do i = 1, n
+
+      f_0 =  d(0,e) * M_00 * M_00           & ! lambda * M * M * M
+          +  d(1,e) * M_00 * M_00  * eig(i) & !          M * M * L
+          +  d(2,e) * L_00 * M_00           & !          M * L * M
+          +  d(3,e) * M_00 * L_00             !          L * M * M
+
+      f_1 =  d(2,e) * M_00 * L_0p             !          M * L * M
+      f_2 =  d(3,e) * M_00 * L_0p             !          L * M * M
+
+      r_e(i, 1,e) =      r_e(i, 1,e)                                              &
+          + f_0 *        u_e(i, 1,e)                                              &
+          + f_1 *        u_e(i, 2,e)                                              &
+          + f_2 *        u_e(i, 3,e)
+
+      r_e(i, 2,e) =      r_e(i, 2,e)                                              &
+          + f_0 *        u_e(i, 2,e)                                              &
+          + f_1 *        u_e(i, 1,e)                                              &
+          + f_2 *        u_e(i, 4,e)
+
+      r_e(i, 3,e) =      r_e(i, 3,e)                                              &
+          + f_0 *        u_e(i, 3,e)                                              &
+          + f_1 *        u_e(i, 4,e)                                              &
+          + f_2 *        u_e(i, 1,e)
+
+      r_e(i, 4,e) =      r_e(i, 4,e)                                              &
+          +        f_0 * u_e(i, 4,e)                                              &
+          +        f_1 * u_e(i, 3,e)                                              &
+          +        f_2 * u_e(i, 2,e)
+    end do
+    !$acc end loop
+    
+    !.............................................................................
+    ! x_2 edges
+    !$acc loop vector
+    do i = 1, n
+      f_0 =  d(0,e) * M_00 * M_00           & ! lambda * M * M * M
+          +  d(2,e) * M_00 * M_00  * eig(i) & !          M * M * L
+          +  d(1,e) * L_00 * M_00           & !          M * L * M
+          +  d(3,e) * M_00 * L_00             !          L * M * M
+
+      f_1 =  d(1,e) * M_00 * L_0p             !          M * L * M
+      f_2 =  d(3,e) * M_00 * L_0p             !          L * M * M
+
+      r_e(i, 5,e) =      r_e(i, 5,e)                                              &
+          + f_0 *        u_e(i, 5,e)                                              &
+          + f_1 *        u_e(i, 6,e)                                              &
+          + f_2 *        u_e(i, 7,e)
+
+      r_e(i, 6,e) =      r_e(i, 6,e)                                              &
+          + f_0 *        u_e(i, 6,e)                                              &
+          + f_1 *        u_e(i, 5,e)                                              &
+          + f_2 *        u_e(i, 8,e)
+
+      r_e(i, 7,e) =      r_e(i, 7,e)                                              &
+          + f_0 *        u_e(i, 7,e)                                              &
+          + f_1 *        u_e(i, 8,e)                                              &
+          + f_2 *        u_e(i, 5,e)
+
+      r_e(i, 8,e) =      r_e(i, 8,e)                                              &
+          +        f_0 * u_e(i, 8,e)                                              &
+          +        f_1 * u_e(i, 7,e)                                              &
+          +        f_2 * u_e(i, 6,e)
+    end do
+    !$acc end loop
+    
+    !.............................................................................
+    ! x_3 edges
+    !$acc loop vector
+    do i = 1, n
+      f_0 =  d(0,e) * M_00 * M_00           & ! lambda * M * M * M
+          +  d(3,e) * M_00 * M_00  * eig(i) & !          M * M * L
+          +  d(1,e) * L_00 * M_00           & !          M * L * M
+          +  d(2,e) * M_00 * L_00             !          L * M * M
+
+      f_1 =  d(1,e) * M_00 * L_0p             !          M * L * M
+      f_2 =  d(2,e) * M_00 * L_0p             !          L * M * M
+
+      r_e(i, 9,e) =      r_e(i, 9,e)                                              &
+          + f_0 *        u_e(i, 9,e)                                              &
+          + f_1 *        u_e(i,10,e)                                              &
+          + f_2 *        u_e(i,11,e)
+
+      r_e(i,10,e) =      r_e(i,10,e)                                              &
+          + f_0 *        u_e(i,10,e)                                              &
+          + f_1 *        u_e(i, 9,e)                                              &
+          + f_2 *        u_e(i,12,e)
+
+      r_e(i,11,e) =      r_e(i,11,e)                                              &
+          + f_0 *        u_e(i,11,e)                                              &
+          + f_1 *        u_e(i,12,e)                                              &
+          + f_2 *        u_e(i, 9,e)
+
+      r_e(i,12,e) =      r_e(i,12,e)                                              &
+          +        f_0 * u_e(i,12,e)                                              &
+          +        f_1 * u_e(i,11,e)                                              &
+          +        f_2 * u_e(i,10,e)
+
+    end do
+    !$acc end loop
+
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine EdgeToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Edge to vertex interaction
+!> \author  Immo Huismann
+
+subroutine EdgeToVertex(n,n_element,M_00,L_0I_S,L_pI_S,d,u_e,r_v)
+  integer,   intent(in)    :: n
+  integer,   intent(in)    :: n_element
+  real(RDP), intent(in)    :: M_00
+  real(RDP), intent(in)    :: L_0I_S(n)
+  real(RDP), intent(in)    :: L_pI_S(n)
+  real(RDP), intent(in)    :: d(0:3,           n_element)
+  real(RDP), intent(in)    :: u_e(n,N_EDGES,   n_element)
+  real(RDP), intent(inout) :: r_v(  N_VERTICES,n_element)
+
+  integer   :: i, e
+  real(RDP) :: tmp_1,tmp_2, factor
+
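+  ! Each edge contributes to two vertices: a reduction with L_0I_S for the
+  ! near vertex and one with L_pI_S for the far one, scaled by M_00^2 and
+  ! the direction's metric coefficient.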
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_e,r_v)
+  !$acc loop gang worker vector
+  !$omp do private(i,e,tmp_1,tmp_2,factor)
+  do e = 1, n_element
+
+    !---------------------------------------------------------------------------
+    ! x_1 edges
+
+    factor = M_00 * M_00 * d(1,e)
+
+    !...........................................................................
+    ! first one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    !$acc loop vector reduction(+:tmp_1,tmp_2)
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 1,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 1,e)
+    end do
+    !$acc end loop
+    
+    r_v(1,e) = r_v(1,e) + tmp_1 * factor
+    r_v(2,e) = r_v(2,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! second one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 2,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 2,e)
+    end do
+
+    r_v(3,e) = r_v(3,e) + tmp_1 * factor
+    r_v(4,e) = r_v(4,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! third one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 3,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 3,e)
+    end do
+
+    r_v(5,e) = r_v(5,e) + tmp_1 * factor
+    r_v(6,e) = r_v(6,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! fourth one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 4,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 4,e)
+    end do
+
+    r_v(7,e) = r_v(7,e) + tmp_1 * factor
+    r_v(8,e) = r_v(8,e) + tmp_2 * factor
+
+
+    !---------------------------------------------------------------------------
+    ! x_2 edges
+
+    factor = M_00 * M_00 * d(2,e)
+
+    !...........................................................................
+    ! first one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 5,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 5,e)
+    end do
+
+    r_v(1,e) = r_v(1,e) + tmp_1 * factor
+    r_v(3,e) = r_v(3,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! second one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 6,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 6,e)
+    end do
+
+    r_v(2,e) = r_v(2,e) + tmp_1 * factor
+    r_v(4,e) = r_v(4,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! third one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 7,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 7,e)
+    end do
+
+    r_v(5,e) = r_v(5,e) + tmp_1 * factor
+    r_v(7,e) = r_v(7,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! fourth one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 8,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 8,e)
+    end do
+
+    r_v(6,e) = r_v(6,e) + tmp_1 * factor
+    r_v(8,e) = r_v(8,e) + tmp_2 * factor
+
+    !---------------------------------------------------------------------------
+    ! x_3 edges
+
+    factor = M_00 * M_00 * d(3,e)
+
+    !...........................................................................
+    ! first one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i, 9,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i, 9,e)
+    end do
+
+    r_v(1,e) = r_v(1,e) + tmp_1 * factor
+    r_v(5,e) = r_v(5,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! second one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,10,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,10,e)
+    end do
+
+    r_v(2,e) = r_v(2,e) + tmp_1 * factor
+    r_v(6,e) = r_v(6,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! third one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,11,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,11,e)
+    end do
+
+    r_v(3,e) = r_v(3,e) + tmp_1 * factor
+    r_v(7,e) = r_v(7,e) + tmp_2 * factor
+
+    !...........................................................................
+    ! fourth one
+    tmp_1  = 0
+    tmp_2  = 0
+
+    do i = 1, n
+      tmp_1 = tmp_1 + L_0I_S(i) * u_e(i,12,e)
+      tmp_2 = tmp_2 + L_pI_S(i) * u_e(i,12,e)
+    end do
+
+    r_v(4,e) = r_v(4,e) + tmp_1 * factor
+    r_v(8,e) = r_v(8,e) + tmp_2 * factor
+
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine EdgeToVertex
+
+!-------------------------------------------------------------------------------
+!> \brief   Vertex to edge interaction
+!> \author  Immo Huismann
+
+subroutine VertexToEdge(n,n_element,M_00,S_T_L_I0,S_T_L_Ip,d,u_v,r_e)
+  integer,   intent(in)    :: n
+  integer,   intent(in)    :: n_element
+  real(RDP), intent(in)    :: M_00
+  real(RDP), intent(in)    :: S_T_L_I0(n)
+  real(RDP), intent(in)    :: S_T_L_Ip(n)
+  real(RDP), intent(in)    :: d(0:3,           n_element)
+  real(RDP), intent(in)    :: u_v(  N_VERTICES,n_element)
+  real(RDP), intent(inout) :: r_e(n,N_EDGES,   n_element)
+
+  integer   :: i, e
+  real(RDP) :: tmp, factor
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_e)
+  !$acc loop
+  !$omp do private(i,e,tmp,factor)
+  do e = 1, n_element
+    !...........................................................................
+    ! x_1 edges
+    factor = M_00 * M_00 * d(1,e)
+
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(2,e)
+      r_e(i, 1,e) = r_e(i, 1,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(3,e) + S_T_L_Ip(i) * u_v(4,e)
+      r_e(i, 2,e) = r_e(i, 2,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(5,e) + S_T_L_Ip(i) * u_v(6,e)
+      r_e(i, 3,e) = r_e(i, 3,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(7,e) + S_T_L_Ip(i) * u_v(8,e)
+      r_e(i, 4,e) = r_e(i, 4,e) + factor * tmp
+    end do
+
+    !...........................................................................
+    ! x_2 edges
+    factor = M_00 * M_00 * d(2,e)
+
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(3,e)
+      r_e(i, 5,e) = r_e(i, 5,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(2,e) + S_T_L_Ip(i) * u_v(4,e)
+      r_e(i, 6,e) = r_e(i, 6,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(5,e) + S_T_L_Ip(i) * u_v(7,e)
+      r_e(i, 7,e) = r_e(i, 7,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(6,e) + S_T_L_Ip(i) * u_v(8,e)
+      r_e(i, 8,e) = r_e(i, 8,e) + factor * tmp
+    end do
+
+    !...........................................................................
+    ! x_3 edges
+    factor = M_00 * M_00 * d(3,e)
+
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(1,e) + S_T_L_Ip(i) * u_v(5,e)
+      r_e(i, 9,e) = r_e(i, 9,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(2,e) + S_T_L_Ip(i) * u_v(6,e)
+      r_e(i,10,e) = r_e(i,10,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(3,e) + S_T_L_Ip(i) * u_v(7,e)
+      r_e(i,11,e) = r_e(i,11,e) + factor * tmp
+    end do
+    do i = 1, n
+      tmp         = S_T_L_I0(i) * u_v(4,e) + S_T_L_Ip(i) * u_v(8,e)
+      r_e(i,12,e) = r_e(i,12,e) + factor * tmp
+    end do
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine VertexToEdge
+
+!-------------------------------------------------------------------------------
+!> \brief   Primary part vertex to vertex in transformed condensed system
+!> \author  Immo Huismann
+
+subroutine VertexToVertex(n_element,M_00,L_00,L_0p,d,u_v,r_v)
+  integer,   intent(in)    :: n_element
+  real(RDP), intent(in)    :: M_00
+  real(RDP), intent(in)    :: L_00
+  real(RDP), intent(in)    :: L_0p
+  real(RDP), intent(in)    :: d  (0:3,n_element)
+  real(RDP), intent(in)    :: u_v(N_VERTICES,n_element) !< values on vertices
+  real(RDP), intent(inout) :: r_v(N_VERTICES,n_element) !< result on vertices
+
+  integer   :: e
+  real(RDP) :: M_00_M_00_M_00
+  real(RDP) :: L_00_M_00_M_00
+  real(RDP) :: L_0p_M_00_M_00
+
+  M_00_M_00_M_00 = M_00 * M_00 * M_00
+  L_00_M_00_M_00 = L_00 * M_00 * M_00
+  L_0p_M_00_M_00 = L_0p * M_00 * M_00
+
+  !$acc parallel async(ACC_EXEC_QUEUE) present(d,u_v,r_v)
+  !$acc loop
+  !$omp do private(e)
+  do e = 1, n_element
+
+    r_v(1,e) = r_v(1,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(1,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(1,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(2,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(3,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(5,e)
+
+    r_v(2,e) = r_v(2,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(2,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(2,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(1,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(4,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(6,e)
+
+    r_v(3,e) = r_v(3,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(3,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(3,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(4,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(1,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(7,e)
+
+    r_v(4,e) = r_v(4,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(4,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(4,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(3,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(2,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(8,e)
+
+    r_v(5,e) = r_v(5,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(5,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(5,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(6,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(7,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(1,e)
+
+    r_v(6,e) = r_v(6,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(6,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(6,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(5,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(8,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(2,e)
+
+    r_v(7,e) = r_v(7,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(7,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(7,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(8,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(5,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(3,e)
+
+    r_v(8,e) = r_v(8,e)                                                        &
+        + M_00_M_00_M_00 *  d(0,e)                    * u_v(8,e)               &
+        + L_00_M_00_M_00 * (d(1,e) + d(2,e) + d(3,e)) * u_v(8,e)               &
+        + L_0p_M_00_M_00 *  d(1,e)                    * u_v(7,e)               &
+        + L_0p_M_00_M_00 *  d(2,e)                    * u_v(6,e)               &
+        + L_0p_M_00_M_00 *  d(3,e)                    * u_v(4,e)
+
+  end do
+  !$omp end do
+  !$acc end loop
+  !$acc end parallel
+
+end subroutine VertexToVertex
+
+!===============================================================================
+
+end module Transformed_Primary_Part
diff --git a/Parser/test/org/tud/forty/test/FragmentTest.java b/Parser/test/org/tud/forty/test/FragmentTest.java
index 5512267..30f4569 100644
--- a/Parser/test/org/tud/forty/test/FragmentTest.java
+++ b/Parser/test/org/tud/forty/test/FragmentTest.java
@@ -2,6 +2,8 @@ package org.tud.forty.test;
 
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
+import org.tud.forty.ast.ExecutableConstruct;
+import org.tud.forty.ast.ExecutionPartConstruct;
 import org.tud.forty.ast.Expr;
 
 import java.io.File;
@@ -14,9 +16,24 @@ public class FragmentTest extends TestBase {
         return ruleProvider("test-data/fragments/Expr");
     }
 
+    @DataProvider(name = "executableconstruct")
+    public static Iterator<Object[]> fortranExecutableConstructProvider() {
+        return ruleProvider("test-data/fragments/ExecutableConstruct");
+    }
+
     @Test(dataProvider = "exprs")
     public void testFragmentExprParser(File f) throws Exception {
         testParse(f, false, true, false, true, Expr.class);
     }
 
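+    // The trailing 'true' enables fragment trimming (see TestBase.trimCode),
+    // so each indented fragment is dedented before being parsed, first as an
+    // ExecutableConstruct and then as an ExecutionPartConstruct.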
+    @Test(dataProvider = "executableconstruct")
+    public void testFragmentExecutableConstructParser(File f) throws Exception {
+        testParse(f, false, true, false, true, ExecutableConstruct.class, true);
+    }
+
+    @Test(dataProvider = "executableconstruct")
+    public void testFragmentExecutionPartConstructParser(File f) throws Exception {
+        testParse(f, false, true, false, true, ExecutionPartConstruct.class, true);
+    }
+
 }
diff --git a/Parser/test/org/tud/forty/test/NASTest.java b/Parser/test/org/tud/forty/test/NASTest.java
new file mode 100644
index 0000000..3349276
--- /dev/null
+++ b/Parser/test/org/tud/forty/test/NASTest.java
@@ -0,0 +1,37 @@
+package org.tud.forty.test;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import org.tud.forty.ast.Root;
+
+import java.io.File;
+import java.util.Iterator;
+
+public class NASTest extends TestBase {
+
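+    // Provides the NAS benchmark sources (the FT kernels) checked in under
+    // test-data/nas.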
+    @DataProvider(name = "nas")
+    public static Iterator<Object[]> fortranNasProvider() {
+        return ruleProvider("test-data/nas");
+    }
+
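+    // One test per pipeline stage: parse and dump the AST, pretty-print, and
+    // compare the printed output against the original source, once ignoring
+    // spacing differences and once requiring an exact match.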
+    @Test(dataProvider = "nas", groups = {"parser"})
+    public void testNASParser(File f) throws Exception {
+        testParse(f, false, true, false, true, Root.class);
+    }
+
+    @Test(dataProvider = "nas", groups = {"prettyprinter"})
+    public void testNASPrettyPrinter(File f) throws Exception {
+        testParse(f, true, false, false, true, Root.class);
+    }
+
+    @Test(dataProvider = "nas", groups = {"compare"})
+    public void testCompareNASPrettyPrinter(File f) throws Exception {
+        testParse(f, false, false, true, true, Root.class);
+    }
+
+    @Test(dataProvider = "nas", groups = {"compare"})
+    public void testCompareNASPrettyPrinterWithSpaces(File f) throws Exception {
+        testParse(f, false, false, true, false, Root.class);
+    }
+}
diff --git a/Parser/test/org/tud/forty/test/SlotTest.java b/Parser/test/org/tud/forty/test/SlotTest.java
index ccf9d0d..228ce73 100644
--- a/Parser/test/org/tud/forty/test/SlotTest.java
+++ b/Parser/test/org/tud/forty/test/SlotTest.java
@@ -38,4 +38,5 @@ public class SlotTest extends TestBase {
     public void testSlotsName(File f) throws Exception {
         testParse(f, false, false, true, false, Root.class);
     }
+
 }
diff --git a/Parser/test/org/tud/forty/test/SpechtTest.java b/Parser/test/org/tud/forty/test/SpechtTest.java
new file mode 100644
index 0000000..1bc787b
--- /dev/null
+++ b/Parser/test/org/tud/forty/test/SpechtTest.java
@@ -0,0 +1,42 @@
+package org.tud.forty.test;
+
+import java.io.File;
+import java.util.Iterator;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import org.tud.forty.ast.Root;
+
+public class SpechtTest extends TestBase {
+
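+    // Provides the Specht test sources checked in under test-data/specht.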
+    @DataProvider(name = "specht")
+    public static Iterator<Object[]> fortranSpechtProvider() {
+        return ruleProvider("test-data/specht");
+    }
+
+    @Test(dataProvider = "specht", groups = {"parser"})
+    public void testSpechtParser(File f) throws Exception {
+        testParse(f, false, true, false, true, Root.class);
+    }
+
+    @Test(dataProvider = "specht", groups = {"prettyprinter"})
+    public void testSpechtPrettyPrinter(File f) throws Exception {
+        testParse(f, true, false, false, true, Root.class);
+    }
+
+    @Test(dataProvider = "specht", groups = {"compare"})
+    public void testCompareSpechtPrettyPrinter(File f) throws Exception {
+        testParse(f, false, false, true, true, Root.class);
+    }
+
+    @Test(dataProvider = "specht", groups = {"compare"})
+    public void testCompareSpechtPrettyPrinterWithSpaces(File f) throws Exception {
+        testParse(f, false, false, true, false, Root.class);
+    }
+
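+    // Round-trip check: parse, pretty-print, re-parse the printed output and
+    // print again; both printed forms must be identical.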
+    @Test(dataProvider = "specht", groups = {"compare"})
+    public void testParsePrintParseSpecht(File f) throws Exception {
+        testParsePrintParse(f);
+    }
+
+}
diff --git a/Parser/test/org/tud/forty/test/TestBase.java b/Parser/test/org/tud/forty/test/TestBase.java
index f9fe437..e5d96cf 100644
--- a/Parser/test/org/tud/forty/test/TestBase.java
+++ b/Parser/test/org/tud/forty/test/TestBase.java
@@ -1,16 +1,11 @@
 package org.tud.forty.test;
 
 import org.testng.Assert;
-import org.tud.forty.ast.ASTNode;
-import org.tud.forty.ast.Expr;
-import org.tud.forty.ast.PrettyPrinter;
-import org.tud.forty.ast.Root;
+import org.tud.forty.ast.*;
 import org.tud.forty.parser.SlottableFortranParser;
 import xtc.parser.Result;
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
+import java.io.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -59,42 +54,120 @@ public class TestBase {
         System.out.println(sb.toString());
     }
 
-    protected void testParse(File f, boolean pp, boolean ast, boolean compare, boolean ignore_spaces, Class clazz) throws Exception {
+    private String trimCode(String code) {
+        // The fragment's last line is expected to contain only its base
+        // indentation; strip that prefix plus one extra level (two spaces)
+        // from the start of every line.
+        String[] lines = code.split("\n");
+        String lastLine = lines[lines.length - 1];
+        code = code.replaceAll("\n" + lastLine + "  ", "\n");
+        // Drop a leading blank line and trailing blanks, then normalize to a
+        // single trailing newline.
+        code = code.replaceAll("^\\p{Blank}*\n", "");
+        code = code.replaceAll("\n\\p{Blank}*$", "");
+        return code.trim() + "\n";
+    }
+
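+    // Asserts that a test file is already free of continuation line breaks,
+    // i.e. that removeLineBreaks() leaves its content unchanged.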
+    protected void testSanitize(File f) throws Exception {
         Assert.assertTrue(f.exists());
 
-        FileReader reader = new FileReader(f);
+        FileReader t = new FileReader(f);
+        StringBuilder sb = new StringBuilder();
+        int i;
+        while ((i = t.read()) != -1) {
+            sb.append((char) i);
+        }
+        t.close();
+        Assert.assertEquals(sb.toString(), SlottableFortranParser.removeLineBreaks(sb.toString()));
+
+    }
+
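+    // Convenience overload: parse the file as-is, without fragment trimming.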
+    protected void testParse(File f, boolean pp, boolean ast, boolean compare, boolean ignore_spaces, Class clazz) throws Exception {
+        testParse(f, pp, ast, compare, ignore_spaces, clazz, false);
+    }
+
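+    // Parses the given source as a complete file and returns its
+    // pretty-printed form; fails the test on a parse error.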
+    private String reprint(Reader reader) throws Exception {
+        SlottableFortranParser parser = SlottableFortranParser.createParser(reader, "");
+        Result r = parser.proot(0);
+        if (!r.hasValue()) {
+            System.out.println(parser.format(r.parseError()));
+            if (r.parseError().index > 0) {
+                Assert.assertTrue(r.hasValue(), "ParseError:" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + ".");
+            } else {
+                Assert.assertTrue(r.hasValue(), "ParseError:(" + r.parseError().index + ") " + r.parseError().msg + ".");
+            }
+        }
+        PrettyPrinter pp = new PrettyPrinter("  ", false);
+        ((ASTNode) r.semanticValue()).prettyPrint(pp);
+        return pp.toString();
+    }
+
+    protected void testParsePrintParse(File file) throws Exception {
+        // Pretty-printing must be a fixpoint: printing the parse result and
+        // re-parsing that output has to reproduce the same printed form.
+        FileReader firstReader = new FileReader(file);
+        String print1 = reprint(firstReader);
+        StringReader secondReader = new StringReader(print1);
+        String print2 = reprint(secondReader);
+        Assert.assertEquals(print2, print1);
+    }
+
+    protected void testParse(File file, boolean testPrettyPrinter, boolean printAst, boolean compareOutput, boolean compareOutputIgnoringSpaces, Class clazz, boolean trim) throws Exception {
+        Assert.assertTrue(file.exists());
+
+        Reader reader;
+
+        if (trim) {
+            // Read the fragment into memory and dedent it with trimCode()
+            // before handing it to the parser.
+            FileReader t = new FileReader(file);
+            StringWriter sw = new StringWriter();
+            int i;
+            while ((i = t.read()) != -1) {
+                sw.append((char) i);
+            }
+            t.close();
+            reader = new StringReader(trimCode(sw.toString()));
+        } else {
+            reader = new FileReader(file);
+        }
+
         SlottableFortranParser parser = SlottableFortranParser.createParser(reader, "");
         Result r;
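+        // Select the parser entry point matching the requested root class.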
         if(clazz.equals(Expr.class)) {
             r = parser.pexpr(0);
+        } else if (clazz.equals(ExecutableConstruct.class)) {
+            r = parser.pexecutable_construct(0);
+        } else if (clazz.equals(ExecutionPartConstruct.class)) {
+            r = parser.pexecution_part_construct(0);
         } else {
             r = parser.proot(0);
         }
         if (!r.hasValue()) {
             if (r.parseError().index > 0) {
-                Assert.assertTrue(r.hasValue(), "ParseError in " + f.getName() + ":" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + ".");
+                Assert.assertTrue(r.hasValue(), "ParseError in " + file.getName() + ":" + parser.location(r.parseError().index) + ":(" + r.parseError().index + ") " + r.parseError().msg + ".");
             } else {
-                Assert.assertTrue(r.hasValue(), "ParseError in " + f.getName() + ":" + ":(" + r.parseError().index + ") " + r.parseError().msg + ".");
+                Assert.assertTrue(r.hasValue(), "ParseError in " + file.getName() + ":(" + r.parseError().index + ") " + r.parseError().msg + ".");
             }
-
         }
 
-        if (pp || compare) {
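+        // Fix up the parsed AST's tree structure and verify its consistency
+        // before any output is produced.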
+        ((ASTNode) r.semanticValue()).fixTreeStructure();
+        ((ASTNode) r.semanticValue()).checkTreeStructure();
+
+        if (testPrettyPrinter || compareOutput) {
             PrettyPrinter s = new PrettyPrinter("  ", false);
             ((ASTNode) r.semanticValue()).prettyPrint(s);
 
             int ln = 1;
-            if (pp) {
+            if (testPrettyPrinter) {
                 System.out.println("\n===============================================");
-                System.out.println("== " + f.getName());
+                System.out.println("== " + file.getName());
                 System.out.println("===============================================");
                 for (String sl : s.toString().split("\n")) {
                     System.out.println(String.format("%03d  ", ln++) + sl);
                 }
             }
-            if (compare) {
+            if (compareOutput) {
                 reader.close();
-                reader = new FileReader(f);
+                reader = new FileReader(file);
                 BufferedReader br = new BufferedReader(reader);
                 StringBuffer sb = new StringBuffer();
 
@@ -104,17 +177,21 @@ public class TestBase {
                     sb.append(line).append("\n");
                 }
 
-                if (ignore_spaces) {
-                    Assert.assertEquals(trim(s.toString()), trim(sb.toString()));
+                if (compareOutputIgnoringSpaces) {
+                    Assert.assertEquals(trim(s.toString()), SlottableFortranParser.removeLineBreaks(trim(sb.toString())));
                 } else {
                     Assert.assertEquals(s.toString(), sb.toString());
                 }
             }
         }
-        if (ast) {
+        if (printAst) {
             System.out.println("===============================================");
             if(clazz.equals(Expr.class)) {
                 System.out.println(((Expr) r.semanticValue()).getAstStringWithSource());
+            } else if(clazz.equals(ExecutableConstruct.class)) {
+                System.out.println(((ExecutableConstruct) r.semanticValue()).getAstStringWithSource());
+            } else if(clazz.equals(ExecutionPartConstruct.class)) {
+                System.out.println(((ExecutionPartConstruct) r.semanticValue()).getAstStringWithSource());
             } else {
                 System.out.println(((Root) r.semanticValue()).getAstStringWithSource());
             }
-- 
GitLab