TREX-CoE · v1j4y · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/Makefile b/Makefile
@@ -1,17 +1,82 @@
-IRPF90 = irpf90 #--codelet=factor_een_blas:2 #-s nelec:10 -s nnuc:2 -s ncord:5 #-a -d
-FC     = ifort -xHost -g -mkl=parallel
-FCFLAGS= -O2 -ffree-line-length-none -I .
+FC     = ifort -O3 -ip -g -xCORE-AVX512 -qopt-zmm-usage=high -mkl=parallel -fopenmp -align rec32byte -align array32byte -fpp # -fma -ftz -fomit-frame-pointer -O3 -march=native -mkl=sequential -g -I$(PWD) #-xHost -check all 
+CC     = gcc
+CPP    = g++ -O2 -march=native # -Wall ; NOTE(review): CPP holds the C++ compiler here, whereas GNU make conventionally uses CXX for C++ and CPP for the C preprocessor
+FCFLAGS= # left empty: optimisation flags are carried in FC above; former value: -O2 -ffree-line-length-none -I .
 NINJA  = ninja
-AR = ar 
+AR = ar
 ARCHIVE = ar crs
 RANLIB = ranlib
 
-SRC=
+SRC= 
 OBJ=
-LIB=
+LIB= $(MAGMA_F90FLAGS) $(LDFLAGS) $(MAGMA_LIBS) $(STARPU_CFLAGS) $(STARPU_LIBS) -L${GLIB} -lstdc++  #magma_dgemm_async_gpu.o dgemm.o gemm/common/blas.o
+# Machine-specific install trees. MAGMADIR and GLIB reuse MAGMA and FORTRAN so each literal path is written only once and the pairs cannot drift apart.
+MAGMA         = /p/software/juwelsbooster/stages/2020/software/magma/2.5.4-gcccoremkl-9.3.0-2020.2.254
+MAGMADIR      = $(MAGMA)
+FORTRAN       = /p/software/juwelsbooster/stages/2020/software/GCCcore/9.3.0/lib64
+GLIB          = $(FORTRAN)
+CUDADIR      ?= /p/software/juwelsbooster/stages/2020/software/CUDA/11.0
+OPENBLASDIR  ?= /p/software/juwelsbooster/stages/2020/software/GCC/
+MAGMA_CFLAGS   := -DADD_ -I$(MAGMADIR)/include -I$(CUDADIR)/include
+MAGMA_F90FLAGS := -I$(MAGMADIR)/include -Dmagma_devptr_t="integer(kind=8)"
 
+#MAGMA_LIBS   := -L$(MAGMADIR)/lib -L$(CUDADIR)/lib64 -L$(OPENBLASDIR)/lib -lmagma -lcublas -lcudart -lmkl
+### STAR PU ###
+#STARPU_VERSION=1.3
+#CPPFLAGS += $(shell pkg-config --cflags starpu-$(STARPU_VERSION))
+#LDLIBS += $(shell pkg-config --libs starpu-$(STARPU_VERSION))
+#STARPU_LIBS += $(shell pkg-config --libs starpu-$(STARPU_VERSION))
+#STARPU_CFLAGS += $(shell pkg-config --cflags starpu-$(STARPU_VERSION))
+#
+#CFLAGS += -O3 -Wall -Wextra
+#
+## to avoid having to use LD_LIBRARY_PATH
+#LDLIBS += -Wl,-rpath -Wl,$(shell pkg-config --variable=libdir starpu-$(STARPU_VERSION))
+#
+## Automatically enable CUDA / OpenCL
+#STARPU_CONFIG=$(shell pkg-config --variable=includedir starpu-$(STARPU_VERSION))/starpu/$(STARPU_VERSION)/starpu_config.h
+#STARPU_CFLAGS=-I$(shell pkg-config --variable=includedir starpu-$(STARPU_VERSION))/starpu/$(STARPU_VERSION)
+#ifneq ($(shell grep "USE_CUDA 1" $(STARPU_CONFIG)),)
+#USE_CUDA=1
+#endif
+#ifneq ($(shell grep "USE_OPENCL 1" $(STARPU_CONFIG)),)
+#USE_OPENCL=1
+#endif
+#ifneq ($(shell grep "RELEASE_VERSION 99" $(STARPU_CONFIG)),)
+#USE_ENERGY=1
+#endif
+
+# irpf90 command line used by the irpf90.make rule: --codelet=elec_dist:2 presumably requests a timing codelet for elec_dist with 2 repetitions, and -s tile_size:24 presumably substitutes tile_size by 24 (TODO confirm against irpf90 usage).
+IRPF90 = irpf90 --codelet=elec_dist:2 -s tile_size:24
 -include irpf90.make
 export
 
-irpf90.make: $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile 
+# Rule for irpf90.make: runs $(IRPF90) whenever an IRPF90 source (*.irp.f here or one directory down, except under IRPF90_temp/), a *.inc.f file, this Makefile or tiling_interface.o is newer; as a prerequisite, tiling_interface.o is built before irpf90 runs.
+#irpf90.make: fortran.o tiling_interface.o magma_dgemm_async_gpu.o dgemm.o $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile
+irpf90.make: tiling_interface.o $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile
 	$(IRPF90)
+#magma_dgemm_async_gpu.o: 
+#	${CPP} $(CFLAGS) $(MAGMA_CFLAGS) -DCUBLAS_GFORTRAN -c magma_dgemm_async_gpu.cc -o magma_dgemm_async_gpu.o
+#
+#fortran.o: $(CUDADIR)/src/fortran.c
+#	$(CC) $(CFLAGS) $(MAGMA_CFLAGS) -DCUBLAS_GFORTRAN -c -o $@ $<
+#
+# Compile the Fortran tiling interface with this Makefile's Fortran flags (FCFLAGS) plus any user-supplied FFLAGS.
+tiling_interface.o: tiling_interface.f90
+	$(FC) $(FCFLAGS) $(FFLAGS) -c -o $@ $<
+
+#%.o: %.cu
+#	nvcc $(CPPFLAGS) $< -c -o $@
+
+#CFLAGS+=-DSTARPU_OPENBLAS=0
+#
+#dgemm.o:
+#	$(CC) $(CFLAGS) $(STARPU_CFLAGS) -c gemm/dgemm.c 
+#
+#gemm/dgemm: gemm/dgemm.o gemm/common/blas.o
+#
+#gemm/dgemm: LDLIBS+=-lmkl_intel_lp64
+#ifeq ($(USE_CUDA),1)
+#gemm/dgemm: LDLIBS+=-L$(CUDA_PATH)/lib64 -lcublas -lcudart
+#endif
+
diff --git a/codelet_elec_dist.irp.f b/codelet_elec_dist.irp.f
@@ -0,0 +1,29 @@
+
+program codelet_elec_dist
+  ! Timing codelet: rebuilds elec_dist irp_imax times and prints the average
+  ! irp_rdtsc tick count and cpu_time seconds per rebuild.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer, parameter :: irp_imax = 2
+
+  ! Runs once before the clocks start, so only the bld_elec_dist calls are timed.
+  call provide_elec_dist
+  ! External function (not defined in this file); its values are printed under the Cycles label.
+  double precision :: irp_rdtsc
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+    call bld_elec_dist
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'elec_dist'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/codelet_factor_een_blas_notiling.irp.f b/codelet_factor_een_blas_notiling.irp.f
@@ -0,0 +1,38 @@
+
+program codelet_factor_een_blas
+  ! Timing codelet: each iteration rebuilds tmp_c and then factor_een_blas; prints
+  ! the average irp_rdtsc tick count and cpu_time seconds per iteration.
+  ! NOTE(review): the program name lacks the _notiling suffix carried by the file name; confirm no other codelet uses the same name.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer*8 :: irp_imax 
+
+  !PROVIDE factor_een_blas tmp_c_tiled
+  PROVIDE factor_een_blas tmp_c
+  call provide_factor_een_blas
+  double precision :: irp_rdtsc
+  irp_imax = max(1_8,20_8 * 125000000_8 /(int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
+  ! NOTE(review): the size-dependent count computed above is overwritten here, so the loop runs exactly once.
+  irp_imax = 1
+  print *,"irp_imax=",irp_imax
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+  ! NOTE(review): this print sits inside the timed region, so its cost is included in the reported figures.
+  print *,i
+    !call bld_tmp_c_tiled
+    call bld_tmp_c
+    call bld_factor_een_blas
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'factor_een_blas'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/codelet_factor_een_tiled_nounroll.irp.f b/codelet_factor_een_tiled_nounroll.irp.f
@@ -0,0 +1,38 @@
+
+program codelet_factor_een_tiled_nounroll
+  ! Timing codelet: each iteration rebuilds tmp_c_tiled_nounroll and then factor_een_tiled_nounroll;
+  ! prints the average irp_rdtsc tick count and cpu_time seconds per iteration.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer*8 :: irp_imax 
+
+  !PROVIDE factor_een_blas tmp_c_tiled
+  PROVIDE factor_een_tiled_nounroll tmp_c_tiled_nounroll
+  call provide_factor_een_tiled_nounroll
+  double precision :: irp_rdtsc
+
+  irp_imax = max(1_8,20_8 * 125000000_8 /(int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
+  ! NOTE(review): the size-dependent count computed above is overwritten here, so the loop runs exactly once.
+  irp_imax = 1
+  print *,"irp_imax=",irp_imax
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+  ! NOTE(review): this print sits inside the timed region, so its cost is included in the reported figures.
+  print *,i
+    !call bld_tmp_c_tiled
+    call bld_tmp_c_tiled_nounroll
+    call bld_factor_een_tiled_nounroll
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'factor_een_tiled_nounroll'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/codelet_factor_een_tiling.irp.f b/codelet_factor_een_tiling.irp.f
@@ -0,0 +1,36 @@
+
+program codelet_factor_een_blas_tiled
+  ! Timing codelet: each iteration rebuilds tmp_c_tiled and then factor_een_blas_tiled;
+  ! prints the average irp_rdtsc tick count and cpu_time seconds per iteration.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer*8 :: irp_imax 
+
+  PROVIDE factor_een_blas_tiled tmp_c_tiled
+  call provide_factor_een_blas_tiled
+  double precision :: irp_rdtsc
+
+  irp_imax = max(1_8,20_8 * 125000000_8 /(int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
+  ! NOTE(review): the size-dependent count computed above is overwritten here, so the loop runs exactly once.
+  irp_imax = 1
+  print *,"irp_imax=",irp_imax
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+  ! NOTE(review): this print sits inside the timed region, so its cost is included in the reported figures.
+  print *,i
+    call bld_tmp_c_tiled
+    call bld_factor_een_blas_tiled
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'factor_een_blas_tiled'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/codelet_factor_een_tiling_simd.irp.f b/codelet_factor_een_tiling_simd.irp.f
@@ -0,0 +1,36 @@
+
+program codelet_factor_een_blas_tiled_simd
+  ! Timing codelet: each iteration rebuilds tmp_c_tiled_simd and then factor_een_blas_tiled_simd;
+  ! prints the average irp_rdtsc tick count and cpu_time seconds per iteration.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer*8 :: irp_imax 
+
+  PROVIDE factor_een_blas_tiled_simd tmp_c_tiled_simd
+  call provide_factor_een_blas_tiled_simd
+  double precision :: irp_rdtsc
+
+  irp_imax = max(1_8,20_8 * 125000000_8 /(int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
+  ! NOTE(review): the size-dependent count computed above is overwritten here, so the loop runs exactly once.
+  irp_imax = 1
+  print *,"irp_imax=",irp_imax
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+  ! NOTE(review): this print sits inside the timed region, so its cost is included in the reported figures.
+  print *,i
+    call bld_tmp_c_tiled_simd
+    call bld_factor_een_blas_tiled_simd
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'factor_een_blas_tiled_simd'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/codelet_factor_een_tiling_wj.irp.f b/codelet_factor_een_tiling_wj.irp.f
@@ -0,0 +1,36 @@
+
+program codelet_factor_een_blas_tiled_wj
+  ! Timing codelet: each iteration rebuilds tmp_c_tiled_wj and then factor_een_blas_tiled_wj;
+  ! prints the average irp_rdtsc tick count and cpu_time seconds per iteration.
+  implicit none
+  integer :: i
+  double precision :: ticks_0, ticks_1, cpu_0, cpu_1
+  integer*8 :: irp_imax 
+
+  PROVIDE factor_een_blas_tiled_wj tmp_c_tiled_wj
+  call provide_factor_een_blas_tiled_wj
+  double precision :: irp_rdtsc
+
+  irp_imax = max(1_8,20_8 * 125000000_8 /(int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
+  ! NOTE(review): the size-dependent count computed above is overwritten here, so the loop runs exactly once.
+  irp_imax = 1
+  print *,"irp_imax=",irp_imax
+  call cpu_time(cpu_0)
+  ticks_0 = irp_rdtsc()
+  do i=1,irp_imax
+  ! NOTE(review): this print sits inside the timed region, so its cost is included in the reported figures.
+  print *,i
+    call bld_tmp_c_tiled_wj
+    call bld_factor_een_blas_tiled_wj
+  enddo
+  ticks_1 = irp_rdtsc()
+  call cpu_time(cpu_1)
+  print *, 'factor_een_blas_tiled_wj'
+  print *, '-----------'
+  print *, 'Cycles:'
+  print *,  (ticks_1-ticks_0)/dble(irp_imax)
+  print *, 'Seconds:'
+  print *,  (cpu_1-cpu_0)/dble(irp_imax)
+end
+
+
diff --git a/el_nuc_el.irp.f b/el_nuc_el.irp.f
@@ -1,12 +1,16 @@
-BEGIN_PROVIDER [ double precision, factor_een ]
+ BEGIN_PROVIDER [ double precision, factor_een ]
+&BEGIN_PROVIDER [ double precision, factor_een_deriv_e, (4, nelec) ]
    implicit none
    BEGIN_DOC
    ! Electron -electron-nuclei contribution to Jastrow factor
    !
    ! 5436.20340250000
+   ! Derivative of the Jeen
+   ! 35533.115255
    END_DOC
    integer                        :: i, j, a, p, k, l, lmax, m, n
    double precision               :: cn, accu2, accu
+   double precision               :: daccu(1:4), daccu2(1:4)
 
 !   double precision :: ria_tmp(nelec,dim_cord_vect,nnuc)
 !   double precision :: rja_tmp(nelec,dim_cord_vect,nnuc)
@@ -56,17 +60,6 @@
      enddo
 
    enddo
-
-END_PROVIDER
-
-BEGIN_PROVIDER [ double precision, factor_een_deriv_e, (4, nelec) ]
-  implicit none
-  BEGIN_DOC
-! Derivative of the Jeen
-! 35533.115255
-  END_DOC
-  integer                        :: i, j, a, p, k, l, lmax, m, n
-  double precision               :: cn, accu, accu2, daccu(1:4), daccu2(1:4)
 
 !  factor_een_deriv_e(1:4,1:nelec) = factor_een_deriv_e_blas(1:4,1:nelec)
 !  return