Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Tiling and Batched DGEMM. #4

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 72 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,17 +1,82 @@
# Toolchain, archive-tool and library settings used by the build.
# NOTE(review): FC, FCFLAGS, AR, SRC and LIB are each assigned twice in this
# view (it looks like the old and new sides of a diff interleaved). With
# ordinary make assignment the later line is the one that takes effect --
# confirm which set is intended before relying on any of them.
IRPF90 = irpf90 #--codelet=factor_een_blas:2 #-s nelec:10 -s nnuc:2 -s ncord:5 #-a -d
FC = ifort -xHost -g -mkl=parallel
FCFLAGS= -O2 -ffree-line-length-none -I .
# Second FC assignment: Intel compiler line with AVX-512, MKL and OpenMP options.
FC = ifort -O3 -ip -g -xCORE-AVX512 -qopt-zmm-usage=high -mkl=parallel -fopenmp -align rec32byte -align array32byte -fpp # -fma -ftz -fomit-frame-pointer -O3 -march=native -mkl=sequential -g -I$(PWD) #-xHost -check all
CC = gcc
CPP = g++ -O2 -march=native # -Wall
# Second FCFLAGS assignment: everything after '#' is a comment, so the value is empty.
FCFLAGS= #-O2 -ffree-line-length-none -I .
NINJA = ninja
AR = ar
AR = ar
ARCHIVE = ar crs
RANLIB = ranlib

SRC=
SRC=
OBJ=
LIB=
# NOTE(review): LDFLAGS, MAGMA_LIBS, STARPU_CFLAGS and STARPU_LIBS are expanded
# here, but no active definition of them is visible in this Makefile (the
# MAGMA_LIBS / STARPU_* assignments further down are commented out) --
# presumably they come from the environment; confirm.
LIB= $(MAGMA_F90FLAGS) $(LDFLAGS) $(MAGMA_LIBS) $(STARPU_CFLAGS) $(STARPU_LIBS) -L${GLIB} -lstdc++ #magma_dgemm_async_gpu.o dgemm.o gemm/common/blas.o
# Hard-coded, site-specific installation paths. MAGMA and MAGMADIR hold the
# same path, as do FORTRAN and GLIB. CUDADIR and OPENBLASDIR use '?=' and can
# therefore be overridden from the environment.
MAGMA = /p/software/juwelsbooster/stages/2020/software/magma/2.5.4-gcccoremkl-9.3.0-2020.2.254
MAGMADIR = /p/software/juwelsbooster/stages/2020/software/magma/2.5.4-gcccoremkl-9.3.0-2020.2.254
FORTRAN = /p/software/juwelsbooster/stages/2020/software/GCCcore/9.3.0/lib64
GLIB = /p/software/juwelsbooster/stages/2020/software/GCCcore/9.3.0/lib64
CUDADIR ?= /p/software/juwelsbooster/stages/2020/software/CUDA/11.0
OPENBLASDIR ?= /p/software/juwelsbooster/stages/2020/software/GCC/
# Include paths (and preprocessor defines) for C and Fortran sources that use MAGMA.
MAGMA_CFLAGS := -DADD_ -I$(MAGMADIR)/include -I$(CUDADIR)/include
MAGMA_F90FLAGS := -I$(MAGMADIR)/include -Dmagma_devptr_t="integer(kind=8)"

#MAGMA_LIBS := -L$(MAGMADIR)/lib -L$(CUDADIR)/lib64 -L$(OPENBLASDIR)/lib \
-lmagma -lcublas -lcudart -lmkl
### STAR PU ###
#STARPU_VERSION=1.3
#CPPFLAGS += $(shell pkg-config --cflags starpu-$(STARPU_VERSION))
#LDLIBS += $(shell pkg-config --libs starpu-$(STARPU_VERSION))
#STARPU_LIBS += $(shell pkg-config --libs starpu-$(STARPU_VERSION))
#STARPU_CFLAGS += $(shell pkg-config --cflags starpu-$(STARPU_VERSION))
#
#CFLAGS += -O3 -Wall -Wextra
#
## to avoid having to use LD_LIBRARY_PATH
#LDLIBS += -Wl,-rpath -Wl,$(shell pkg-config --variable=libdir starpu-$(STARPU_VERSION))
#
## Automatically enable CUDA / OpenCL
#STARPU_CONFIG=$(shell pkg-config --variable=includedir starpu-$(STARPU_VERSION))/starpu/$(STARPU_VERSION)/starpu_config.h
#STARPU_CFLAGS=-I$(shell pkg-config --variable=includedir starpu-$(STARPU_VERSION))/starpu/$(STARPU_VERSION)
#ifneq ($(shell grep "USE_CUDA 1" $(STARPU_CONFIG)),)
#USE_CUDA=1
#endif
#ifneq ($(shell grep "USE_OPENCL 1" $(STARPU_CONFIG)),)
#USE_OPENCL=1
#endif
#ifneq ($(shell grep "RELEASE_VERSION 99" $(STARPU_CONFIG)),)
#USE_ENERGY=1
#endif


# irpf90 command line used to (re)generate irpf90.make: requests a codelet for
# elec_dist (with the value 2) and substitutes tile_size with 24.
# This assignment comes after the earlier IRPF90 line, so it is the one in effect.
# '-include' pulls in the generated irpf90.make without failing when it does not
# exist yet; the bare 'export' exports the variables of this Makefile to the
# commands and sub-makes it runs.
# NOTE(review): two 'irpf90.make:' prerequisite lines are visible below (they
# look like the old and new sides of a diff), and the '$(IRPF90)' recipe line
# shows no leading tab in this view -- make requires one; confirm in the file.
IRPF90 = irpf90 --codelet=elec_dist:2 -s tile_size:24
-include irpf90.make
export

irpf90.make: $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile
#irpf90.make: fortran.o tiling_interface.o magma_dgemm_async_gpu.o dgemm.o $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile
irpf90.make: tiling_interface.o $(filter-out IRPF90_temp/%, $(wildcard */*.irp.f)) $(wildcard *.irp.f) $(wildcard *.inc.f) Makefile
$(IRPF90)

#magma_dgemm_async_gpu.o:
# ${CPP} $(CFLAGS) $(MAGMA_CFLAGS) -DCUBLAS_GFORTRAN -c magma_dgemm_async_gpu.cc -o magma_dgemm_async_gpu.o
#
#fortran.o: $(CUDADIR)/src/fortran.c
# $(CC) $(CFLAGS) $(MAGMA_CFLAGS) -DCUBLAS_GFORTRAN -c -o $@ $<
#
# Compile tiling_interface.f90 into tiling_interface.o with the Fortran
# compiler selected by FC. The flags variable defined in this Makefile is
# FCFLAGS; FFLAGS is never assigned here, so it is kept only as an extra,
# optional hook (e.g. supplied from the environment or the command line).
tiling_interface.o: tiling_interface.f90
	$(FC) $(FCFLAGS) $(FFLAGS) -c -o $@ $<


#%.o: %.cu
# nvcc $(CPPFLAGS) $< -c -o $@

#CFLAGS+=-DSTARPU_OPENBLAS=0
#
#dgemm.o:
# $(CC) $(CFLAGS) $(STARPU_CFLAGS) -c gemm/dgemm.c
#
#gemm/dgemm: gemm/dgemm.o gemm/common/blas.o
#
#gemm/dgemm: LDLIBS+=-lmkl_intel_lp64
#ifeq ($(USE_CUDA),1)
#gemm/dgemm: LDLIBS+=-L$(CUDA_PATH)/lib64 -lcublas -lcudart
#endif

29 changes: 29 additions & 0 deletions codelet_elec_dist.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

program codelet_elec_dist
  ! Timing driver for elec_dist: after one untimed provide_elec_dist call,
  ! bld_elec_dist is called irp_imax times between two timer readings and the
  ! per-call averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  ! Number of timed rebuilds.
  integer, parameter :: irp_imax = 2
  integer :: i
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  ! Untimed call made once before the measurement starts.
  call provide_elec_dist

  ! --- timed region ---------------------------------------------------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    call bld_elec_dist
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax calls -------------------------
  print *, 'elec_dist'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


38 changes: 38 additions & 0 deletions codelet_factor_een_blas_notiling.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

program codelet_factor_een_blas
  ! Timing driver for the non-tiled factor_een_blas path: bld_tmp_c followed by
  ! bld_factor_een_blas is run irp_imax times between two timer readings and
  ! the per-iteration averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  integer :: i
  integer*8 :: irp_imax
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  ! The tiled variant (factor_een_blas tmp_c_tiled) is disabled in this codelet.
  PROVIDE factor_een_blas tmp_c

  ! Untimed call made once before the measurement starts.
  call provide_factor_een_blas

  ! Iteration count derived from nelec, nnuc and ncord (not declared here;
  ! presumably IRPF90-provided entities) ...
  irp_imax = max(1_8, 20_8 * 125000000_8 / (int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
  ! ... and then immediately overridden, so exactly one iteration is timed.
  irp_imax = 1
  print *, "irp_imax=", irp_imax

  ! --- timed region (note: the per-iteration print is inside it) --------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    print *, i
    ! bld_tmp_c_tiled is not called here; the non-tiled builder is used.
    call bld_tmp_c
    call bld_factor_een_blas
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax iterations --------------------
  print *, 'factor_een_blas'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


38 changes: 38 additions & 0 deletions codelet_factor_een_tiled_nounroll.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@

program codelet_factor_een_tiled_nounroll
  ! Timing driver for factor_een_tiled_nounroll: bld_tmp_c_tiled_nounroll
  ! followed by bld_factor_een_tiled_nounroll is run irp_imax times between two
  ! timer readings and the per-iteration averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  integer :: i
  integer*8 :: irp_imax
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  ! The factor_een_blas / tmp_c_tiled pair is disabled in this codelet.
  PROVIDE factor_een_tiled_nounroll tmp_c_tiled_nounroll

  ! Untimed call made once before the measurement starts.
  call provide_factor_een_tiled_nounroll

  ! Iteration count derived from nelec, nnuc and ncord (not declared here;
  ! presumably IRPF90-provided entities) ...
  irp_imax = max(1_8, 20_8 * 125000000_8 / (int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
  ! ... and then immediately overridden, so exactly one iteration is timed.
  irp_imax = 1
  print *, "irp_imax=", irp_imax

  ! --- timed region (note: the per-iteration print is inside it) --------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    print *, i
    ! bld_tmp_c_tiled is not called here; the nounroll builder is used.
    call bld_tmp_c_tiled_nounroll
    call bld_factor_een_tiled_nounroll
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax iterations --------------------
  print *, 'factor_een_tiled_nounroll'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


36 changes: 36 additions & 0 deletions codelet_factor_een_tiling.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

program codelet_factor_een_blas_tiled
  ! Timing driver for factor_een_blas_tiled: bld_tmp_c_tiled followed by
  ! bld_factor_een_blas_tiled is run irp_imax times between two timer readings
  ! and the per-iteration averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  integer :: i
  integer*8 :: irp_imax
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  PROVIDE factor_een_blas_tiled tmp_c_tiled

  ! Untimed call made once before the measurement starts.
  call provide_factor_een_blas_tiled

  ! Iteration count derived from nelec, nnuc and ncord (not declared here;
  ! presumably IRPF90-provided entities) ...
  irp_imax = max(1_8, 20_8 * 125000000_8 / (int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
  ! ... and then immediately overridden, so exactly one iteration is timed.
  irp_imax = 1
  print *, "irp_imax=", irp_imax

  ! --- timed region (note: the per-iteration print is inside it) --------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    print *, i
    call bld_tmp_c_tiled
    call bld_factor_een_blas_tiled
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax iterations --------------------
  print *, 'factor_een_blas_tiled'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


36 changes: 36 additions & 0 deletions codelet_factor_een_tiling_simd.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

program codelet_factor_een_blas_tiled_simd
  ! Timing driver for factor_een_blas_tiled_simd: bld_tmp_c_tiled_simd followed
  ! by bld_factor_een_blas_tiled_simd is run irp_imax times between two timer
  ! readings and the per-iteration averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  integer :: i
  integer*8 :: irp_imax
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  PROVIDE factor_een_blas_tiled_simd tmp_c_tiled_simd

  ! Untimed call made once before the measurement starts.
  call provide_factor_een_blas_tiled_simd

  ! Iteration count derived from nelec, nnuc and ncord (not declared here;
  ! presumably IRPF90-provided entities) ...
  irp_imax = max(1_8, 20_8 * 125000000_8 / (int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
  ! ... and then immediately overridden, so exactly one iteration is timed.
  irp_imax = 1
  print *, "irp_imax=", irp_imax

  ! --- timed region (note: the per-iteration print is inside it) --------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    print *, i
    call bld_tmp_c_tiled_simd
    call bld_factor_een_blas_tiled_simd
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax iterations --------------------
  print *, 'factor_een_blas_tiled_simd'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


36 changes: 36 additions & 0 deletions codelet_factor_een_tiling_wj.irp.f
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

program codelet_factor_een_blas_tiled_wj
  ! Timing driver for factor_een_blas_tiled_wj: bld_tmp_c_tiled_wj followed by
  ! bld_factor_een_blas_tiled_wj is run irp_imax times between two timer
  ! readings and the per-iteration averages are printed.
  ! Local names are left as they are on purpose: in an IRPF90 source a renamed
  ! local could collide with a provider name defined elsewhere.
  implicit none

  ! Function defined outside this program unit; its result is what gets
  ! reported under the 'Cycles:' label.
  double precision :: irp_rdtsc

  integer :: i
  integer*8 :: irp_imax
  double precision :: cpu_0, cpu_1
  double precision :: ticks_0, ticks_1

  PROVIDE factor_een_blas_tiled_wj tmp_c_tiled_wj

  ! Untimed call made once before the measurement starts.
  call provide_factor_een_blas_tiled_wj

  ! Iteration count derived from nelec, nnuc and ncord (not declared here;
  ! presumably IRPF90-provided entities) ...
  irp_imax = max(1_8, 20_8 * 125000000_8 / (int(nelec,8) * int(nelec,8) * int(nnuc,8) * ncord))
  ! ... and then immediately overridden, so exactly one iteration is timed.
  irp_imax = 1
  print *, "irp_imax=", irp_imax

  ! --- timed region (note: the per-iteration print is inside it) --------
  call cpu_time(cpu_0)
  ticks_0 = irp_rdtsc()
  do i = 1, irp_imax
    print *, i
    call bld_tmp_c_tiled_wj
    call bld_factor_een_blas_tiled_wj
  enddo
  ticks_1 = irp_rdtsc()
  call cpu_time(cpu_1)

  ! --- report: averages over the irp_imax iterations --------------------
  print *, 'factor_een_blas_tiled_wj'
  print *, '-----------'
  print *, 'Cycles:'
  print *, (ticks_1 - ticks_0) / dble(irp_imax)
  print *, 'Seconds:'
  print *, (cpu_1 - cpu_0) / dble(irp_imax)
end


17 changes: 5 additions & 12 deletions el_nuc_el.irp.f
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
BEGIN_PROVIDER [ double precision, factor_een ]
BEGIN_PROVIDER [ double precision, factor_een ]
&BEGIN_PROVIDER [ double precision, factor_een_deriv_e, (4, nelec) ]
implicit none
BEGIN_DOC
! Electron -electron-nuclei contribution to Jastrow factor
!
! 5436.20340250000
! Derivative of the Jeen
! 35533.115255
END_DOC
integer :: i, j, a, p, k, l, lmax, m, n
double precision :: cn, accu2, accu
double precision :: daccu(1:4), daccu2(1:4)

! double precision :: ria_tmp(nelec,dim_cord_vect,nnuc)
! double precision :: rja_tmp(nelec,dim_cord_vect,nnuc)
Expand Down Expand Up @@ -56,17 +60,6 @@
enddo

enddo

END_PROVIDER

BEGIN_PROVIDER [ double precision, factor_een_deriv_e, (4, nelec) ]
implicit none
BEGIN_DOC
! Derivative of the Jeen
! 35533.115255
END_DOC
integer :: i, j, a, p, k, l, lmax, m, n
double precision :: cn, accu, accu2, daccu(1:4), daccu2(1:4)

! factor_een_deriv_e(1:4,1:nelec) = factor_een_deriv_e_blas(1:4,1:nelec)
! return
Expand Down
Loading