File: VkFFT_API_guide.lyx

package info (click to toggle)
vkfft 1.2.26%2Bds1-1
links: PTS, VCS
area: main
in suites: bookworm
size: 7,696 kB
sloc: ansic: 30,080; cpp: 10,808; makefile: 6
file content (7548 lines) | stat: -rw-r--r-- 169,736 bytes
#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass article
\begin_preamble
\usepackage{sourcecodepro}
\usepackage[parfill]{parskip}
\usepackage{enumitem}
\setlist[itemize]{leftmargin=*}
\usepackage{minted}
\usepackage{mdframed}
\definecolor{bg}{rgb}{0.95,0.95,0.95}
\usepackage{hyperref}
\hypersetup{
    colorlinks,
    citecolor=black,
    filecolor=black,
    linkcolor=black,
    urlcolor=black
}
\end_preamble
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts true
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures true
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize 12
\spacing single
\use_hyperref false
\papersize default
\use_geometry true
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\leftmargin 2.5cm
\topmargin 2.5cm
\rightmargin 2.5cm
\bottommargin 2.5cm
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
UseRawInputEncoding
\end_layout

\begin_layout Plain Layout


\backslash
begin{titlepage} 	
\end_layout

\begin_layout Plain Layout


\backslash
centering 	
\end_layout

\begin_layout Plain Layout


\backslash
vspace{1cm} 	
\end_layout

\begin_layout Plain Layout

{
\backslash
scshape
\backslash
LARGE VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero Fast Fourier Transform library
 
\backslash
par} 		
\end_layout

\begin_layout Plain Layout


\backslash
vspace{1.5cm} 	
\end_layout

\begin_layout Plain Layout

{
\backslash
huge
\backslash
bfseries API guide with examples
\backslash
par} 	
\end_layout

\begin_layout Plain Layout


\backslash
vspace{2cm} 	
\end_layout

\begin_layout Plain Layout

{
\backslash
Large Dmitrii Tolmachev
\backslash
par} 	
\end_layout

\begin_layout Plain Layout

	
\end_layout

\begin_layout Plain Layout


\backslash
vspace{1cm} 	
\end_layout

\begin_layout Plain Layout

{
\backslash
large August 2022, version 1.2.26
\backslash
par} 
\end_layout

\begin_layout Plain Layout


\backslash
end{titlepage}
\end_layout

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Standard
\begin_inset CommandInset toc
LatexCommand tableofcontents

\end_inset


\end_layout

\begin_layout Standard
\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Section
Introduction
\end_layout

\begin_layout Standard
This document describes VkFFT - Vulkan/CUDA/HIP/OpenCL/Level Zero Fast Fourier
 Transform library.
 It describes the features and current limitations of VkFFT, explains the
 API and compares it to other FFT libraries (like FFTW and cuFFT) on the
 set of examples.
 It is by no means the final version, so if there is something unclear -
 feel free to contact me (dtolm96@gmail.com), so I can update it.
 
\end_layout

\begin_layout Standard
\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Section
Using the VkFFT API
\end_layout

\begin_layout Standard
This chapter will cover the basics of VkFFT.
 The Fourier transform of a discrete sequence is called the Discrete Fourier
 Transform (DFT).
 It is defined by the following formula:
\begin_inset Formula 
\begin{equation}
X_{k}=\stackrel[n=0]{N-1}{\sum}x_{n}e^{-\frac{2\pi i}{N}nk}=\mathrm{DFT}{}_{N}(x_{n},k),\label{eq:dft}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
where 
\begin_inset Formula $x_{n}$
\end_inset

is the input sequence, 
\begin_inset Formula $N$
\end_inset

 is the length of the input sequence and 
\begin_inset Formula $k\in[0,N-1],k\in\mathbb{Z}$
\end_inset

is the output index, corresponding to frequency in Fourier space.
 Correspondingly, the inverse DFT is defined as follows:
\begin_inset Formula 
\begin{equation}
x_{n}=\stackrel[k=0]{N-1}{\sum}X_{k}e^{\frac{2\pi i}{N}nk}=\mathrm{iDFT}{}_{N}(X_{k},n)
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
VkFFT follows the same definitions as FFTW and cuFFT - forward FFT has the
 exponent sign 
\begin_inset Formula $-1$
\end_inset

, while the inverse has the exponent sign 
\begin_inset Formula $1$
\end_inset

.
 Note, that inverse transform by default is unnormalized, so to get the
 input sequence after FFT + iFFT, the user has to divide the result by 
\begin_inset Formula $N$
\end_inset

.
\end_layout

\begin_layout Subsection
Installing VkFFT
\end_layout

\begin_layout Standard
VkFFT is distributed as a header-only library.
 The installation process consists of the following steps:
\end_layout

\begin_layout Enumerate
\noindent
\align left
Copy vkFFT.h file into one of the directories included in the user's project.
\end_layout

\begin_layout Enumerate
\noindent
Define VKFFT_BACKEND as a number corresponding to the API used in the user's
 project: 0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL, 4 - Level Zero.
 Definition is done like:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{make}
\end_layout

\begin_layout Plain Layout

-DVKFFT_BACKEND=X
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset

 in GCC or as 
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

set(VKFFT_BACKEND 1 CACHE STRING "0 - Vulkan, 1 - CUDA, 2 - HIP, 3 - OpenCL,
 4 - Level Zero")
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset

in CMake.
\end_layout

\begin_layout Enumerate
Depending on the API backend, the project must use additional libraries
 for run-time compilation:
\end_layout

\begin_deeper
\begin_layout Enumerate
Vulkan API: SPIRV, glslang and Vulkan.
 Define VK_API_VERSION to the available Vulkan version.
 Sample CMakeLists can look like this:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

find_package(Vulkan REQUIRED)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(${PROJECT_NAME} PUBLIC -DVK_API_VERSION=11)#10
 - Vulkan 1.0, 11 - Vulkan 1.1, 12 - Vulkan 1.2 
\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/gl
slang-master/glslang/Include/) 
\end_layout

\begin_layout Plain Layout

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/glslang-master)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vk
FFT/)
\end_layout

\begin_layout Plain Layout

add_library(VkFFT INTERFACE)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=0)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_link_libraries(${PROJECT_NAME} PUBLIC SPIRV glslang Vulkan::Vulkan
 VkFFT)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
CUDA API: CUDA and NVRTC.
 Sample CMakeLists can look like this:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

find_package(CUDA 9.0 REQUIRED) 	
\end_layout

\begin_layout Plain Layout

enable_language(CUDA) 	
\end_layout

\begin_layout Plain Layout

set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 35 60 70
 75 80 86) 	
\end_layout

\begin_layout Plain Layout

target_compile_options(${PROJECT_NAME} PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:
 	
\end_layout

\begin_layout Plain Layout

	-DVKFFT_BACKEND=${VKFFT_BACKEND} 	
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_35,code=compute_35
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_60,code=compute_60
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_70,code=compute_70
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_75,code=compute_75
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_80,code=compute_80
\end_layout

\begin_layout Plain Layout

	-gencode arch=compute_86,code=compute_86>")
\end_layout

\begin_layout Plain Layout

set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION
 ON)
\end_layout

\begin_layout Plain Layout

set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS
 ON)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

find_library(CUDA_NVRTC_LIB libnvrtc nvrtc HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64
" "${LIBNVRTC_LIBRARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64
 /usr/local/cuda/lib64)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vk
FFT/)
\end_layout

\begin_layout Plain Layout

add_library(VkFFT INTERFACE)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=1)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_link_libraries(${PROJECT_NAME} PUBLIC ${CUDA_LIBRARIES} cuda ${CUDA_NVRTC
_LIB} VkFFT)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
HIP API: HIP and HIPRTC.
 Sample CMakeLists can look like this:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
\end_layout

\begin_layout Plain Layout

find_package(hip)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vk
FFT/)
\end_layout

\begin_layout Plain Layout

add_library(VkFFT INTERFACE)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=2)
\end_layout

\begin_layout Plain Layout

#target_compile_definitions(${PROJECT_NAME} PUBLIC -DVKFFT_OLD_ROCM) #ROCm
 versions before 4.5 needed kernel include of hiprtc
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_link_libraries(${PROJECT_NAME} PUBLIC hip::host VkFFT)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
OpenCL API: OpenCL.
 Sample CMakeLists can look like this:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

find_package(OpenCL REQUIRED)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vk
FFT/)
\end_layout

\begin_layout Plain Layout

add_library(VkFFT INTERFACE)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=3)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_link_libraries(${PROJECT_NAME} PUBLIC OpenCL::OpenCL VkFFT)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
Level Zero API: Level Zero; Clang and llvm-spirv must be in the system path
 (for kernel compilation).
 Sample CMakeLists can look like this:
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{cmake}
\end_layout

\begin_layout Plain Layout

set(LevelZero_LIBRARY "/usr/lib/x86_64-linux-gnu/")
\end_layout

\begin_layout Plain Layout

set(LevelZero_INCLUDE_DIR "/usr/include/")
\end_layout

\begin_layout Plain Layout

find_library(
\end_layout

\begin_layout Plain Layout

	LevelZero_LIB
\end_layout

\begin_layout Plain Layout

	NAMES "ze_loader"
\end_layout

\begin_layout Plain Layout

	PATHS ${LevelZero_LIBRARY}
\end_layout

\begin_layout Plain Layout

	PATH_SUFFIXES "lib" "lib64"
\end_layout

\begin_layout Plain Layout

	NO_DEFAULT_PATH
\end_layout

\begin_layout Plain Layout

  )
\end_layout

\begin_layout Plain Layout

find_path(
\end_layout

\begin_layout Plain Layout

	LevelZero_INCLUDES
\end_layout

\begin_layout Plain Layout

	NAMES "ze_api.h"
\end_layout

\begin_layout Plain Layout

	PATHS ${LevelZero_INCLUDE_DIR}
\end_layout

\begin_layout Plain Layout

	PATH_SUFFIXES "include" 
\end_layout

\begin_layout Plain Layout

	NO_DEFAULT_PATH
\end_layout

\begin_layout Plain Layout

  )
\end_layout

\begin_layout Plain Layout

target_include_directories(${PROJECT_NAME} PUBLIC ${LevelZero_INCLUDES})
\end_layout

\begin_layout Plain Layout

add_library(VkFFT INTERFACE)
\end_layout

\begin_layout Plain Layout

target_compile_definitions(VkFFT INTERFACE -DVKFFT_BACKEND=4)
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

target_link_libraries(${PROJECT_NAME} PUBLIC LevelZero VkFFT)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\end_deeper
\begin_layout Subsection
Fourier Transform Setup
\end_layout

\begin_layout Standard
VkFFT follows a plan structure like FFTW/cuFFT with a notable difference
 - there is a unified interface to all transforms.
 This means that there are no separate functions like fftPlan1D/fftPlan2D/fftPla
nMany/etc.
 The initialization is done through a single configuration struct - VkFFTConfigu
ration.
 Each parameter of it will be covered in detail in this document.
 Plans in VkFFT are called VkFFTApplication and they are created with a
 unified initializeVkFFT call.
 
\end_layout

\begin_layout Standard
As the code is written in C, don't forget to zero-initialize used structs!
\end_layout

\begin_layout Standard
During the initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunch
Configuration) call VkFFT performs kernel generation and compilation from
 scratch (kernel reuse may be added later).
 The overall process of initialization looks like this:
\end_layout

\begin_layout Enumerate
Get device parameters, perform default initialization of internal copy of
 configuration struct inside the VkFFTApplication, then fill in user-defined
 parameters from inputLaunchConfiguration.
\begin_inset space ~
\end_inset

VkFFTApplication is passed as a pointer, so initializeVkFFT modifies the
 user-provided application.
\end_layout

\begin_layout Enumerate
By default, there are two internal FFT plans created - inverse and forward.
 Multidimensional FFT is done as a combination of 1D FFTs in each axis direction.
 For each axis, the VkFFTPlanAxis function is called.
\end_layout

\begin_layout Enumerate
VkFFTPlanAxis configures parameters for each axis.
 It may perform additional memory allocations (see: memory allocated by
 VkFFT).
\end_layout

\begin_layout Enumerate
shaderGenVkFFT generates the code corresponding to the axis in a char buffer
 (each axis may require more than one kernel: see Four-step FFT, Bluestein's
 algorithm for FFT).
 
\end_layout

\begin_layout Enumerate
Code is then compiled with the run-time compiler of the specified backend.
\end_layout

\begin_layout Standard
Once the plan is no longer needed, a call to the deleteVkFFT function frees
 all the allocated resources.
 There are no processes launched that continue to work outside of the VkFFT
 related function calls.
\end_layout

\begin_layout Subsection
Fourier Transform types and their definitions
\end_layout

\begin_layout Standard
VkFFT supports commonly used Complex to complex (C2C), real to complex (R2C),
 complex to real (C2R) transformations and real to real (R2R) Discrete Cosine
 Transformations of types I, II, III and IV.
 VkFFT uses the same definitions as FFTW, except for the multidimensional
 FFT axis ordering: in FFTW dimensions are ordered with the decrease in
 consecutive elements stride, while VkFFT does the opposite - the first
 axis is the non-strided axis (the one that has elements located consecutively
 in memory with no gaps, usually named as the X-axis).
 So, in FFTW dimensions are specified as ZYX and in VkFFT as XYZ.
 This felt more logical to me - no matter if there are 1, 2 or 3 dimensions,
 the user can always find the axis with the same stride at the same position.
 This choice doesn't require any modification in the user's data management
 - just provide the FFT dimensions in the reverse order to VkFFT.
 
\end_layout

\begin_layout Standard
In addition to up to the 3 dimensions of FFT, VkFFT supports two forms of
 batching: the number of coordinates and the number of systems.
 The choice of two distinct batching ways is made to support matrix-vector
 convolutions, where the kernel is presented as a matrix.
 Overall, the layout of VkFFT can be described as WHDCN - width, height,
 depth, coordinate and number of systems (in order of increasing strides,
 starting with 1 for width).
 Coordinate and number of systems can be 1.
 If the user has 1 as one of the FFT dimensions, the user can omit it from
 setup altogether, as FFT of size 1 produces the same number as the input.
 Often, the coordinate part of the layout is not used, so the main batching
 is done by specifying N.
\end_layout

\begin_layout Standard
VkFFT assumes that complex numbers are stored consecutively in memory: RIRIRI...
 where R denotes the real part of the complex number and I denotes the imaginary
 part.
 There is no difference between using a float2/double2/half2 container or
 accessing memory as float/double/half as long as the byte order remains the
 same.
\end_layout

\begin_layout Standard
This section and the next one will cover the basics of VkFFT data layouts
 and memory management.
 
\end_layout

\begin_layout Subsubsection
C2C transforms
\end_layout

\begin_layout Standard
The base FFT algorithm - C2C in VkFFT has the same definition as FFTW.
 Forward FFT has the exponent sign 
\begin_inset Formula $-1$
\end_inset

, while the inverse has the exponent sign 
\begin_inset Formula $1$
\end_inset

.
 By default, the inverse transform is unnormalized.
 
\begin_inset Formula $N_{x}N_{y}N_{z}$
\end_inset

 complex numbers map to 
\begin_inset Formula $N_{x}N_{y}N_{z}$
\end_inset

 complex numbers and no additional padding is required.
 The resulting data order will be the same as in FFTW/cuFFT, unless special
 parameters are provided in configuration (see: advanced memory management)
\end_layout

\begin_layout Subsubsection
R2C/C2R transforms
\end_layout

\begin_layout Standard
R2C/C2R transforms can be explained as C2C transforms with imaginary part
 set to zero.
 They exploit Hermitian symmetry of the result: 
\begin_inset Formula $X_{k}=X_{N-k}^{*}$
\end_inset

 on the non-strided axis (the one that has elements located consecutively
 in memory with no gaps).
 This results in a reduction of required memory to store the complex result
 - we may only store 
\begin_inset Formula $floor(\frac{N_{x}}{2})+1$
\end_inset

 complex numbers instead of 
\begin_inset Formula $N_{x}$
\end_inset

.
 However, this results in memory requirements mismatch between input and
 output in R2C: 
\begin_inset Formula $floor(\frac{N_{x}}{2})+1$
\end_inset

 complex elements will require 
\begin_inset Formula $N_{x}+2$
\end_inset

 real numbers worth of memory for even 
\begin_inset Formula $N_{x}$
\end_inset

 and 
\begin_inset Formula $N_{x}+1$
\end_inset

 real numbers worth of memory for odd 
\begin_inset Formula $N_{x}$
\end_inset

.
 For C2R the situation is reversed.
 There are two approaches to this problem: pad each sequence of the non-strided
 axis with zeros to the required length or use out-of-place mode.
 More information on how to do this will be given in the next section.
\end_layout

\begin_layout Subsubsection
R2R (DCT) transforms
\end_layout

\begin_layout Standard
R2R transforms in VkFFT are implemented in the form of Discrete cosine transform
s of types I, II, III and IV.
 Their definitions and transforms results match FFTW:
\end_layout

\begin_layout Enumerate
DCT-I: 
\begin_inset Formula $X_{k}=x_{0}+(-1)^{k}x_{N-1}+2\stackrel[n=1]{N-2}{\sum}x_{n}cos(\frac{\pi}{N-1}nk)$
\end_inset

, inverse of DCT-I (itself)
\end_layout

\begin_layout Enumerate
DCT-II: 
\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})k)$
\end_inset

, inverse of DCT-III
\end_layout

\begin_layout Enumerate
DCT-III: 
\begin_inset Formula $X_{k}=x_{0}+2\stackrel[n=1]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}n(k+\frac{1}{2}))$
\end_inset

, inverse of DCT-II
\end_layout

\begin_layout Enumerate
DCT-IV: 
\begin_inset Formula $X_{k}=2\stackrel[n=0]{N-1}{\sum}x_{n}cos(\frac{\pi}{N}(n+\frac{1}{2})(k+\frac{1}{2}))$
\end_inset

, inverse of DCT-IV (itself)
\end_layout

\begin_layout Standard
R2R transforms are performed by redefinition of them to the C2C transforms
 (internal C2C sequence length can be different from the input R2R sequence
 length).
 R2R transform performs a one-to-one mapping between real numbers, so they
 don't require stride management, unlike R2C/C2R.
\end_layout

\begin_layout Subsection
Memory management, data layouts for different transforms
\end_layout

\begin_layout Subsubsection
VkFFT buffers
\end_layout

\begin_layout Standard
VkFFT allows for explicit control over the data flow, which makes both in-place
 and out-of-place transforms possible.
 Buffers are passed to VkFFT as VkBuffer pointer in Vulkan, as double void
 pointers in CUDA/HIP/Level Zero and as cl_mem pointer in OpenCL.
 This is done to maintain a uniform data pattern because some of the buffers
 can be allocated automatically.
 
\end_layout

\begin_layout Standard
The main buffer is called buffer and it always has to be provided, either
 during the plan creation or when the plan is executed.
 All calculations are performed in this buffer and it is always overwritten.
 To do calculations out-of-place, VkFFT provides an option to specify inputBuffe
r/outputBuffer buffer.
 The logic behind their usage is fairly simple - the user specifies inputBuffer
 if the input data has to be read from a buffer, different from the main
 buffer.
 As the data is read only once and nothing is written back to the inputBuffer,
 this allows doing truly out-of-place transformations.
 The same logic applies to outputBuffer with the difference that it is responsib
le for the absolute last write of the VkFFT.
 It is possible to use all three buffers to create complex data management
 paths.
\end_layout

\begin_layout Standard
It must be noted, that sometimes FFT can not be done inside one buffer (see:
 Four-Step FFT algorithm, Bluestein's algorithm).
 To compute FFT in these cases, there exists tempBuffer buffer and data
 is transferred between the main buffer and tempBuffer during the FFT execution.
 The ordering of transfers between the main buffer and tempBuffer is done
 in such a way, so the initial data read and final data write are obeying
 the configuration from the previous paragraph.
 Users can allocate tempBuffer themselves of some memory that does not have
 any useful information at the time of FFT execution (the tempBuffer size
 can depend on the configuration, so this is a rather advanced operation
 - read more in the advanced memory management section) or allow VkFFT to
 manage tempBuffer allocation itself (tempBuffer will be freed at the deleteVkFF
T call).
\end_layout

\begin_layout Standard
To compute convolutions and cross-correlations, a kernel buffer has to be
 specified.
 It must have the same layout as the result of the FFT transform.
\end_layout

\begin_layout Subsubsection
VkFFT buffers strides.
 A special case of R2C/C2R transforms
\end_layout

\begin_layout Standard
To have better control of memory, the user can specify the strides between
 consecutive elements of different axes for H (height), D (depth) and C
 (coordinate) parts of the WHDCN layout (W (width) stride is fixed to be
 1, N (number of systems) stride will be consecutive of C in memory if C
 is used, otherwise N will propagate the previous non-uniform stride multiplied
 by the corresponding axis length).
 Strides are specified not in bytes, but in the element type used - similar
 to the way how the user would access the corresponding element in the array.
 If all elements are consecutive in C2C, stride for H will be equal to the
 FFT length of W axis, stride for D will be multiplication of first two
 FFT axis lengths, stride for C will be multiplication of first three FFT
 axis lengths, etc.
 These are the default values of C2C and R2R strides if they are not explicitly
 specified.
\end_layout

\begin_layout Standard
One of the main use-cases of strides comes to solve the R2C/C2R Hermitian
 symmetry H stride mismatch - for real space, it is equal to 
\begin_inset Formula $N_{x}$
\end_inset

 real elements and for the frequency space it is equal to 
\begin_inset Formula $floor(\frac{N_{x}}{2})+1$
\end_inset

 complex numbers.
 So, with strides it is possible to use a buffer, padded to 
\begin_inset Formula $2\cdot(floor(\frac{N_{x}}{2})+1)$
\end_inset

 real elements in H stride (all elements between 
\begin_inset Formula $N_{x}$
\end_inset

 and 
\begin_inset Formula $2\cdot(floor(\frac{N_{x}}{2})+1)$
\end_inset

 will not be read so it does not matter what data is there before the write
 stage).
 All other strides are done as a multiplication between the previous stride
 and the number of elements in the previous axis.
 These are the default values of R2C/C2R strides if they are not explicitly
 specified.
\end_layout

\begin_layout Standard
It is possible to specify separate sets of strides for all user-defined
 buffers: bufferStride for the main buffer, inputStride for input buffer,
 outputStride for output buffer (kernel stride is assumed to be the same
 as bufferStride, tempBuffer strides are configured automatically).
 
\end_layout

\begin_layout Standard
For an out-of-place R2C FFT, there is no need to pad buffer with real numbers,
 but the user must specify the H stride there (as it differs from the default one) - 
\begin_inset Formula $N_{x}$
\end_inset

 real elements for real space and 
\begin_inset Formula $floor(\frac{N_{x}}{2})+1$
\end_inset

 complex numbers for the frequency space.
\end_layout

\begin_layout Standard
An out-of-place C2R FFT is a more tricky transform.
 In the multidimensional case, the main buffer will be written to and read
 from multiple times.
 The intermediate stores have a complex layout, which requires more space
 than the output real layout, so in order not to modify the input data,
 there exist two options.
 First, pad the real data layout to 
\begin_inset Formula $2\cdot(floor(\frac{N_{x}}{2})+1)$
\end_inset

 real elements in H stride (complex buffer will be used as inputBuffer,
 real buffer as buffer).
 Second, use the third buffer, so both input and output buffers have their
 original layouts (complex buffer will be used as inputBuffer, the main
 buffer for calculations is buffer and output real buffer as outputBuffer).
 If you use inverseReturnToInputBuffer option, where R2C is configured to
 read from input buffer and C2R is configured to write to the input buffer;
 C2R will modify the buffer it reads from in some cases (see issue 
\begin_inset CommandInset href
LatexCommand href
name "#58"
target "https://github.com/DTolm/VkFFT/issues/58#issuecomment-1007205682"
literal "false"

\end_inset

)
\end_layout

\begin_layout Subsection
VkFFT algorithms
\end_layout

\begin_layout Standard
VkFFT implements a wide range of algorithms to compute different types of
 FFTs but all of them can be reduced to a mixed-radix Cooley-Tukey FFT algorithm
 in the Stockham autosort form.
 The main idea behind it is to decompose the sequence length into a product
 of primes, for each of which FFT can be written down exactly.
 As of now, VkFFT has radix implementations for primes up to 13, so all
 C2C sequences decomposable into a multiplication of such primes will be
 done purely with the Stockham algorithm.
 Below additional algorithms and their use-cases are described.
\end_layout

\begin_layout Subsubsection
Bluestein's algorithm
\end_layout

\begin_layout Standard
A complex algorithm that is used in cases where the sequence is not decomposable
 with implemented radix butterflies (currently - primes up to 13).
 It is derived by replacing 
\begin_inset Formula $nk=\left(n^{2}+k^{2}-(n-k)^{2}\right)/2$
\end_inset

 in 
\begin_inset CommandInset ref
LatexCommand ref
reference "eq:dft"
plural "false"
caps "false"
noprefix "false"

\end_inset

:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{align}
X_{k} & =\left(e^{-\pi i\frac{k^{2}}{N}}\right)\stackrel[n=0]{N-1}{\sum}\left(x_{n}e^{-\pi i\frac{n^{2}}{N}}\right)\left(e^{\pi i\frac{\left(k-n\right)^{2}}{N}}\right)=b_{k}^{\ast}\stackrel[n=0]{N-1}{\sum}a_{n}b_{k-n}\label{eq:dft-1}\\
a_{n} & =x_{n}b_{n}^{\ast}\\
b_{n} & =e^{\pi i\frac{n^{2}}{N}}
\end{align}

\end_inset


\end_layout

\begin_layout Standard
Here FFT is represented as a convolution between two sequences: 
\begin_inset Formula $a_{n}$
\end_inset

 and 
\begin_inset Formula $b_{n}$
\end_inset

, which can be performed by the means of convolution theorem:
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{equation}
F\{a\ast b\}=F\{a\}\cdot F\{b\}
\end{equation}

\end_inset


\end_layout

\begin_layout Standard
By padding 
\begin_inset Formula $a_{n}$
\end_inset

 and 
\begin_inset Formula $b_{n}$
\end_inset

 to a sequence length decomposable with implemented radix butterflies with
 a size of at least 
\begin_inset Formula $2N-1$
\end_inset

 (because the length of 
\begin_inset Formula $b_{n}$
\end_inset

 is 
\begin_inset Formula $2N-1$
\end_inset

), we can perform FFT of any length.
 FFT of 
\begin_inset Formula $b_{n}$
\end_inset

 can be precomputed, so overall this algorithm requires at least 4x the
 computations and more memory transfers.
 This algorithm can be combined with all other algorithms implemented in
 VkFFT.
 If an FFT can not be done in a single upload, a tempBuffer has to be allocated
 (because the logical FFT buffer size is bigger than the original system).
\end_layout

\begin_layout Subsubsection
The Four-Step FFT algorithm
\end_layout

\begin_layout Standard
GPUs and CPUs have a hierarchical memory model - the closer memory to the
 unit that performs the computations, the faster its speed and the lower
 the size.
 So it is advantageous to split FFTs, not to the lowest primes, but to some
 bigger multiplication of those primes, then upload this subsequence to
 the closest cache level to the cores and do the final prime split there.
 The absolute lowest level is the register file, however, it does not allow
 for thread communications outside the warp.
 For this purpose, modern GPUs employ shared memory - a fast memory with
 a bank structure that is visible to all threads in a thread block.
 The usual sizes of it change on a scale from 16KB to 192KB and it is often
 beneficial to use it fully.
 However, if the full sequence can not fit inside the shared memory, FFT
 has to be done in multiple uploads - with the Four Step FFT algorithm.
 The main idea behind it is to represent a big 1D sequence as a 2D (or 3D
 for the three-upload scheme) FFT - we first do FFT along the columns, then
 the rows, then transpose the result and multiply by a special set of phase
 vectors.
 This is a decomposition idea similar to that of the main Cooley-Tukey algorithm.
 However, performing transpositions in-place is a complicated task - especially
 for a non-trivial ratio between dimensions.
 It will also require an additional read/write stage, as it can not be merged
 with the last write of the FFT algorithm.
 The easiest and the most performant solution is to use a tempBuffer (it
 is the main reason for having this functionality, actually) and store intermedi
ate FFT results out-of-place.
 This way the last transposition step can be merged with the write step,
 as we can overwrite the output buffer without losing data.
\end_layout

\begin_layout Standard
To estimate if your sequence size is single upload or not, divide the amount
 of available shared memory (48KB - Nvidia GPUs with Vulkan/OpenCL API,
 64KB - AMD GPUs, 100KB - Nvidia GPUs in CUDA API) by the complex size used
 for calculations (8 bytes - single precision, 16 bytes - double precision).
 For 64KB of shared memory, we get 8192 as max single upload single-precision
 non-strided FFT, 4096 for double precision.
 For strided axes (H and D parts of the layout) these numbers have to be
 divided by 4 and 2 respectively to achieve coalescing, resulting in 2048
 length for single upload in both precisions.
 For more information on coalescing see: coalescing API reference.
\end_layout

\begin_layout Standard
In the case of the Four-Step FFT algorithm, tempBuffer size has to be at
 least the same as the default main buffer size.
 It does not matter how many uploads are in the Four Step FFT algorithm
 - only a single tempBuffer is required.
 In this document, all systems that can fit in the shared memory entirely
 and be done without the Four Step FFT algorithm (and multiple uploads)
 are called single upload systems.
\end_layout

\begin_layout Standard
If the last transposition is not required (the output data is allowed to
 remain in a shuffled form), it can be disabled during the configuration
 phase.
 This way tempBuffer will not be needed and all computations will be done
 in-place (unless Bluestein's algorithm is used).
 An example use-case of this is convolutions - if the kernel is computed
 with the same operation ordering, point-wise multiplication in the frequency
 domain is not dependent on the correct data ordering and the inverse FFT
 will restore the original layout.
\end_layout

\begin_layout Subsubsection
R2C/C2R FFTs
\end_layout

\begin_layout Standard
A typical approach to a single upload R2C/C2R system is to just set the
 imaginary part to zero inside the shared memory and do a simple C2C transform.
 This doesn't affect the amount of memory transferred from VRAM and is not
 a bad approach as FFT is a memory-bound algorithm, however, this can be
 improved in multidimensional (in HDCN part of the layout) case by the compositi
on of a single C2C sequence from two real sequences and some write for R2C/read
 for C2R post-processing.
 Both of these algorithms are implemented in VkFFT.
 Note, that R2C/C2R only affects the non-strided axis (W).
 All strided axes are still done as C2C.
\end_layout

\begin_layout Subsubsection
R2C/C2R multi-upload FFT algorithm
\end_layout

\begin_layout Standard
For even sequences there exists an easy mapping between R2C/C2R FFTs and
 the C2C of half the size.
 In this case, all even indices (starting from 0) are read as the real values
 of a complex number and all odd indices are read as the imaginary values.
 This C2C sequence can be done with the help of the Four-Step FFT algorithm.
 When FFT is done, separate post-processing for R2C/pre-processing for C2R
 is applied.
\end_layout

\begin_layout Subsubsection
R2R Discrete Cosine Transforms
\end_layout

\begin_layout Standard
There exist many different mappings between DCT and FFT.
 As of now, VkFFT has the following algorithms implemented (all single-upload
 for now):
\end_layout

\begin_layout Itemize
DCT-I - mapping between R2R and C2C of the 
\begin_inset Formula $2N-2$
\end_inset

 length.
 For non-strided axis can use an optimization similar to the R2C/C2R multidimens
ional case (setting the imaginary part to the next FFT sequence).
\end_layout

\begin_layout Itemize
DCT-II/DCT-III - mapping between R2R and C2C of the same length.
 For non-strided axis can use an optimization similar to the R2C/C2R multidimens
ional case (setting the imaginary part to the next FFT sequence).
\end_layout

\begin_layout Itemize
DCT-IV - for even sizes, mapping between R2R and C2C sequence of half-length.
 For odd sizes mapping to the FFT of the same length (for non-strided axis
 can use an optimization similar to the R2C/C2R multidimensional case (setting
 the imaginary part to the next FFT sequence)).
\end_layout

\begin_layout Subsubsection
Register overutilization
\end_layout

\begin_layout Standard
Not an FFT algorithm by itself, but an optimization to do bigger sequences
 in a single upload instead of switching to the Four Step FFT algorithm.
 The main idea behind it is to use a register file (which is often bigger
 than the amount of shared memory) to store the sequence and use shared
 memory only as a communication buffer.
 This is useful in Vulkan and OpenCL APIs on Nvidia GPU, as they are only
 allowed to allocate 48KB of shared memory with a register file having the
 size of 256KB.
\end_layout

\begin_layout Subsubsection
Zero padding
\end_layout

\begin_layout Standard
Not an FFT algorithm by itself, but a memory management optimization.
 If the user's system has parts that are known to be zero - for example,
 when an open system is modeled and, to avoid the circular part of the FFT,
 the system has to be padded with zeros up to 2x in each direction - VkFFT
 can omit sequences full of zeros and skip the corresponding memory transfers
 and computations, as the output result will be zero.
 This way it is possible to get up to two times speed increase in the 2D
 case and up to 3x increase in the 3D case.
 
\end_layout

\begin_layout Subsubsection
Convolution and cross-correlation support
\end_layout

\begin_layout Standard
With the help of the Convolution theorem, which states that the Fourier
 transform of a convolution is the pointwise product of the signals' Fourier
 transforms, it is possible to perform convolution with 
\begin_inset Formula $N\log N$
\end_inset

 complexity, compared to 
\begin_inset Formula $N^{2}$
\end_inset

 complexity of the simple multiplication approach.
 This is extremely useful for kernels spanning more than 50 elements in
 size.
 VkFFT can merge the last step FFT, kernel multiplication in the Fourier
 domain and the first step of inverse FFT to provide substantial memory
 transfer savings.
 Moreover, FFTs of big sequences can be performed without data reordering,
 which results in a better locality.
 
\end_layout

\begin_layout Subsection
VkFFT accuracy
\end_layout

\begin_layout Standard
To measure how VkFFT (single/double/half precision) results compare to cuFFT/roc
FFT (single/double/half precision) and FFTW (double precision), multiple
 sets of systems covering full supported C2C/R2C+C2R/R2R FFT range are filled
 with random complex data on the scale of [-1,1] and one transform was performed
 on each system.
 Samples 11(single), 12(double), 13(half), 14(non-power of 2 C2C, single),
 15(R2C+C2R, single), 16(DCT-I/II/III/IV, single), 17(DCT-I/II/III/IV, double),
 18(non-power of 2 C2C, double) are available in VkFFT Benchmark Suite to
 perform VkFFT verification on any of the target platforms.
 Overall, the Cooley-Tukey algorithm (Stockham autosort) exhibits logarithmic
 relative error scaling, similar to that of other GPU FFT libraries.
 Typically, the more computationally expensive an algorithm is, the worse
 its precision is.
 So, Bluestein's algorithm has lower accuracy than Stockham autosort algorithm.
\end_layout

\begin_layout Standard
Single precision in VkFFT supports two modes of calculation - by using the
 on-chip Special Function Units (SFU) that can compute sines and cosines on the
 go or by using look-up tables (LUT) precomputed on the CPU.
 For Nvidia and AMD GPUs, SFUs provide great precision, while Intel iGPUs
 and mobile GPUs must use LUT to perform FFTs correctly.
\end_layout

\begin_layout Standard
Double precision in VkFFT also supports two modes of calculation - by using
 polynomial sincos approximation and computing them on-chip or by using
 precomputed LUT as well.
 The second option is the better one, as polynomial sincos approximation
 is too compute-heavy for modern GPUs.
 It is selected by default on all devices.
\end_layout

\begin_layout Standard
Half precision is currently only supported in the Vulkan backend and is
 often experiencing precision problems with the first number of the resulting
 FFT sequence, which is the sum of all input numbers.
 Half precision is implemented only as a memory trick - all on-chip computations
 are done in single precision, but this doesn't help with the first number
 problem.
 Half precision can use SFU or LUT as well.
\end_layout

\begin_layout Standard
VkFFT also supports mixed-precision operations, where memory storing is
 done at lower precision, compared to the on-chip calculations.
 For example, it is possible to read data in single precision, do calculations
 in double and store data back in single precision.
 
\end_layout

\begin_layout Subsection
VkFFT additional memory allocations
\end_layout

\begin_layout Standard
In this section, all GPU memory allocations that are done by VkFFT are described.
 There are up to three situations when VkFFT allocates memory.
 All of the VkFFT allocated memory is freed at the deleteVkFFT call.
\end_layout

\begin_layout Subsubsection
LUT allocations
\end_layout

\begin_layout Standard
This memory is used to store precomputed twiddle factors and phase vectors
 used during the computation.
 This buffer can have:
\end_layout

\begin_layout Itemize
twiddle factors for each radix stage of Stockham FFT calculation
\end_layout

\begin_layout Itemize
phase vectors used in the Four Step FFT algorithm between stages
\end_layout

\begin_layout Itemize
phase vectors used in DCT-II/III/IV to perform a mapping between R2R and
 C2C
\end_layout

\begin_layout Itemize
phase vectors used in post-processing for R2C/pre-processing for C2R for
 even length sequences as C2C of half size
\end_layout

\begin_layout Standard
VkFFT manages LUT allocations by itself and they are performed during the
 initializeVkFFT call.
 LUT are allocated per axis, though some of them can be reused if the axes
 have the same LUT.
 Inverse and forward FFT plans share the same LUT (conjugation is performed
 on-chip).
\end_layout

\begin_layout Subsubsection
The Four-Step FFT algorithm - tempBuffer allocation
\end_layout

\begin_layout Standard
To perform the merging of the transposition with the last upload of an axis,
 VkFFT requires additional memory to mimic an out-of-place execution.
 This memory is located in tempBuffer and has to be of at least the same
 size as the main buffer.
 It is possible for the users to allocate it themselves, though if this
 is not done, VkFFT can do the allocation automatically (the size of the
 tempBuffer will be the same as the main buffer, unless the logical dimensions
 of FFT are bigger than user-defined - then, it will allocate the system
 with the minimal size, that can cover maximal logical system size used
 in any of the axes - see next subsection).
\end_layout

\begin_layout Subsubsection
Bluestein's buffers allocation
\end_layout

\begin_layout Standard
To do Bluestein's FFT algorithm, precomputed sequences 
\begin_inset Formula $b_{n}=e^{\pi i\frac{n^{2}}{N}}$
\end_inset

, 
\begin_inset Formula $FFT(b_{n})$
\end_inset

 and 
\begin_inset Formula $iFFT(b_{n})$
\end_inset

 are required.
 For each axis, they can be different and are computed separately (unless
 VkFFT can determine that they match, then the buffers are allocated only
 once).
 Notably, as Bluestein's algorithm pads the sequence length to at least
 
\begin_inset Formula $2N-1$
\end_inset

, if it can not be done in a single upload and the Four Step algorithm has
 to be used, the intermediate storage required will be bigger than the main
 buffer size.
 In this case, tempBuffer must always be allocated.
 As the padded sequence can be different for each of the dimensions, the
 required size of the tempBuffer will also vary.
 VkFFT determines the biggest size needed among axes and allocates a tempBuffer
 of this size.
\end_layout

\begin_layout Standard
\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Section
VkFFT API Reference
\end_layout

\begin_layout Standard
This section covers error codes, API functions that can be used by the user
 and configuration parameters.
\end_layout

\begin_layout Subsection
Return value VkFFTResult
\end_layout

\begin_layout Standard
All VkFFT Library return values except for VKFFT_SUCCESS are used in case
 of a failure and provide information on what has gone wrong.
 VkFFTResult is unified among different backends, though some of its values
 may not be used in specific backends.
 Possible return values of VkFFTResult are defined as follows:
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

typedef enum VkFFTResult { 	
\end_layout

\begin_layout Plain Layout

VKFFT_SUCCESS = 0,	// The VkFFT operation was successful
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_MALLOC_FAILED = 1,	// Some malloc call inside VkFFT has failed.
 Report this to the GitHub repo
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER = 2,	// Generated kernel is bigger
 than default kernel array.
 Increase it with maxCodeLength parameter of configuration.
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER = 3,	// Temporary string used in kernel
 generation is bigger than default temporary string array.
 Increase it with maxTempLength parameter of configuration.
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_PLAN_NOT_INITIALIZED = 4,	// Code attempts to use uninitialized
 plan (it is zero inside VkFFTApplication)		
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_NULL_TEMP_PASSED = 5,	// Internal kernel generation error
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001,	// No physical device is provided
 (Vulkan API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_DEVICE = 1002,	// No device is provided (All APIs)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_QUEUE = 1003,	// No queue is provided (Vulkan API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_COMMAND_POOL = 1004,	// No command pool is provided
 (Vulkan API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_FENCE = 1005,	// No fence is provided (Vulkan API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED = 1006,	// VkFFT tries to access
 inverse FFT plan, when application is created with makeForwardPlanOnly flag
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED = 1007,	// VkFFT tries to access
 forward FFT plan, when application is created with makeInversePlanOnly flag
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_CONTEXT = 1008,	// No context is provided (OpenCL API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_INVALID_PLATFORM = 1009,	// No platform is provided (OpenCL
 API)
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_FILE = 1011,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_FFTdim = 2001,	// Number of dimensions is not provided
 in the configuration
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_size = 2002,	// Array of dimensions is not provided in
 the configuration
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_bufferSize = 2003,	// Buffer size has to be provided during
 the application creation
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_buffer = 2004,	// Buffer has to be specified either at
 the application creation stage or during launch through VkFFTLaunchParams
 struct
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_tempBufferSize = 2005,	// Same error as VKFFT_ERROR_EMPTY_buff
erSize if userTempBuffer is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_tempBuffer = 2006,	// Same error as VKFFT_ERROR_EMPTY_buffer
 if userTempBuffer is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_inputBufferSize = 2007,	// Same error as VKFFT_ERROR_EMPTY_buf
ferSize if isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_inputBuffer = 2008,	// Same error as VKFFT_ERROR_EMPTY_buffer
 if isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_outputBufferSize = 2009,	// Same error as VKFFT_ERROR_EMPTY_bu
fferSize if isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_outputBuffer = 2010,	// Same error as VKFFT_ERROR_EMPTY_buffer
 if isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_kernelSize = 2011,	// Same error as VKFFT_ERROR_EMPTY_bufferSi
ze if performConvolution is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPTY_kernel = 2012,	// Same error as VKFFT_ERROR_EMPTY_buffer
 if performConvolution is enabled
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays = 2014,	// pointers
 to primeSizes or paddedSizes arrays are zero when useCustomBluesteinPaddingPatt
ern is enabled	
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_RADIX = 3001,	// VkFFT has encountered unsupported
 radix (more than 13) during decomposition and Bluestein's FFT fallback
 did not work
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002,	// VkFFT can not do this sequence
 length currently - it requires more than three-upload Four step FFT
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003,	// VkFFT can not do this
 sequence length currently - odd multi-upload R2C/C2R FFTs
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004,	// VkFFT can not do this
 sequence length currently - multi-upload R2R transforms, odd DCT-IV transforms
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005,	// VkFFT can not omit sequences
 in convolution calculations and R2C/C2R case
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001,	// VkFFT failed to allocate GPU memory
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002,	// 4002-4055 are handlers for errors
 of used backend APIs.
 They may indicate a driver failure.
 If they are thrown - report to the GitHub repo
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS = 4003,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER = 4004,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER = 4005,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE = 4006,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES = 4007,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_RESET_FENCES = 4008,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL = 4009,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT = 4010,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS = 4011,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT = 4012,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_SHADER_PREPROCESS = 4013,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_SHADER_PARSE = 4014,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_SHADER_LINK = 4015,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_SPIRV_GENERATE = 4016,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE = 4017,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE = 4018,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER = 4019,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE = 4020,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_DEVICE = 4021,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_FENCE = 4022,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL = 4023,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_BUFFER = 4024,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY = 4025,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY = 4026,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_FIND_MEMORY = 4027,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SYNCHRONIZE = 4028,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_COPY = 4029,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM = 4030,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM = 4031, 
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE = 4032,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_GET_CODE = 4033,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM = 4034,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_LOAD_MODULE = 4035,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_GET_FUNCTION = 4036,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY = 4037,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL = 4038,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL = 4039,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_EVENT_RECORD = 4040,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION = 4041,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_INITIALIZE = 4042,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID = 4043,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_GET_DEVICE = 4044,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT = 4045,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE = 4046,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG = 4047,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE = 4048,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE = 4049,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES = 4050,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE = 4051,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_EVENT = 4052,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST = 4053,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST = 4054,
\end_layout

\begin_layout Plain Layout

VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER = 4055
\end_layout

\begin_layout Plain Layout

} VkFFTResult;
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
VkFFT application management functions
\end_layout

\begin_layout Standard
VkFFT has a unified plan management model - all different transform types/
 dimensionalities/ precision use the same calls with configuration done
 through VkFFTConfiguration struct.
 This section shows how to initialize/use/free VkFFT with this unified model,
 while the next one will go into how to configure VkFFTConfiguration correctly.
 All of the functions operate on VkFFTApplication and VkFFTConfiguration
 assuming they have been zero-initialized before usage, so do not forget
 to do this when initializing:
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsubsection
Function initializeVkFFT()
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunc
hConfiguration)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Standard
Creates an FFT application (collection of forward and inverse plans).
 As forward and inverse FFTs may have different memory layouts, can have
 different normalizations - they are done as separate internal plans inside
 VkFFTApplication.
 This call assumes the application to be zero-initialized, so it can only be
 done once on a particular application, until it is deleted.
\end_layout

\begin_layout Standard
If the initializeVkFFT call fails, it frees all CPU/GPU resources allocated by VkFFT
 resources and sets the application to zero.
 VkFFTResult is returned with an error code corresponding to what went wrong.
\end_layout

\begin_layout Standard
In case of success, VkFFTApplication will contain initialized plans with
 compiled kernels ready for execution with VKFFT_SUCCESS returned.
\end_layout

\begin_layout Subsubsection
Function VkFFTAppend()
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams*
 launchParams)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Standard
Performs FFT in the int inverse direction (-1 for forward FFT, 1 for inverse
 FFT).
 FFT plans are selected from the VkFFTApplication collection automatically.
 VkFFTApplication must be initialized with initializeVkFFT call before.
 VkFFTLaunchParams struct allows for pre-launch configuration of some parameters
, namely:
\end_layout

\begin_layout Itemize
buffer - similar to how FFTW/cuFFT expects input/output data pointers in
 *execC2C (and other) function calls, VkFFT allows specifying memory used
 for computations at launch.
 It must have the same size/layout/strides as defined during the application
 creation.
\end_layout

\begin_layout Itemize
inputBuffer/outputBuffer/tempBuffer/kernel - other buffers can also be specified
 at launch.
 In addition to them having the same size/layout/strides as defined during
 the application creation, the application must be created with flags enabling
 the corresponding buffer usage: isInputFormatted/isOutputFormatted/userTempBuff
er/performConvolution respectively.
\end_layout

\begin_layout Itemize
bufferOffset/tempBufferOffset/inputBufferOffset/outputBufferOffset/kernelOffset
 - specify if VkFFT has to offset the first element position inside the
 corresponding buffer.
 In bytes.
 Default 0.
 specifyOffsetsAtLaunch parameter must be enabled during the initializeVkFFT
 call before.
 
\end_layout

\begin_layout Standard
Depending on the API, the execution model may vary and require additional
 information at launch:
\end_layout

\begin_layout Itemize
Vulkan API: VkFFT appends a sequence of vkCmdDispatch calls to the user-defined
 VkCommandBuffer (with respective push constants/descriptor sets/pipelines/memor
y barriers bindings).
 VkCommandBuffer must be provided as a pointer in VkFFTLaunchParams.
 VkCommandBuffer must be in the writing stage, started with vkBeginCommandBuffer
 call.
 After VkFFTAppend has finished, provided VkCommandBuffer will contain a
 sequence of operations performing FFT.
 The first call of the sequence has no input memory barrier, the last call
 has one, ensuring FFT has finished execution.
\end_layout

\begin_layout Itemize
CUDA/HIP API: if the user wants to use streams, they have to be provided
 during the application configuration stage.
 VkFFTAppend performs a series of cuLaunchKernel, which are sequential if
 appended to one stream and synchronized if appended to multiple streams.
\end_layout

\begin_layout Itemize
OpenCL API: similar to Vulkan, VkFFT appends a sequence of clEnqueueNDRangeKerne
l calls to user-defined cl_command_queue.
 Currently, they are all assumed to be sequential.
 cl_command_queue must be provided as a pointer in VkFFTLaunchParams.
 
\end_layout

\begin_layout Itemize
Level Zero API: similar to Vulkan, VkFFT appends a sequence of zeCommandListAppe
ndLaunchKernel calls to user-defined command list ze_command_list_handle_t.
 They have execution barriers between.
 ze_command_list_handle_t must be provided as a pointer in VkFFTLaunchParams.
 
\end_layout

\begin_layout Standard
If VkFFT fails during the VkFFTAppend call, it will not free the application
 and the resources allocated in it - use a separate deleteVkFFT call for that.
\end_layout

\begin_layout Subsubsection
Function deleteVkFFT()
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

void deleteVkFFT(VkFFTApplication* app)
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Standard
Performs deallocation of resources used in the provided application.
 Returns application to the zero-initialized state.
\end_layout

\begin_layout Subsubsection
Function VkFFTGetVersion()
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

int VkFFTGetVersion()
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Standard
Returns the version of the VkFFT library in the X.XX.XX format (without dots).
\end_layout

\begin_layout Subsection
VkFFT configuration
\end_layout

\begin_layout Standard
This section will cover all the parameters that can be specified in the
 VkFFTConfiguration struct.
 It will start with a short description of the struct (intended to be used
 as a cheat sheet), then go for each field in detail.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

typedef struct {
\end_layout

\begin_layout Plain Layout

// Required parameters: 	
\end_layout

\begin_layout Plain Layout

uint64_t FFTdim;	// FFT dimensionality (1, 2 or 3)
\end_layout

\begin_layout Plain Layout

uint64_t size[3];	// WHD - system dimensions
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API
\end_layout

\begin_layout Plain Layout

VkPhysicalDevice* physicalDevice;	// Pointer to Vulkan physical device,
 obtained from vkEnumeratePhysicalDevices
\end_layout

\begin_layout Plain Layout

VkDevice* device;	// Pointer to Vulkan device, created with vkCreateDevice
\end_layout

\begin_layout Plain Layout

VkQueue* queue;	// Pointer to Vulkan queue, created with vkGetDeviceQueue
\end_layout

\begin_layout Plain Layout

VkCommandPool* commandPool;	// Pointer to Vulkan command pool, created with
 vkCreateCommandPool
\end_layout

\begin_layout Plain Layout

VkFence* fence;	// Pointer to Vulkan fence, created with vkCreateFence
\end_layout

\begin_layout Plain Layout

uint64_t isCompilerInitialized;	// Specify if glslang compiler has been
 initialized before (0 - off, 1 - on).
 Default 0 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==1) //CUDA API
\end_layout

\begin_layout Plain Layout

CUdevice* device;	// Pointer to CUDA device, obtained from cuDeviceGet 	
\end_layout

\begin_layout Plain Layout

cudaStream_t* stream;	// Pointer to streams (can be more than 1), where
 to execute the kernels.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t num_streams;	// Try to submit CUDA kernels in multiple streams
 for asynchronous execution.
 Default 1 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==2) //HIP API
\end_layout

\begin_layout Plain Layout

hipDevice_t* device;	// Pointer to HIP device, obtained from hipDeviceGet
\end_layout

\begin_layout Plain Layout

hipStream_t* stream;	// Pointer to streams (can be more than 1), where to
 execute the kernels.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t num_streams;	// Try to submit HIP kernels in multiple streams for
 asynchronous execution.
 Default 1 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

cl_platform_id* platform;	// NOT REQUIRED
\end_layout

\begin_layout Plain Layout

cl_device_id* device;	// Pointer to OpenCL device, obtained from clGetDeviceIDs
\end_layout

\begin_layout Plain Layout

cl_context* context;	// Pointer to OpenCL context, obtained from clCreateContext
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4) //Level Zero API
\end_layout

\begin_layout Plain Layout

ze_device_handle_t* device;	// Pointer to Level Zero device, obtained from
 zeDeviceGet
\end_layout

\begin_layout Plain Layout

ze_context_handle_t* context;	// Pointer to Level Zero context, obtained
 from zeContextCreate
\end_layout

\begin_layout Plain Layout

ze_command_queue_handle_t* commandQueue;	// Pointer to Level Zero command
 queue with compute and copy capabilities, obtained from zeCommandQueueCreate
\end_layout

\begin_layout Plain Layout

uint32_t commandQueueID;	// ID of the commandQueue with compute and copy
 capabilities
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Data parameters (buffers can be specified at launch):
\end_layout

\begin_layout Plain Layout

uint64_t userTempBuffer;	// Buffer allocated by app automatically if needed
 to reorder Four step algorithm.
 Setting to non zero value enables manual user allocation (0 - off, 1 -
 on)
\end_layout

\begin_layout Plain Layout

uint64_t bufferNum;	// Multiple buffer sequence storage is Vulkan only.
 Default 1
\end_layout

\begin_layout Plain Layout

uint64_t tempBufferNum;	// Multiple buffer sequence storage is Vulkan only.
 Default 1, buffer allocated by app automatically if needed to reorder Four
 step algorithm.
 Setting to non zero value enables manual user allocation 	
\end_layout

\begin_layout Plain Layout

uint64_t inputBufferNum;	// Multiple buffer sequence storage is Vulkan only.
 Default 1, if isInputFormatted is enabled 
\end_layout

\begin_layout Plain Layout

uint64_t outputBufferNum;	// Multiple buffer sequence storage is Vulkan
 only.
 Default 1, if isOutputFormatted is enabled 
\end_layout

\begin_layout Plain Layout

uint64_t kernelNum;	// Multiple buffer sequence storage is Vulkan only.
 Default 1, if performConvolution is enabled
\end_layout

\begin_layout Plain Layout

uint64_t* bufferSize;	// Array of buffers sizes in bytes
\end_layout

\begin_layout Plain Layout

uint64_t* tempBufferSize;	// Array of temp buffers sizes in bytes.
 Default set to bufferSize sum, buffer allocated by app automatically if
 needed to reorder Four step algorithm.
 Setting to non zero value enables manual user allocation 
\end_layout

\begin_layout Plain Layout

uint64_t* inputBufferSize;	// Array of input buffers sizes in bytes, if
 isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

uint64_t* outputBufferSize;	// Array of output buffers sizes in bytes, if
 isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

uint64_t* kernelSize;	// Array of kernel buffers sizes in bytes, if performConvo
lution is enabled
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API
\end_layout

\begin_layout Plain Layout

VkBuffer* buffer;	// Pointer to array of buffers (or one buffer) used for
 computations
\end_layout

\begin_layout Plain Layout

VkBuffer* tempBuffer;	// Needed if reorderFourStep is enabled to transpose
 the array.
 Same sum size or bigger as buffer (can be split in multiple).
 Default 0.
 Setting to non zero value enables manual user allocation
\end_layout

\begin_layout Plain Layout

VkBuffer* inputBuffer;	// Pointer to array of input buffers (or one buffer)
 used to read data from if isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VkBuffer* outputBuffer;	// Pointer to array of output buffers (or one buffer)
 used to write data to if isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

VkBuffer* kernel;	// Pointer to array of kernel buffers (or one buffer)
 used to read kernel data from if performConvolution is enabled
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==1) //CUDA API
\end_layout

\begin_layout Plain Layout

void** buffer;	// Pointer to device buffer used for computations
\end_layout

\begin_layout Plain Layout

void** tempBuffer;	// Needed if reorderFourStep is enabled to transpose
 the array.
 Same size as buffer.
 Default 0.
 Setting to non zero value enables manual user allocation 
\end_layout

\begin_layout Plain Layout

void** inputBuffer;	// Pointer to device buffer used to read data from if
 isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

void** outputBuffer;	// Pointer to device buffer used to write data to if
 isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

void** kernel;	// Pointer to device buffer used to read kernel data from
 if performConvolution is enabled 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==2) //HIP API
\end_layout

\begin_layout Plain Layout

void** buffer;	// Pointer to device buffer used for computations
\end_layout

\begin_layout Plain Layout

void** tempBuffer;	// Needed if reorderFourStep is enabled to transpose
 the array.
 Same size as buffer.
 Default 0.
 Setting to non zero value enables manual user allocation
\end_layout

\begin_layout Plain Layout

void** inputBuffer;	// Pointer to device buffer used to read data from if
 isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

void** outputBuffer;	// Pointer to device buffer used to write data to if
 isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

void** kernel;	// Pointer to device buffer used to read kernel data from
 if performConvolution is enabled 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

cl_mem* buffer;	// Pointer to device buffer used for computations
\end_layout

\begin_layout Plain Layout

cl_mem* tempBuffer;	// Needed if reorderFourStep is enabled to transpose
 the array.
 Same size as buffer.
 Default 0.
 Setting to non zero value enables manual user allocation
\end_layout

\begin_layout Plain Layout

cl_mem* inputBuffer;	// Pointer to device buffer used to read data from
 if isInputFormatted is enabled
\end_layout

\begin_layout Plain Layout

cl_mem* outputBuffer;	// Pointer to device buffer used to write data to
 if isOutputFormatted is enabled
\end_layout

\begin_layout Plain Layout

cl_mem* kernel;	// Pointer to device buffer used to read kernel data from
 if performConvolution is enabled
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

uint64_t bufferOffset;	// Specify if VkFFT has to offset the first element
 position inside the buffer.
 In bytes.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t tempBufferOffset;	// Specify if VkFFT has to offset the first element
 position inside the temp buffer.
 In bytes.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t inputBufferOffset;	// Specify if VkFFT has to offset the first
 element position inside the input buffer.
 In bytes.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t outputBufferOffset;	// Specify if VkFFT has to offset the first
 element position inside the output buffer.
 In bytes.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t kernelOffset;	// Specify if VkFFT has to offset the first element
 position inside the kernel.
 In bytes.
 Default 0
\end_layout

\begin_layout Plain Layout

uint64_t specifyOffsetsAtLaunch;	// Specify if offsets will be selected
 with launch parameters VkFFTLaunchParams (0 - off, 1 - on).
 Default 0
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Optional: (default 0 if not stated otherwise)
\end_layout

\begin_layout Plain Layout

uint64_t coalescedMemory;	// In bytes, for Nvidia and AMD is equal to 32,
 for Intel is equal to 64, scaled for half precision.
 Going to work regardless, but if specified by user correctly, the performance
 will be higher.
\end_layout

\begin_layout Plain Layout

uint64_t aimThreads;	// Aim at this many threads per block.
 Default 128
\end_layout

\begin_layout Plain Layout

uint64_t numSharedBanks;	// How many banks shared memory has.
 Default 32
\end_layout

\begin_layout Plain Layout

uint64_t inverseReturnToInputBuffer;	// return data to the input buffer
 in inverse transform (0 - off, 1 - on).
 isInputFormatted must be enabled
\end_layout

\begin_layout Plain Layout

uint64_t numberBatches;	// N - used to perform multiple batches of initial
 data.
 Default 1
\end_layout

\begin_layout Plain Layout

uint64_t useUint64;	// Use 64-bit addressing mode in generated kernels
\end_layout

\begin_layout Plain Layout

uint64_t omitDimension[3];	// Disable FFT for this dimension (0 - FFT enabled,
 1 - FFT disabled).
 Default 0.
 Doesn't work for R2C for now.
 Doesn't work with convolutions.
\end_layout

\begin_layout Plain Layout

uint64_t performBandwidthBoost; // Try to reduce coalesced number by a
 factor of X to get bigger sequence in one upload for strided axes.
 Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise
  
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t doublePrecision;	// Perform calculations in double precision (0
 - off, 1 - on).
\end_layout

\begin_layout Plain Layout

uint64_t halfPrecision;	// Perform calculations in half precision (0 - off,
 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t halfPrecisionMemoryOnly;	// Use half precision only as input/output
 buffer.
 Input/Output have to be allocated as half, buffer/tempBuffer have to be
 allocated as float (out-of-place mode only).
 Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t doublePrecisionFloatMemory;	// Use FP64 precision for all calculations,
 while all memory storage is done in FP32.
\end_layout

\begin_layout Plain Layout

uint64_t performR2C;	// Perform R2C/C2R decomposition (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t performDCT;	// Perform DCT transformation (X - DCT type, 1-4)
\end_layout

\begin_layout Plain Layout

uint64_t disableMergeSequencesR2C;	// Disable merging of two real sequences
 to reduce calculations (0 - off, 1 - on) 
\end_layout

\begin_layout Plain Layout

uint64_t normalize;	// Normalize inverse transform (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t disableReorderFourStep;	// Disables unshuffling of Four step algorithm.
 Requires tempbuffer allocation (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t useLUT;	// Switches from calculating sincos to using precomputed
 LUT tables (0 - off, 1 - on).
 Configured by initialization routine
\end_layout

\begin_layout Plain Layout

uint64_t makeForwardPlanOnly;	// Generate code only for forward FFT (0 -
 off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t makeInversePlanOnly;	// Generate code only for inverse FFT (0 -
 off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t bufferStride[3];	// Buffer strides - default set to x - x*y - x*y*z
 values
\end_layout

\begin_layout Plain Layout

uint64_t isInputFormatted;	// Specify if input buffer is padded - 0 - padded,
 1 - not padded.
 For example if it is not padded for R2C if out-of-place mode is selected
 (only if numberBatches==1 and numberKernels==1)
\end_layout

\begin_layout Plain Layout

uint64_t isOutputFormatted;	// Specify if output buffer is padded - 0 -
 padded, 1 - not padded.
 For example if it is not padded for R2C if out-of-place mode is selected
 (only if numberBatches==1 and numberKernels==1)
\end_layout

\begin_layout Plain Layout

uint64_t inputBufferStride[3];	// Input buffer strides.
 Used if isInputFormatted is enabled.
 Default set to bufferStride values
\end_layout

\begin_layout Plain Layout

uint64_t outputBufferStride[3];	// Output buffer strides.
 Used if isOutputFormatted is enabled.
 Default set to bufferStride values
\end_layout

\begin_layout Plain Layout

uint64_t considerAllAxesStrided;	// Will create plan for non-strided axis
 similar as a strided axis - used with disableReorderFourStep to get the
 same layout for Bluestein kernel (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t keepShaderCode;	// Will keep shader code and print all executed
 shaders during the plan execution in order (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t printMemoryLayout;	// Will print order of buffers used in shaders
 (0 - off, 1 - on) 
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t saveApplicationToString;	// Will save all compiled binaries to
 VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated
 with deleteVkFFT call).
 VkFFTApplication.applicationStringSize will contain size of binary in bytes.
 (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t loadApplicationFromString;	// Will load all binaries from loadApplicati
onString instead of recompiling them (must be allocated by user, must contain
 what saveApplicationToString call generated previously in VkFFTApplication.saveA
pplicationString).
 (0 - off, 1 - on).
 Mutually exclusive with saveApplicationToString
\end_layout

\begin_layout Plain Layout

void* loadApplicationString;	// Memory array (uint32_t* for Vulkan/HIP,
 char* for CUDA/OpenCL) through which user can load VkFFT binaries, must
 be provided by user if loadApplicationFromString = 1.
 
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//optional Bluestein optimizations: (default 0 if not stated otherwise)
\end_layout

\begin_layout Plain Layout

uint64_t fixMaxRadixBluestein;	// controls the padding of sequences in Bluestein
 convolution.
 If specified, padded sequence will be made of up to fixMaxRadixBluestein
 primes.
 Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension
 FFT system, 7 for Vulkan/OpenCL/HIP past that.
 Min = 2, Max = 13.
\end_layout

\begin_layout Plain Layout

uint64_t forceBluesteinSequenceSize;	// force the sequence size to pad to
 in Bluestein's algorithm.
 Must be at least 2*N-1 and decomposable with primes 2-13.
\end_layout

\begin_layout Plain Layout

uint64_t useCustomBluesteinPaddingPattern;	// force the sequence sizes to
 pad to in Bluestein's algorithm, but on a range.
 This number specifies the number of elements in primeSizes and in paddedSizes
 arrays.
 primeSizes - array of non-decomposable as radix scheme sizes - 17, 23,
 31 etc.
 paddedSizes - array of lengths to pad to.
 paddedSizes[i] will be the padding size for all non-decomposable sequences
 from primeSizes[i] to primeSizes[i+1] (will use default scheme after last
 one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme
 (for example).
 Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern.
\end_layout

\begin_layout Plain Layout

uint64_t* primeSizes;	// described in useCustomBluesteinPaddingPattern
\end_layout

\begin_layout Plain Layout

uint64_t* paddedSizes;	// described in useCustomBluesteinPaddingPattern
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Optional zero padding control parameters: (default 0 if not stated otherwise)
\end_layout

\begin_layout Plain Layout

uint64_t performZeropadding[3];	// Don't read some data/perform computations
 if some input sequences are zeropadded for each axis (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t fft_zeropad_left[3];	// Specify start boundary of zero block in
 the system for each axis
\end_layout

\begin_layout Plain Layout

uint64_t fft_zeropad_right[3];	// Specify end boundary of zero block in
 the system for each axis
\end_layout

\begin_layout Plain Layout

uint64_t frequencyZeroPadding;	// Set to 1 if zeropadding of frequency domain,
 default 0 - spatial zeropadding
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Optional convolution control parameters: (default 0 if not stated otherwise)
\end_layout

\begin_layout Plain Layout

uint64_t performConvolution;	// Perform convolution in this application
 (0 - off, 1 - on).
 Disables reorderFourStep parameter
\end_layout

\begin_layout Plain Layout

uint64_t coordinateFeatures;	// C - coordinate, or dimension of features
 vector.
 In matrix convolution - size of a vector
\end_layout

\begin_layout Plain Layout

uint64_t matrixConvolution;	// If equal to 2 perform 2x2, if equal to 3
 perform 3x3 matrix-vector convolution.
 Overrides coordinateFeatures
\end_layout

\begin_layout Plain Layout

uint64_t symmetricKernel;	// Specify if kernel in 2x2 or 3x3 matrix convolution
 is symmetric
\end_layout

\begin_layout Plain Layout

uint64_t numberKernels;	// N - only used in convolution step - specify how
 many kernels were initialized before.
 Expands one input to multiple (batched) output
\end_layout

\begin_layout Plain Layout

uint64_t kernelConvolution;	// Specify if this application is used to create
 kernel for convolution, so it has the same properties.
 performConvolution has to be set to 0 for kernel creation
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Register overutilization (experimental): (default 0 if not stated otherwise)
\end_layout

\begin_layout Plain Layout

uint64_t registerBoost;	// Specify if register file size is bigger than
 shared memory and can be used to extend it X times (on Nvidia 256KB register
 file can be used instead of 32KB of shared memory, set this constant to
 4 to emulate 128KB of shared memory).
 Defaults: Nvidia - 4 in Vulkan/OpenCL, 1 in CUDA backend; AMD - 2 if shared
 memory >= 64KB, else 4 in Vulkan/OpenCL backend, 1 in HIP backend; Intel
 - 1 if shared memory >= 64KB, else 2 in Vulkan/OpenCL/Level Zero backends;
 Default 1
\end_layout

\begin_layout Plain Layout

uint64_t registerBoostNonPow2;	// Specify if register overutilization should
 be used on non power of 2 sequences (0 - off, 1 - on)
\end_layout

\begin_layout Plain Layout

uint64_t registerBoost4Step;	// Specify if register file overutilization
 should be used in big sequences (>2^14), same definition as registerBoost.
 Default 1
\end_layout

\begin_layout Plain Layout

//not used techniques:
\end_layout

\begin_layout Plain Layout

uint64_t swapTo3Stage4Step;	// Specify at which power of 2 to switch from
 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower
 than coalesced sequence helps to combat TLB misses.
 Default 0 - disabled.
 Must be at least 17
\end_layout

\begin_layout Plain Layout

uint64_t devicePageSize;	// In KB, the size of a page on the GPU.
 Setting to 0 disables local buffer split in pages
\end_layout

\begin_layout Plain Layout

uint64_t localPageSize;	// In KB, the size to split page into if sequence
 spans multiple devicePageSize pages
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

// Automatically filled based on device info (still can be reconfigured
 by user):
\end_layout

\begin_layout Plain Layout

uint64_t maxComputeWorkGroupCount[3];	// maxComputeWorkGroupCount from VkPhysica
lDeviceLimits
\end_layout

\begin_layout Plain Layout

uint64_t maxComputeWorkGroupSize[3];	// maxComputeWorkGroupSize from VkPhysical
DeviceLimits
\end_layout

\begin_layout Plain Layout

uint64_t maxThreadsNum;	// Max number of threads from VkPhysicalDeviceLimits
\end_layout

\begin_layout Plain Layout

uint64_t sharedMemorySizeStatic;	// Available for static allocation shared
 memory size, in bytes
\end_layout

\begin_layout Plain Layout

uint64_t sharedMemorySize;	// Available for allocation shared memory size,
 in bytes
\end_layout

\begin_layout Plain Layout

uint64_t sharedMemorySizePow2;	// Power of 2 which is less or equal to sharedMem
orySize, in bytes
\end_layout

\begin_layout Plain Layout

uint64_t warpSize;	// Number of threads per warp/wavefront.
\end_layout

\begin_layout Plain Layout

uint64_t halfThreads;	// Intel fix
\end_layout

\begin_layout Plain Layout

uint64_t allocateTempBuffer;	// Buffer allocated by app automatically if
 needed to reorder Four step algorithm.
 Parameter to check if it has been allocated
\end_layout

\begin_layout Plain Layout

uint64_t reorderFourStep;	// Unshuffle Four step algorithm.
 Requires tempbuffer allocation (0 - off, 1 - on).
 Default 1.
\end_layout

\begin_layout Plain Layout

int64_t maxCodeLength;	// Specify how big can be buffer used for code generation
 (in char).
 Default 1000000 chars.
 
\end_layout

\begin_layout Plain Layout

int64_t maxTempLength;	// Specify how big can be buffer used for intermediate
 string sprintfs (in char).
 Default 5000 chars.
 If code segfaults for some reason - try increasing this number.
\end_layout

\begin_layout Plain Layout

uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluest
einPaddingPattern
\end_layout

\begin_layout Plain Layout

uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 -
 AMD, etc
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API
\end_layout

\begin_layout Plain Layout

VkDeviceMemory tempBufferDeviceMemory;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

VkCommandBuffer* commandBuffer;	// Filled at app execution
\end_layout

\begin_layout Plain Layout

VkMemoryBarrier* memory_barrier;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==1) //CUDA API
\end_layout

\begin_layout Plain Layout

cudaEvent_t* stream_event;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

uint64_t streamCounter;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

uint64_t streamID;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==2) //HIP API
\end_layout

\begin_layout Plain Layout

hipEvent_t* stream_event;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

uint64_t streamCounter;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

uint64_t streamID;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

cl_command_queue* commandQueue;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4) //Level Zero API
\end_layout

\begin_layout Plain Layout

ze_command_list_handle_t* commandList;	// Filled at app creation
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

} VkFFTConfiguration;
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsubsection
Driver API parameters
\end_layout

\begin_layout Standard
In order to work, VkFFT needs some structures that are provided by the driver.
 They are backend API-dependent.
 VkFFT will return the corresponding VkFFTResult if one of these structures
 is not provided (value equal to zero) unless it is stated that there is
 a default value assigned.
 VkFFT will not modify provided values directly.
\end_layout

\begin_layout Standard
Vulkan API will need the following information:
\end_layout

\begin_layout Itemize
VkPhysicalDevice* physicalDevice - Pointer to Vulkan physical device, obtained
 from vkEnumeratePhysicalDevices()
\end_layout

\begin_layout Itemize
VkDevice* device - Pointer to Vulkan device, created with vkCreateDevice()
\end_layout

\begin_layout Itemize
VkQueue* queue - Pointer to Vulkan queue, created with vkGetDeviceQueue()
\end_layout

\begin_layout Itemize
VkCommandPool* commandPool - Pointer to Vulkan command pool, created with
 vkCreateCommandPool()
\end_layout

\begin_layout Itemize
VkFence* fence - Pointer to Vulkan fence, created with vkCreateFence()
\end_layout

\begin_layout Itemize
uint64_t isCompilerInitialized - Specify if glslang compiler has been initialized
 before (0 - off, 1 - on).
 Default 0 - VkFFT will call glslang_initialize_process() at initializeVkFFT()
 and glslang_finalize_process() at deleteVkFFT() calls.
\end_layout

\begin_layout Standard
CUDA API will need the following information:
\end_layout

\begin_layout Itemize
CUdevice* device - Pointer to CUDA device, obtained from cuDeviceGet()
\end_layout

\begin_layout Itemize
cudaStream_t* stream - Pointer to streams (can be more than 1), where to
 execute the kernels.
 Default 0.
 Streams must be associated with the provided device.
 There is no real benefit in having more than one, however.
 
\end_layout

\begin_layout Itemize
uint64_t num_streams - Try to submit CUDA kernels in multiple streams for
 asynchronous execution.
 Default 1 
\end_layout

\begin_layout Standard
HIP API will need the following information:
\end_layout

\begin_layout Itemize
hipDevice_t* device - Pointer to HIP device, obtained from hipDeviceGet()
\end_layout

\begin_layout Itemize
hipStream_t* stream - Pointer to streams (can be more than 1), where to
 execute the kernels.
 Default 0.
 Streams must be associated with the provided device.
 There is no real benefit in having more than one, however.
 
\end_layout

\begin_layout Itemize
uint64_t num_streams - Try to submit HIP kernels in multiple streams for
 asynchronous execution.
 Default 1 
\end_layout

\begin_layout Standard
OpenCL API will need the following information:
\end_layout

\begin_layout Itemize
cl_device_id* device - Pointer to OpenCL device, obtained from clGetDeviceIDs()
\end_layout

\begin_layout Itemize
cl_context* context - Pointer to OpenCL context, obtained from clCreateContext()
\end_layout

\begin_layout Standard
Level Zero API will need the following information:
\end_layout

\begin_layout Itemize
ze_device_handle_t* device - Pointer to Level Zero device, obtained from
 zeDeviceGet()
\end_layout

\begin_layout Itemize
ze_context_handle_t* context - Pointer to Level Zero context, obtained from
 zeContextCreate()
\end_layout

\begin_layout Itemize
ze_command_queue_handle_t* commandQueue - Pointer to Level Zero command
 queue with compute and copy capabilities, obtained from zeCommandQueueCreate()
\end_layout

\begin_layout Itemize
uint32_t commandQueueID - ID of the commandQueue with compute and copy capabilit
ies
\end_layout

\begin_layout Subsubsection
Memory management parameters
\end_layout

\begin_layout Standard
There are five buffer types user can provide to VkFFT: 
\end_layout

\begin_layout Itemize
the main buffer (buffer)
\end_layout

\begin_layout Itemize
temporary buffer used for calculations requiring out-of-place writes (tempBuffer
)
\end_layout

\begin_layout Itemize
separate input buffer, from which initial read is performed (inputBuffer)
\end_layout

\begin_layout Itemize
separate output buffer, to which final write is performed (outputBuffer)
\end_layout

\begin_layout Itemize
kernel buffer, used for calculation of convolutions and cross-correlations
 (kernel)
\end_layout

\begin_layout Standard
These buffers must be passed by a pointer: in Vulkan API they are provided
 as VkBuffer*, in CUDA, HIP and Level Zero they are provided as void**,
 in OpenCL, they are provided as cl_mem*.
 Even though the underlying structure (VkBuffer, void*, cl_mem) is not a
 memory but just a number that the driver can use to access corresponding
 allocated memory on the GPU, passing them by a pointer allows for the user
 to query multiple GPU allocated buffers for VkFFT to use.
 Currently, it is only supported in Vulkan API - each of five buffer types
 can be made out of multiple separate memory allocations.
 For example, it is possible to combine multiple small buffers that are
 unused at the point of FFT calculation to form a tempBuffer.
 This option also allows Vulkan API to overcome the limit of 4GB for a single
 memory allocation - due to the fact that Vulkan can only use 32-bit numbers
 for addressing (other APIs support 64-bit addressing).
 
\end_layout

\begin_layout Standard
To use the buffers other than the main buffer, the user has to specify this
 in configuration at the application creation stage (set to zero by default,
 optional parameters): 
\end_layout

\begin_layout Itemize
uint64_t userTempBuffer - enables manual temporary buffer allocation (otherwise
 it is managed by VkFFT)
\end_layout

\begin_layout Itemize
uint64_t isInputFormatted - specifies that initial read is performed from
 a separate buffer (inputBuffer)
\end_layout

\begin_layout Itemize
uint64_t isOutputFormatted - specifies that final write is performed to
 a separate buffer (outputBuffer)
\end_layout

\begin_layout Itemize
uint64_t performConvolution - enables convolution calculations, which requires
 precomputed kernel (kernel)
\end_layout

\begin_layout Standard
Buffer sizes (bufferSize/tempBufferSize/inputBufferSize/outputBufferSize/kernelS
ize) are provided as a uint64_t pointer to an array, where each element
 corresponds to the buffer size of the buffer with the same placement in
 the buffer array.
 Buffer sizes have to be provided in Vulkan API (due to the stricter memory
 management model and multiple buffer support) and are optional in other
 backends (they can be useful to determine when to switch for 64-bit addressing).
\end_layout

\begin_layout Standard
Buffer number (bufferNum/tempBufferNum/inputBufferNum/outputBufferNum/kernelNum)
 corresponds to how many elements are in the buffer and buffer size array.
 By default it is set to 1 and is not required to be provided by the user.
 Non-Vulkan backends currently don't support values other than default.
 Optional parameter.
\end_layout

\begin_layout Standard
Buffer offset (bufferOffset/ tempBufferOffset/ inputBufferOffset/ outputBufferOf
fset/ kernelOffset) specifies offset from the start of the buffer sequence.
 It must be specified in bytes and must be divisible by the number type
 size used in the corresponding array (otherwise, the offset will be truncated).
 It is provided as a single uint64_t value.
 Can be provided at launch time, if specifyOffsetsAtLaunch parameter is
 enabled during initialization call.
 Optional parameters.
\end_layout

\begin_layout Standard
User can provide custom dimension strides for buffer/inputBuffer/outputBuffer
 buffers - uint64_t[3] array.
 Strides are specified in elements used in the array (not bytes).
 The first element corresponds to the stride between elements in the H direction
, the second corresponds to the D direction and the third to C (or N, if
 the number of elements in C is 1).
 The first axis is assumed to be non-strided.
 Must be at least of the same size as default strides, otherwise the behavior
 is undefined.
 Optional parameters.
\end_layout

\begin_layout Standard
uint64_t inverseReturnToInputBuffer - an option that allows setting the
 final output buffer of the inverse transform to the same buffer that the
 initial read of the forward transform is performed from (inputBuffer, if
 isInputFormatted is enabled).
 Optional parameter.
\end_layout

\begin_layout Subsubsection
General FFT parameters 
\end_layout

\begin_layout Standard
This section describes part of the configuration structure responsible for
 FFT specification.
 
\end_layout

\begin_layout Standard
uint64_t FFTdim - dimensionality of the transform (1, 2 or 3).
 Required parameter.
\end_layout

\begin_layout Standard
uint64_t size[3] - WHD dimensions of the transform.
 Required parameter.
\end_layout

\begin_layout Standard
uint64_t numberBatches - N parameter of the transform.
 By default, it is set to 1.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t performR2C - perform R2C/C2R decomposition.
 performDCT must be set to 0.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t performDCT - perform DCT transformation.
 performR2C must be set to 0.
 Default 0, set to X for DCT-X (currently supported X: 1, 2, 3 and 4).
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t normalize - enabling this parameter will make the inverse transform
 divide the result by the FFT length.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Subsubsection
Precision parameters (and some things that can affect it):
\end_layout

\begin_layout Standard
uint64_t doublePrecision - perform calculations in double precision.
 Default 0, set to 1 to enable.
 In Vulkan/OpenCL/Level Zero your device must support double-precision functiona
lity.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t doublePrecisionFloatMemory - perform calculations in double precision,
 but all intermediate and final storage in float.
 Input/Output/main buffers must have single-precision layout.
 doublePrecision must be set to 0.
 This option increases precision, but not enough to be recommended for
 actual use.
 Default 0, set to 1 to enable.
 In Vulkan/OpenCL/Level Zero your device must support double-precision functiona
lity.
 Experimental feature.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t halfPrecision - half-precision in VkFFT is implemented only as
 memory optimization.
 All calculations are done in single precision (similar way as doublePrecisionFl
oatMemory works for double and single precision).
 Default 0, set to 1 to enable.
 Works only in Vulkan API now, experimental feature (half precision seems
 to have bad precision for the first FFT element).
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t halfPrecisionMemoryOnly - another way of performing half-precision
 in VkFFT, it will use half-precision only for initial and final memory
 storage in input/output buffer.
 Input/Output have to be allocated as half, buffer/tempBuffer have to be
 allocated as float (out-of-place mode only).
 Specify isInputFormatted and isOutputFormatted to use.
 So, for example, intermediate storage between axes FFTs in the multidimensional
 case will be done in single precision, as opposed to half-precision in
 the base halfPrecision case.
 halfPrecision must be set to 1.
 Default 0, set to 1 to enable.
 Works only in Vulkan API now, experimental feature.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t useLUT - switches from calculating sines and cosines (via special
 function units in single precision or as a polynomial approximation in
 double precision) to using precomputed Look-Up Tables.
 Default 0 in single precision, 1 in double precision, set to 1 to enable.
 Set to 1 by default for Intel GPUs.
 If you have issues with single-precision accuracy on your GPU, try enabling
 this parameter (mobile GPUs may be affected).
 Optional parameter.
\end_layout

\begin_layout Subsubsection
Advanced parameters (code will work fine without using them)
\end_layout

\begin_layout Standard
uint64_t omitDimension[3] - parameter, that disables the FFT calculation
 for a particular axis (WHD).
 Note, that omitted dimensions still need to be included in FFTdim and size.
 This parameter simply works as a switch during execution - by not executing
 the particular dimension code.
 It doesn't work with the non-strided axis (W) of R2C/C2R mode.
 It doesn't work with convolution calculations.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t useUint64 - forces VkFFT to use 64-bit addressing in generated
 kernels.
 It is automatically enabled if the estimated buffer size is more than 4GB.
 Doesn't work with the Vulkan backend.
 By default, it is set to 0.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t coalescedMemory - number of bytes to coalesce per one transaction.
 For Nvidia and AMD is equal to 32, Intel is equal to 64.
 Going to work regardless, but if specified by the user correctly, the performan
ce will be higher.
 Default 64 for other GPUs.
 For half-precision should be multiplied by two.
 Should be a power of two.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t numSharedBanks - configure the number of shared banks on the target
 GPU.
 Default 32.
 Minor performance boost as it solves shared memory conflicts for the power
 of two systems.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t aimThreads - try to aim all kernels at this amount of threads.
 Gains/losses are not predictable, just a parameter to play with (it is
 not guaranteed that the target kernel will use that many threads).
 Default 128.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t useUint64 - forces 64-bit addressing in generated kernels.
 Should be enabled automatically for systems spanning more than 4GB, but
 it is better to have an option to force it as a failsafe.
 Doesn't work in Vulkan API (use multiple buffer binding).
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t performBandwidthBoost - try to reduce coalesced number by a factor
 of X to get bigger sequence in one upload for strided axes.
 Default: -1(inf) for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0
 otherwise 
\end_layout

\begin_layout Standard
uint64_t disableMergeSequencesR2C - disable the optimization that performs
 merging of two real sequences to reduce calculations (in R2C/C2R and R2R).
 If enabled, calculations will be performed by simply setting the imaginary
 component to zero.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t disableReorderFourStep - disables unshuffling of the Four Step
 FFT algorithm (last transposition of data).
 With this option enabled, tempBuffer will not be needed (unless it is required
 by Bluestein's multi-upload FFT algorithm).
 Default 0, set to 1 to enable.
 Automatically enabled for convolution calculations and Bluestein's algorithm.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t makeForwardPlanOnly - generate code only for forward FFT.
 Default 0, set to 1 to enable.
 Mutually exclusive with makeInversePlanOnly.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t makeInversePlanOnly - generate code only for inverse FFT.
 Default 0, set to 1 to enable.
 Mutually exclusive with makeForwardPlanOnly.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t considerAllAxesStrided - will create a plan for a non-strided axis
 similar to a strided axis (used with disableReorderFourStep to get the
 same layout for Bluestein kernel).
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t keepShaderCode - debugging option, will keep shader code and print
 all executed shaders during the plan execution in order.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t printMemoryLayout - debugging option, will print order of buffers
 used in kernels.
 Default 0, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t saveApplicationToString - will save all compiled binaries to VkFFTAppli
cation.saveApplicationString (will be allocated by VkFFT, deallocated with
 deleteVkFFT call).
 VkFFTApplication.applicationStringSize will contain size of binary in bytes.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t loadApplicationFromString - will load all binaries from loadApplication
String instead of recompiling them (loadApplicationString must be allocated
 by user, must contain what saveApplicationToString call generated previously
 in VkFFTApplication.saveApplicationString).
 Default 0, set to 1 to enable.
 Optional parameter.
 Mutually exclusive with saveApplicationToString 
\end_layout

\begin_layout Standard
void* loadApplicationString - memory array (uint32_t* for Vulkan, HIP and
 Level Zero, char* for CUDA/OpenCL) through which user can load VkFFT binaries,
 must be provided by user if loadApplicationFromString = 1.
 
\end_layout

\begin_layout Subsubsection
Bluestein control parameters
\end_layout

\begin_layout Standard
If the sequence can not be decomposed as a product of primes up to 13,
 FFT is performed as a convolution (Bluestein's algorithm).
 The sequence to pad to with the best performance is usually device-dependent.
 VkFFT uses parameters manually tuned for all sequences between 2 and 4096
 for both double and single precision on Nvidia A100 (Nvidia profile) and
 AMD MI250 (default profile).
 To control this process, VkFFT allows for the following parameters specificatio
n:
\end_layout

\begin_layout Standard
uint64_t fixMaxRadixBluestein - controls the padding of sequences in Bluestein
 convolution.
 If specified, padded sequence will be made of up to fixMaxRadixBluestein
 primes.
 Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension
 FFT system, 7 for Vulkan/OpenCL/HIP past that.
 Min = 2, Max = 13.
\end_layout

\begin_layout Standard
uint64_t forceBluesteinSequenceSize - force the sequence size to pad to
 in Bluestein's algorithm.
 Must be at least 2*N-1 and decomposable with primes 2-13.
\end_layout

\begin_layout Standard
uint64_t useCustomBluesteinPaddingPattern - force the sequence sizes to
 pad to in Bluestein's algorithm, but on a range.
 This number specifies the number of elements in primeSizes and in paddedSizes
 arrays.
 primeSizes - array of non-decomposable as radix scheme sizes - 17, 23,
 31 etc.
 paddedSizes - array of lengths to pad to.
 paddedSizes[i] will be the padding size for all non-decomposable sequences
 from primeSizes[i] to primeSizes[i+1] (will use default scheme after last
 one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme
 (for example).
 Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern.
\end_layout

\begin_layout Standard
uint64_t* primeSizes - described in useCustomBluesteinPaddingPattern
\end_layout

\begin_layout Standard
uint64_t* paddedSizes - described in useCustomBluesteinPaddingPattern
\end_layout

\begin_layout Subsubsection
Zero padding parameters
\end_layout

\begin_layout Standard
uint64_t performZeropadding[3] - do not read/write some data/perform computation
s if some part of the sequence is known to have zeros.
 Set separately for each axis (WHD).
 If enabled, all 1D sequences in this direction will be considered padded
 (independent of other zero-padded axes).
 Default 0, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t fft_zeropad_left[3] - specify start boundary of zero block in the
 system for each axis.
 Default 0, set to the value between 0 and size[X]-1.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t fft_zeropad_right[3] - specify end boundary of zero block in the
 system for each axis.
 Default 0, set to the value between fft_zeropad_left[X] and size[X]-1.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t frequencyZeroPadding - enables zero padding of the frequency domain,
 so the first read of inverse FFT will consider the parts of the system
 from fft_zeropad_left to fft_zeropad_right as zero.
 Default 0 - spatial zero padding, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Subsubsection
Convolution parameters
\end_layout

\begin_layout Standard
uint64_t performConvolution - main parameter that enables convolutions in
 the application.
 If enabled, you must specify kernel buffer, number of kernel buffers and
 kernel sizes (in Vulkan API).
 Disables reordering of the Four Step FFT algorithm.
 Default 0, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t conjugateConvolution - default 0, set to 1 to enable conjugation
 of the sequence FFT is currently done on, 2 to enable conjugation of the
 convolution kernel.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t crossPowerSpectrumNormalization - normalize the FFT * kernel multiplica
tion in frequency domain.
 Default 0, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t coordinateFeatures - max coordinate (C), or dimension of the features
 vector.
 In matrix convolution - the size of the vector.
 The main purpose is to support Matrix-Vector convolutions.
 Use numberBatches parameter in tasks, not requiring two separate coordinate-lik
e enumerations of data.
 Default 1.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t matrixConvolution - set to 2 to perform 2x2, set to 3 to perform
 3x3 matrix-vector convolution.
 Matrix-vector convolution is a form of point-wise multiplication in the
 Fourier space, used by the convolution theorem, where multiplication takes
 the form of Matrix-vector multiplication.
 Overrides coordinateFeatures during execution.
 Default 0.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t symmetricKernel - specify if kernel in 2x2 or 3x3 matrix convolution
 is symmetric.
 You need to store data as xx, xy, yy (upper-triangular) if enabled and
 as xx, xy, yx, yy (along rows then along columns, from left to right) if
 disabled.
 Default 0, set to 1 to enable.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t numberKernels - specify how many kernels were initialized before
 performing one input/multiple output convolutions.
 Overwrites numberBatches (N).
 Only used in convolution step and the following inverse transforms.
 Default 1.
 Optional parameter.
 
\end_layout

\begin_layout Standard
uint64_t kernelConvolution - specify if this application is used to create
 kernel for convolution, so it has the same properties/memory layout.
 performConvolution has to be set to 0 for the kernel creation.
 Default 0, set to 1 to enable.
 Optional parameter, but it is a required parameter for kernel generation.
\end_layout

\begin_layout Subsubsection
Register overutilization
\end_layout

\begin_layout Standard
Only works in C2C mode, without convolution support.
 Enabled in Vulkan, OpenCL and Level Zero APIs only (it works in other APIs,
 but worse).
 Experimental feature.
\end_layout

\begin_layout Standard
uint64_t registerBoost - specify if the register file size is bigger than
 shared memory and can be used to extend it X times (on Nvidia 256KB register
 file can be used instead of 32KB of shared memory, set this constant to
 4 to emulate 128KB of shared memory).
 Default 1 - no overutilization.
 In Vulkan, OpenCL and Level Zero it is set to 4 on Nvidia GPUs, to 2 if
 the driver shows 64KB or more of shared memory on AMD, to 2 if the driver
 shows less than 64KB of shared memory on AMD, to 1 if the driver shows
 64KB or more of shared memory on Intel, to 2 if the driver shows less than
 64KB of shared memory on Intel.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t registerBoostNonPow2 - specify if register overutilization should
 be used on non-power of 2 sequences.
 Default 0, set to 1 to enable.
 Optional parameter.
\end_layout

\begin_layout Standard
uint64_t registerBoost4Step - specify if register file overutilization should
 be used in big sequences (>2^14), same definition as registerBoost.
 Default 1.
 Optional parameter.
\end_layout

\begin_layout Subsubsection
Extra advanced parameters (filled automatically)
\end_layout

\begin_layout Standard
uint64_t maxComputeWorkGroupCount[3] - how many workgroups can be launched
 at one dispatch.
 Automatically derived from the driver, can be artificially lowered.
 Then VkFFT will perform a logical split and extension of the number of
 workgroups to cover the required range.
\end_layout

\begin_layout Standard
uint64_t maxComputeWorkGroupSize[3] - max dimensions of the workgroup.
 Automatically derived from the driver.
 Can be modified if there are some issues with the driver (as there were
 with ROCm 4.0, when it returned 1024 for maxComputeWorkGroupSize and actually
 supported only up to 256 threads).
\end_layout

\begin_layout Standard
uint64_t maxThreadsNum - max number of threads per block.
 Similar to maxComputeWorkGroupSize, but aggregated.
 Automatically derived from the driver.
\end_layout

\begin_layout Standard
uint64_t sharedMemorySizeStatic - available for static allocation shared
 memory size, in bytes.
 Automatically derived from the driver.
 Can be controlled by the user, if desired.
\end_layout

\begin_layout Standard
uint64_t sharedMemorySize - available for allocation shared memory size,
 in bytes.
 VkFFT uses dynamic shared memory in CUDA/HIP as it allows for bigger allocation
s.
 Automatically derived from the driver.
 Can be controlled by the user, if desired.
\end_layout

\begin_layout Standard
uint64_t sharedMemorySizePow2 - the largest power of 2 that is less than or
 equal to sharedMemorySize, in bytes.
 Automatically computed.
\end_layout

\begin_layout Standard
uint64_t warpSize - number of threads per warp/wavefront.
 Automatically derived from the driver, but can be modified (can increase
 performance, though unpredictable as defaults have good values).
 Must be a power of two.
\end_layout

\begin_layout Standard
uint64_t halfThreads - Intel GPU fix, tries to reduce the amount of dispatched
 threads in half to solve performance degradation in the Four Step FFT algorithm.
 Default 0 for other GPUs, try enabling it if performance degrades in the
 Four Step FFT algorithm for your GPU as well.
 
\end_layout

\begin_layout Standard
int64_t maxCodeLength - specify how big can the buffer used for code generation
 be (in char).
 Default 1000000 chars.
 
\end_layout

\begin_layout Standard
int64_t maxTempLength - specify how big can the buffer used for intermediate
 string sprintf's be (in char).
 Default 5000 chars.
 If code segfaults for some reason - try increasing this number.
\end_layout

\begin_layout Standard
uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluest
einPaddingPattern
\end_layout

\begin_layout Standard
uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 -
 AMD, etc.
\end_layout

\begin_layout Standard
\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Section
VkFFT Benchmark/Precision Suite and utils_VkFFT helper routines
\end_layout

\begin_layout Standard
The only licensed (MIT) part of the VkFFT repository is the VkFFT header
 file - core library.
 Other files are either external helper libraries (half, glslang, with their
 respective licenses) or unlicensed code that is intended for simple copy-pastin
g (benchmark_scripts, utils_VkFFT.h).
 The easiest way to understand how to use VkFFT is to take the provided
 scripts and tailor them to the particular task.
 The current version of the benchmark and precision verification suite has
 the following codes available:
\end_layout

\begin_layout Itemize
user_benchmark_VkFFT - generalization of the main configuration parameters
 that can be used to launch simplest in-place transforms for the most important
 supported functionality
\end_layout

\begin_layout Itemize
Sample 0 - FFT + iFFT C2C benchmark 1D batched in single precision
\end_layout

\begin_layout Itemize
Sample 1 - FFT + iFFT C2C benchmark 1D batched in double precision
\end_layout

\begin_layout Itemize
Sample 2 - FFT + iFFT C2C benchmark 1D batched in half precision
\end_layout

\begin_layout Itemize
Sample 3 - FFT + iFFT C2C multidimensional benchmark in single precision
\end_layout

\begin_layout Itemize
Sample 4 - FFT + iFFT C2C multidimensional benchmark in single precision,
 native zeropadding
\end_layout

\begin_layout Itemize
Sample 5 - FFT + iFFT C2C benchmark 1D batched in single precision, no reshuffli
ng
\end_layout

\begin_layout Itemize
Sample 6 - FFT + iFFT R2C / C2R benchmark, in-place.
\end_layout

\begin_layout Itemize
Sample 7 - FFT + iFFT C2C Bluestein benchmark in single precision
\end_layout

\begin_layout Itemize
Sample 8 - FFT + iFFT C2C Bluestein benchmark in double precision
\end_layout

\begin_layout Itemize
Sample 10 - multiple buffers (4 by default) split version of benchmark 0
\end_layout

\begin_layout Itemize
Sample 11 - VkFFT / xFFT / FFTW C2C precision test in single precision (xFFT
 can be cuFFT or rocFFT)
\end_layout

\begin_layout Itemize
Sample 12 - VkFFT / xFFT / FFTW C2C precision test in double precision (xFFT
 can be cuFFT or rocFFT)
\end_layout

\begin_layout Itemize
Sample 13 - VkFFT / cuFFT / FFTW C2C precision test in half precision
\end_layout

\begin_layout Itemize
Sample 14 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision
 test in single precision
\end_layout

\begin_layout Itemize
Sample 15 - VkFFT / xFFT / FFTW R2C+C2R precision test in single precision,
 out-of-place.
 (xFFT can be cuFFT or rocFFT)
\end_layout

\begin_layout Itemize
Sample 16 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in single
 precision
\end_layout

\begin_layout Itemize
Sample 17 - VkFFT / FFTW R2R DCT-I, II, III and IV precision test in double
 precision
\end_layout

\begin_layout Itemize
Sample 18 - VkFFT / FFTW C2C radix 3 / 5 / 7 / 11 / 13 / Bluestein precision
 test in double precision
\end_layout

\begin_layout Itemize
Sample 50 - convolution example with identity kernel
\end_layout

\begin_layout Itemize
Sample 51 - zero padding convolution example with identity kernel
\end_layout

\begin_layout Itemize
Sample 52 - batched convolution example with identity kernel
\end_layout

\begin_layout Itemize
Sample 100 - VkFFT FFT + iFFT R2R DCT multidimensional benchmark in single
 precision
\end_layout

\begin_layout Itemize
Sample 101 - VkFFT FFT + iFFT R2R DCT multidimensional benchmark in double
 precision
\end_layout

\begin_layout Itemize
Sample 1000 - FFT + iFFT C2C benchmark 1D batched in single precision: all
 supported systems from 2 to 4096
\end_layout

\begin_layout Itemize
Sample 1001 - FFT + iFFT C2C benchmark 1D batched in double precision: all
 supported systems from 2 to 4096
\end_layout

\begin_layout Itemize
Sample 1003 - FFT + iFFT C2C multidimensional benchmark in single precision:
 all supported cubes from 2 to 512
\end_layout

\begin_layout Subsection
utils_VkFFT helper routines
\end_layout

\begin_layout Standard
Launching even the simplest Vulkan application can be a non-trivial task.
 To help with this, utils_VkFFT contains the routines that can help to create
 the simplest Vulkan application, allocate memory, record command buffers
 and launch them.
 Code has some comments explaining what is going on at each step.
 It also has some useful struct defines (like vkGPU) that keep the most
 important handles used in Vulkan Compute.
 This section may be expanded in the future to the proper step-by-step guide
 on Vulkan Compute simple application creation.
 I also encourage you to check https://github.com/DTolm/VulkanComputeSamples-Transpos
ition repository for another example of a compute algorithm (matrix transpositio
n) implemented with Vulkan API.
\end_layout

\begin_layout Standard
utils_VkFFT also has a routine that prints the list of available devices.
\end_layout

\begin_layout Standard
vkGPU struct has the following definition:
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

typedef struct {
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API
\end_layout

\begin_layout Plain Layout

VkInstance instance; //a connection between the application and the Vulkan
 library
\end_layout

\begin_layout Plain Layout

VkPhysicalDevice physicalDevice; //a handle for the graphics card used in
 the application
\end_layout

\begin_layout Plain Layout

VkPhysicalDeviceProperties physicalDeviceProperties; //basic device properties
\end_layout

\begin_layout Plain Layout

VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties; //basic
 memory properties of the device
\end_layout

\begin_layout Plain Layout

VkDevice device; //a logical device, interacting with physical device
\end_layout

\begin_layout Plain Layout

VkDebugUtilsMessengerEXT debugMessenger; //extension for debugging
\end_layout

\begin_layout Plain Layout

uint64_t queueFamilyIndex; //if multiple queues are available, specify the
 used one
\end_layout

\begin_layout Plain Layout

VkQueue queue; //a place, where all operations are submitted
\end_layout

\begin_layout Plain Layout

VkCommandPool commandPool; //an opaque object that command buffer memory
 is allocated from
\end_layout

\begin_layout Plain Layout

VkFence fence; //a fence used to synchronize dispatches
\end_layout

\begin_layout Plain Layout

std::vector<const char*> enabledDeviceExtensions;
\end_layout

\begin_layout Plain Layout

uint64_t enableValidationLayers;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==1) //CUDA API
\end_layout

\begin_layout Plain Layout

CUdevice device;
\end_layout

\begin_layout Plain Layout

CUcontext context;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==2) //HIP API
\end_layout

\begin_layout Plain Layout

hipDevice_t device;
\end_layout

\begin_layout Plain Layout

hipCtx_t context;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

cl_platform_id platform;
\end_layout

\begin_layout Plain Layout

cl_device_id device;
\end_layout

\begin_layout Plain Layout

cl_context context;
\end_layout

\begin_layout Plain Layout

cl_command_queue commandQueue;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4) //Level Zero API
\end_layout

\begin_layout Plain Layout

ze_driver_handle_t driver;
\end_layout

\begin_layout Plain Layout

ze_device_handle_t device;
\end_layout

\begin_layout Plain Layout

ze_context_handle_t context;
\end_layout

\begin_layout Plain Layout

ze_command_queue_handle_t commandQueue;
\end_layout

\begin_layout Plain Layout

uint32_t commandQueueID;
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

uint64_t device_id; //an id of a device, reported by devices_list call
\end_layout

\begin_layout Plain Layout

} VkGPU;
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\begin_inset Newpage newpage
\end_inset


\end_layout

\begin_layout Section
VkFFT Code Examples
\end_layout

\begin_layout Standard
This section will provide some simple pseudocode for VkFFT usage, which
 will once again outline important steps required to launch FFT with VkFFT.
 More information (and fully working code) can be found in this folder of
 the VkFFT repository:
\end_layout

\begin_layout Standard
/benchmark_samples/vkFFT_scripts/src/
\end_layout

\begin_layout Subsection
Driver initializations
\end_layout

\begin_layout Standard
Before launching VkFFT, do not forget to do all necessary driver initializations.
 The following code specifies them for all the supported backends, though
 the final implementation may be different depending on the particular user's
 configuration.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API
\end_layout

\begin_layout Plain Layout

VkResult res = VK_SUCCESS;
\end_layout

\begin_layout Plain Layout

//create instance - a connection between the application and the Vulkan
 library
\end_layout

\begin_layout Plain Layout

res = createInstance(vkGPU, sample_id);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

	//printf("Instance creation failed, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE;
\end_layout

\begin_layout Plain Layout

} 	
\end_layout

\begin_layout Plain Layout

//set up the debugging messenger
\end_layout

\begin_layout Plain Layout

res = setupDebugMessenger(vkGPU);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

//printf("Debug messenger creation failed, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

//check if there are GPUs that support Vulkan and select one
\end_layout

\begin_layout Plain Layout

res = findPhysicalDevice(vkGPU);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

//printf("Physical device not found, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

//create logical device representation
\end_layout

\begin_layout Plain Layout

res = createDevice(vkGPU, sample_id);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

//printf("Device creation failed, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_CREATE_DEVICE;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

//create fence for synchronization
\end_layout

\begin_layout Plain Layout

res = createFence(vkGPU);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

//printf("Fence creation failed, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_CREATE_FENCE;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

//create a command pool - a place that command buffer memory is allocated from
\end_layout

\begin_layout Plain Layout

res = createCommandPool(vkGPU);
\end_layout

\begin_layout Plain Layout

if (res != 0) {
\end_layout

\begin_layout Plain Layout

	//printf("Command pool creation failed, error code: %" PRIu64 "
\backslash
n", res);
\end_layout

\begin_layout Plain Layout

	return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

vkGetPhysicalDeviceProperties(vkGPU->physicalDevice, &vkGPU->physicalDevicePrope
rties);
\end_layout

\begin_layout Plain Layout

vkGetPhysicalDeviceMemoryProperties(vkGPU->physicalDevice, &vkGPU->physicalDevic
eMemoryProperties);
\end_layout

\begin_layout Plain Layout

glslang_initialize_process();
\end_layout

\begin_layout Plain Layout

//compiler can be initialized before VkFFT
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==1) //CUDA API
\end_layout

\begin_layout Plain Layout

CUresult res = CUDA_SUCCESS;
\end_layout

\begin_layout Plain Layout

cudaError_t res2 = cudaSuccess;
\end_layout

\begin_layout Plain Layout

res = cuInit(0);
\end_layout

\begin_layout Plain Layout

if (res != CUDA_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

res2 = cudaSetDevice((int)vkGPU->device_id);
\end_layout

\begin_layout Plain Layout

if (res2 != cudaSuccess) return VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID;
\end_layout

\begin_layout Plain Layout

res = cuDeviceGet(&vkGPU->device, (int)vkGPU->device_id);
\end_layout

\begin_layout Plain Layout

if (res != CUDA_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_DEVICE;
\end_layout

\begin_layout Plain Layout

res = cuCtxCreate(&vkGPU->context, 0, (int)vkGPU->device);
\end_layout

\begin_layout Plain Layout

if (res != CUDA_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==2) //HIP API
\end_layout

\begin_layout Plain Layout

hipError_t res = hipSuccess;
\end_layout

\begin_layout Plain Layout

res = hipInit(0);
\end_layout

\begin_layout Plain Layout

if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

res = hipSetDevice((int)vkGPU->device_id);
\end_layout

\begin_layout Plain Layout

if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID;
\end_layout

\begin_layout Plain Layout

res = hipDeviceGet(&vkGPU->device, (int)vkGPU->device_id);
\end_layout

\begin_layout Plain Layout

if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_GET_DEVICE; 
\end_layout

\begin_layout Plain Layout

res = hipCtxCreate(&vkGPU->context, 0, (int)vkGPU->device);
\end_layout

\begin_layout Plain Layout

if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

cl_int res = CL_SUCCESS;
\end_layout

\begin_layout Plain Layout

cl_uint numPlatforms;
\end_layout

\begin_layout Plain Layout

res = clGetPlatformIDs(0, 0, &numPlatforms);
\end_layout

\begin_layout Plain Layout

if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)
 * numPlatforms);
\end_layout

\begin_layout Plain Layout

if (!platforms) return VKFFT_ERROR_MALLOC_FAILED;
\end_layout

\begin_layout Plain Layout

res = clGetPlatformIDs(numPlatforms, platforms, 0);
\end_layout

\begin_layout Plain Layout

if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

uint64_t k = 0;
\end_layout

\begin_layout Plain Layout

for (uint64_t j = 0; j < numPlatforms; j++) {
\end_layout

\begin_layout Plain Layout

	cl_uint numDevices;
\end_layout

\begin_layout Plain Layout

	res = clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, 0, 0, &numDevices);
\end_layout

\begin_layout Plain Layout

	cl_device_id* deviceList = (cl_device_id*)malloc(sizeof(cl_device_id) *
 numDevices);
\end_layout

\begin_layout Plain Layout

	if (!deviceList) return VKFFT_ERROR_MALLOC_FAILED;
\end_layout

\begin_layout Plain Layout

	res = clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, numDevices, deviceList,
 0);
\end_layout

\begin_layout Plain Layout

	if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_DEVICE;
\end_layout

\begin_layout Plain Layout

	for (uint64_t i = 0; i < numDevices; i++) {
\end_layout

\begin_layout Plain Layout

		if (k == vkGPU->device_id) {
\end_layout

\begin_layout Plain Layout

			vkGPU->platform = platforms[j];
\end_layout

\begin_layout Plain Layout

			vkGPU->device = deviceList[i];
\end_layout

\begin_layout Plain Layout

			vkGPU->context = clCreateContext(NULL, 1, &vkGPU->device, NULL, NULL,
 &res);
\end_layout

\begin_layout Plain Layout

			if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT;
\end_layout

\begin_layout Plain Layout

			cl_command_queue commandQueue = clCreateCommandQueue(vkGPU->context,
 vkGPU->device, 0, &res);
\end_layout

\begin_layout Plain Layout

			if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE;
\end_layout

\begin_layout Plain Layout

			vkGPU->commandQueue = commandQueue;
\end_layout

\begin_layout Plain Layout

			k++;
\end_layout

\begin_layout Plain Layout

		}
\end_layout

\begin_layout Plain Layout

		else {
\end_layout

\begin_layout Plain Layout

			k++;
\end_layout

\begin_layout Plain Layout

		}
\end_layout

\begin_layout Plain Layout

	}
\end_layout

\begin_layout Plain Layout

	free(deviceList);
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

free(platforms);
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4)
\end_layout

\begin_layout Plain Layout

	ze_result_t res = ZE_RESULT_SUCCESS;
\end_layout

\begin_layout Plain Layout

	res = zeInit(0);
\end_layout

\begin_layout Plain Layout

	if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

	uint32_t numDrivers = 0;
\end_layout

\begin_layout Plain Layout

	res = zeDriverGet(&numDrivers, 0);
\end_layout

\begin_layout Plain Layout

	if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

	ze_driver_handle_t* drivers = (ze_driver_handle_t*)malloc(numDrivers *
 sizeof(ze_driver_handle_t));
\end_layout

\begin_layout Plain Layout

	if (!drivers) return VKFFT_ERROR_MALLOC_FAILED;
\end_layout

\begin_layout Plain Layout

	res = zeDriverGet(&numDrivers, drivers);
\end_layout

\begin_layout Plain Layout

	if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_INITIALIZE;
\end_layout

\begin_layout Plain Layout

	uint64_t k = 0;
\end_layout

\begin_layout Plain Layout

	for (uint64_t j = 0; j < numDrivers; j++) {
\end_layout

\begin_layout Plain Layout

		uint32_t numDevices = 0;
\end_layout

\begin_layout Plain Layout

		res = zeDeviceGet(drivers[j], &numDevices, nullptr);
\end_layout

\begin_layout Plain Layout

		if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_DEVICE;
\end_layout

\begin_layout Plain Layout

		ze_device_handle_t* deviceList = (ze_device_handle_t*)malloc(numDevices
 * sizeof(ze_device_handle_t));
\end_layout

\begin_layout Plain Layout

		if (!deviceList) return VKFFT_ERROR_MALLOC_FAILED;
\end_layout

\begin_layout Plain Layout

		res = zeDeviceGet(drivers[j], &numDevices, deviceList);
\end_layout

\begin_layout Plain Layout

		if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_DEVICE;
\end_layout

\begin_layout Plain Layout

		for (uint64_t i = 0; i < numDevices; i++) {
\end_layout

\begin_layout Plain Layout

			if (k == vkGPU->device_id) {
\end_layout

\begin_layout Plain Layout

				vkGPU->driver = drivers[j];
\end_layout

\begin_layout Plain Layout

				vkGPU->device = deviceList[i];
\end_layout

\begin_layout Plain Layout

				ze_context_desc_t contextDescription = {};
\end_layout

\begin_layout Plain Layout

				contextDescription.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC;
\end_layout

\begin_layout Plain Layout

				res = zeContextCreate(vkGPU->driver, &contextDescription, &vkGPU->context);
\end_layout

\begin_layout Plain Layout

				if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

				uint32_t queueGroupCount = 0;
\end_layout

\begin_layout Plain Layout

				res = zeDeviceGetCommandQueueGroupProperties(vkGPU->device, &queueGroupCount
, 0);
\end_layout

\begin_layout Plain Layout

				if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QU
EUE;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

				ze_command_queue_group_properties_t* cmdqueueGroupProperties = (ze_command_q
ueue_group_properties_t*) malloc(queueGroupCount * sizeof(ze_command_queue_group
_properties_t));
\end_layout

\begin_layout Plain Layout

				if (!cmdqueueGroupProperties) return VKFFT_ERROR_MALLOC_FAILED;
\end_layout

\begin_layout Plain Layout

				res = zeDeviceGetCommandQueueGroupProperties(vkGPU->device, &queueGroupCount
, cmdqueueGroupProperties);
\end_layout

\begin_layout Plain Layout

				if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QU
EUE;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

				uint32_t commandQueueID = -1;
\end_layout

\begin_layout Plain Layout

				for (uint32_t i = 0; i < queueGroupCount; ++i) {
\end_layout

\begin_layout Plain Layout

					if ((cmdqueueGroupProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLA
G_COMPUTE) && (cmdqueueGroupProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERT
Y_FLAG_COPY)) {
\end_layout

\begin_layout Plain Layout

						commandQueueID = i;
\end_layout

\begin_layout Plain Layout

						break;
\end_layout

\begin_layout Plain Layout

					}
\end_layout

\begin_layout Plain Layout

				}
\end_layout

\begin_layout Plain Layout

				if (commandQueueID == -1) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE;
\end_layout

\begin_layout Plain Layout

				vkGPU->commandQueueID = commandQueueID;
\end_layout

\begin_layout Plain Layout

				ze_command_queue_desc_t commandQueueDescription = {};
\end_layout

\begin_layout Plain Layout

				commandQueueDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
\end_layout

\begin_layout Plain Layout

				commandQueueDescription.ordinal = commandQueueID;
\end_layout

\begin_layout Plain Layout

				commandQueueDescription.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
\end_layout

\begin_layout Plain Layout

				commandQueueDescription.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT;
\end_layout

\begin_layout Plain Layout

				res = zeCommandQueueCreate(vkGPU->context, vkGPU->device, &commandQueueDescr
iption, &vkGPU->commandQueue);
\end_layout

\begin_layout Plain Layout

				if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QU
EUE;
\end_layout

\begin_layout Plain Layout

				free(cmdqueueGroupProperties);
\end_layout

\begin_layout Plain Layout

				k++;
\end_layout

\begin_layout Plain Layout

			}
\end_layout

\begin_layout Plain Layout

			else {
\end_layout

\begin_layout Plain Layout

				k++;
\end_layout

\begin_layout Plain Layout

			}
\end_layout

\begin_layout Plain Layout

		}
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

		free(deviceList);
\end_layout

\begin_layout Plain Layout

	}
\end_layout

\begin_layout Plain Layout

	free(drivers);
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Simple FFT application example: 1D (one dimensional) C2C (complex to complex)
 FP32 (single precision) FFT
\end_layout

\begin_layout Standard
This example performs the simplest case of FFT.
 It shows all the necessary fields that the user must fill during the configurat
ion and the submission process.
 Other samples will build on this one, as driver parameters initialization
 and code execution commands are the same for all configurations (except
 for the launch parameters that can be configured after application creation).
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

configuration.size[0] = Nx; //FFT size
\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * configuration.size[0];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Device management + code submission
\end_layout

\begin_layout Plain Layout

configuration.device = &vkGPU->device; 
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API		
\end_layout

\begin_layout Plain Layout

configuration.queue = &vkGPU->queue;
\end_layout

\begin_layout Plain Layout

configuration.fence = &vkGPU->fence; 			
\end_layout

\begin_layout Plain Layout

configuration.commandPool = &vkGPU->commandPool; 			
\end_layout

\begin_layout Plain Layout

configuration.physicalDevice = &vkGPU->physicalDevice; 			
\end_layout

\begin_layout Plain Layout

configuration.isCompilerInitialized = isCompilerInitialized; //glslang compiler
 can be initialized before VkFFT plan creation.
 if not, VkFFT will create and destroy one after initialization 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API		
\end_layout

\begin_layout Plain Layout

configuration.context = &vkGPU->context; 
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4)
\end_layout

\begin_layout Plain Layout

configuration.context = &vkGPU->context;
\end_layout

\begin_layout Plain Layout

configuration.commandQueue = &vkGPU->commandQueue;
\end_layout

\begin_layout Plain Layout

configuration.commandQueueID = vkGPU->commandQueueID;
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

allocateBuffer(buffer, bufferSize); //Pseudocode for buffer allocation,
 differs between APIs
\end_layout

\begin_layout Plain Layout

transferDataFromCPU(buffer, cpu_buffer); //Pseudocode for data transfer
 from CPU to GPU, differs between APIs
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API needs bufferSize at initialization	
\end_layout

\begin_layout Plain Layout

configuration.bufferSize = &bufferSize; 
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

VkFFTResult resFFT = initializeVkFFT(&app, configuration);
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

VkFFTLaunchParams launchParams = {};
\end_layout

\begin_layout Plain Layout

launchParams.buffer = &buffer;
\end_layout

\begin_layout Plain Layout

#if(VKFFT_BACKEND==0) //Vulkan API 
\end_layout

\begin_layout Plain Layout

launchParams.commandBuffer = &commandBuffer;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==3) //OpenCL API
\end_layout

\begin_layout Plain Layout

launchParams.commandQueue = &commandQueue;
\end_layout

\begin_layout Plain Layout

#elif(VKFFT_BACKEND==4) //Level Zero API
\end_layout

\begin_layout Plain Layout

launchParams.commandList = &commandList;
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

resFFT = VkFFTAppend(&app, -1, &launchParams);
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//add synchronization relevant to your API - vkWaitForFences/cudaDeviceSynchroni
ze/hipDeviceSynchronize/clFinish
\end_layout

\begin_layout Plain Layout

transferDataToCPU(cpu_buffer, buffer); //Pseudocode for data transfer from
 GPU to CPU, differs between APIs
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

freeBuffer(buffer, bufferSize); //Pseudocode for buffer deallocation, differs
 between APIs
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

deleteVkFFT(&app);
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed} 
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Advanced FFT application example: ND, C2C/R2C/R2R, different precisions,
 batched FFT
\end_layout

\begin_layout Standard
This example shows how to configure the main parameters of interest in the
 VkFFT library: the multidimensional case, different types of transforms,
 different precisions and batched transforms.
\end_layout

\begin_layout Standard
In the code below X, Y and Z are the dimensions of FFT, B - number of batches,
 R2C - real to complex mode 0 or 1 (off/on), DCT - 0, 1, 2, 3 or 4 (off/DCT
 type), P - precision (0 - single, 1 - double, 2 - half).
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

configuration.size[0] = X;
\end_layout

\begin_layout Plain Layout

configuration.size[1] = Y;
\end_layout

\begin_layout Plain Layout

configuration.size[2] = Z;
\end_layout

\begin_layout Plain Layout

if (Y > 1) configuration.FFTdim++;
\end_layout

\begin_layout Plain Layout

if (Z > 1) configuration.FFTdim++;
\end_layout

\begin_layout Plain Layout

configuration.numberBatches = B;
\end_layout

\begin_layout Plain Layout

configuration.performR2C = R2C;
\end_layout

\begin_layout Plain Layout

configuration.performDCT = DCT;
\end_layout

\begin_layout Plain Layout

if (P == 1) configuration.doublePrecision = 1; 
\end_layout

\begin_layout Plain Layout

if (P == 2) configuration.halfPrecision = 1;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = 0;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

if (R2C) {
\end_layout

\begin_layout Plain Layout

	bufferSize = (uint64_t)(storageComplexSize / 2) * (configuration.size[0]
 + 2) * configuration.size[1] * configuration.size[2] * configuration.numberBatches
;
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

else {
\end_layout

\begin_layout Plain Layout

	if (DCT) {
\end_layout

\begin_layout Plain Layout

		bufferSize = (uint64_t)(storageComplexSize / 2) * configuration.size[0]
 * configuration.size[1] * configuration.size[2] * configuration.numberBatches;
\end_layout

\begin_layout Plain Layout

	}
\end_layout

\begin_layout Plain Layout

	else {
\end_layout

\begin_layout Plain Layout

		bufferSize = (uint64_t)storageComplexSize * configuration.size[0] * configurati
on.size[1] * configuration.size[2] * configuration.numberBatches;
\end_layout

\begin_layout Plain Layout

	}
\end_layout

\begin_layout Plain Layout

} // storageComplexSize - 4/8/16 for FP16/FP32/FP64 respectively.
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Device management + code submission - code is identical to the previous
 example
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Advanced FFT application example: out-of-place R2C FFT with custom strides
\end_layout

\begin_layout Standard
In this example, VkFFT is configured to calculate a 3D out-of-place R2C
 FFT of a system with custom strides.
 VkFFT reads data from the inputBuffer and produces the result in the buffer.
 
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.FFTdim = 3; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

configuration.size[0] = Nx;
\end_layout

\begin_layout Plain Layout

configuration.size[1] = Ny;
\end_layout

\begin_layout Plain Layout

configuration.size[2] = Nz;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.performR2C = 1;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//out-of-place - we need to specify that input buffer is separate from the
 main buffer
\end_layout

\begin_layout Plain Layout

configuration.isInputFormatted = 1;
\end_layout

\begin_layout Plain Layout

configuration.inputBufferStride[0] = configuration.size[0];
\end_layout

\begin_layout Plain Layout

configuration.inputBufferStride[1] = configuration.inputBufferStride[0] *
 configuration.size[1];
\end_layout

\begin_layout Plain Layout

configuration.inputBufferStride[2] = configuration.inputBufferStride[1] *
 configuration.size[2];
\end_layout

\begin_layout Plain Layout

			
\end_layout

\begin_layout Plain Layout

configuration.bufferStride[0] = (uint64_t) (configuration.size[0] / 2) + 1;
\end_layout

\begin_layout Plain Layout

configuration.bufferStride[1] = configuration.bufferStride[0] * configuration.size[
1];
\end_layout

\begin_layout Plain Layout

configuration.bufferStride[2] = configuration.bufferStride[1]* configuration.size[2
];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t inputBufferSize = (uint64_t)sizeof(float) * configuration.size[0]
 * configuration.size[1] * configuration.size[2];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * (configuration.size[0]/2+1)
 * configuration.size[1] * configuration.size[2];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Device management + code submission - code is identical to the first example,
 except that you need to allocate two buffers (and provide them in the launch
 configuration).
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Advanced FFT application example: 3D zero-padded FFT
\end_layout

\begin_layout Standard
In this example, VkFFT is configured to calculate a 3D FFT of a system.
 The meaningful data is located in the first octant of the buffer, the rest
 is padded with zeros.
 This configuration removes the circular part of the convolution and allows
 modelling of open systems.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.FFTdim = 3; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

configuration.size[0] = Nx;
\end_layout

\begin_layout Plain Layout

configuration.size[1] = Ny;
\end_layout

\begin_layout Plain Layout

configuration.size[2] = Nz;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

configuration.performZeropadding[0] = 1; //Perform padding with zeros on
 GPU.
 Still need to properly align input data (no need to fill padding area with
 meaningful data) but this will increase performance due to the lower amount
 of the memory reads/writes and omitting sequences only consisting of zeros.
\end_layout

\begin_layout Plain Layout

configuration.performZeropadding[1] = 1;
\end_layout

\begin_layout Plain Layout

configuration.performZeropadding[2] = 1;
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_left[0] = (uint64_t)ceil(configuration.size[0] /
 2.0);
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_right[0] = configuration.size[0];
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_left[1] = (uint64_t)ceil(configuration.size[1] /
 2.0);
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_right[1] = configuration.size[1];
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_left[2] = (uint64_t)ceil(configuration.size[2] /
 2.0);
\end_layout

\begin_layout Plain Layout

configuration.fft_zeropad_right[2] = configuration.size[2];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = (uint64_t)storageComplexSize * configuration.size[0]
 * configuration.size[1] * configuration.size[2];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Device management + code submission - code is identical to the first example
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Convolution application example: 3x3 matrix-vector convolution in 1D
\end_layout

\begin_layout Standard
In this example, VkFFT is configured to convolve a kernel, represented by
 a 3x3 matrix, with a system, represented by a 3D vector.
 Their convolution is a matrix-vector multiplication in the frequency domain.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application, we need two of each -
 one for kernel calculation and one for the convolution itself
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration kernel_configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration convolution_configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app_kernel = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app_convolution = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

kernel_configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

kernel_configuration.size[0] = Nx; //FFT size
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernel_configuration.size[0];
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//configure kernel 
\end_layout

\begin_layout Plain Layout

kernel_configuration.kernelConvolution = 1; //specify if this plan is used
 to create kernel for convolution
\end_layout

\begin_layout Plain Layout

kernel_configuration.coordinateFeatures = 9; //Specify dimensionality of
 the input feature vector (default 1).
 Each component is stored not as a vector, but as a separate system and
 padded on its own according to other options (i.e.
 for x*y system of 3-vector, first x*y elements correspond to the first
 dimension, then goes x*y for the second, etc).
\end_layout

\begin_layout Plain Layout

//The coordinateFeatures number is an important constant for convolution.
 If we perform a 1x1 convolution, it is equal to the number of features,
 but matrixConvolution should be equal to 1.
 For a matrix convolution, it must be equal to the matrixConvolution parameter
 in the convolution plan, while in the kernel plan it is the number of stored
 kernel matrix elements.
 For a 2x2 convolution, that is 3 for a symmetric kernel (stored as xx, xy,
 yy) and 4 for a nonsymmetric one (stored as xx, xy, yx, yy).
 Similarly, it is 6 (stored as xx, xy, xz, yy, yz, zz) or 9 (stored as xx,
 xy, xz, yx, yy, yz, zx, zy, zz) for 3x3 convolutions.
 
\end_layout

\begin_layout Plain Layout

kernel_configuration.normalize = 1;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Initialize app_kernel and perform a single forward FFT like in examples
 before.
 You pass kernel as a buffer for the preparation stage.
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

convolution_configuration = kernel_configuration;
\end_layout

\begin_layout Plain Layout

convolution_configuration.kernelConvolution = 0;
\end_layout

\begin_layout Plain Layout

convolution_configuration.performConvolution = 1;
\end_layout

\begin_layout Plain Layout

convolution_configuration.symmetricKernel = 0;//Specify if convolution kernel
 is symmetric.
 If it is (symmetricKernel = 1), we only pass the upper triangle part of
 it in the form of: (xx, xy, yy) for 2x2 and (xx, xy, xz, yy, yz, zz) for
 3x3 kernels.
\end_layout

\begin_layout Plain Layout

convolution_configuration.matrixConvolution = 3;//we do matrix convolution,
 so kernel is 9 numbers (3x3), but vector dimension is 3
\end_layout

\begin_layout Plain Layout

convolution_configuration.coordinateFeatures = 3;//equal to matrixConvolution
 size
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Initialize app_convolution and perform a single forward FFT like in examples
 before.
 You pass kernel as kernel and system to be convolved with it as buffer
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Convolution application example: R2C cross-correlation between two sets
 of N images
\end_layout

\begin_layout Standard
In this example, VkFFT is configured to cross-correlate a kernel, represented
 by three 2D arrays (the RGB channels of an image), with a system, also
 represented by three 2D arrays.
 There are N kernels and N systems.
 Their cross-correlation is a conjugate convolution in the frequency domain.
 Images are usually stored as real, not complex numbers, so the code uses
 the R2C optimization as well.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

//zero-initialize configuration + FFT application, we need two of each -
 one for kernel calculation and one for the convolution itself
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration kernel_configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration convolution_configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app_kernel = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app_convolution = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

kernel_configuration.FFTdim = 2; //FFT dimension, 1D, 2D or 3D
\end_layout

\begin_layout Plain Layout

kernel_configuration.size[0] = Nx;
\end_layout

\begin_layout Plain Layout

kernel_configuration.size[1] = Ny; 
\end_layout

\begin_layout Plain Layout

kernel_configuration.coordinateFeatures = 3;
\end_layout

\begin_layout Plain Layout

kernel_configuration.numberBatches = N;
\end_layout

\begin_layout Plain Layout

kernel_configuration.performR2C = 1;
\end_layout

\begin_layout Plain Layout

kernel_configuration.normalize = 1;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * (kernel_configuration.size[0]
/2+1) * kernel_configuration.size[1] * kernel_configuration.coordinateFeatures
 * kernel_configuration.numberBatches; 
\end_layout

\begin_layout Plain Layout

	
\end_layout

\begin_layout Plain Layout

kernel_configuration.kernelConvolution = 1; //specify if this plan is used
 to create kernel for convolution
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Initialize app_kernel and perform a single forward FFT like in examples
 before.
 Pad in-place R2C system like this:
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

for (uint64_t n = 0; n < kernel_configuration.numberBatches; n++) {
\end_layout

\begin_layout Plain Layout

	for (uint64_t c = 0; c < kernel_configuration.coordinateFeatures; c++) {
\end_layout

\begin_layout Plain Layout

		for (uint64_t j = 0; j < kernel_configuration.size[1]; j++) {
\end_layout

\begin_layout Plain Layout

			for (uint64_t i = 0; i < kernel_configuration.size[0]; i++) {
\end_layout

\begin_layout Plain Layout

				kernel_padded_GPU[i + j * 2 * (kernel_configuration.size[0]/2 + 1) +
 c * 2 * (kernel_configuration.size[0]/2 + 1) * kernel_configuration.size[1]
 + n * 2 * (kernel_configuration.size[0]/2 + 1) * kernel_configuration.size[1]
 * kernel_configuration.coordinateFeatures] = kernel_input[i + j * kernel_configu
ration.size[0] + c * kernel_configuration.size[0] * kernel_configuration.size[1]
 + n * kernel_configuration.size[0] * kernel_configuration.size[1] * kernel_config
uration.coordinateFeatures];
\end_layout

\begin_layout Plain Layout

			}
\end_layout

\begin_layout Plain Layout

		}
\end_layout

\begin_layout Plain Layout

	}
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

convolution_configuration = kernel_configuration;
\end_layout

\begin_layout Plain Layout

convolution_configuration.kernelConvolution = 0;
\end_layout

\begin_layout Plain Layout

convolution_configuration.performConvolution = 1;
\end_layout

\begin_layout Plain Layout

convolution_configuration.conjugateConvolution = 1;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//Initialize app_convolution and perform a single forward FFT like in examples
 before.
 Pad the system in the same way as the kernel
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed}
\end_layout

\end_inset


\end_layout

\begin_layout Subsection
Simple FFT application binary reuse example
\end_layout

\begin_layout Standard
This example shows how to save/load binaries generated by VkFFT.
 This can reduce the time taken by the initializeVkFFT call by removing
 the run-time compilation (RTC) step from it.
 Make sure that the rest of the configuration stays the same when reusing
 the binary.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
begin{mdframed}[backgroundcolor=bg]
\end_layout

\begin_layout Plain Layout


\backslash
begin{minted}[tabsize=4,obeytabs,breaklines]{C}
\end_layout

\begin_layout Plain Layout

VkFFTConfiguration configuration = {};
\end_layout

\begin_layout Plain Layout

VkFFTApplication app = {};
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//configuration is initialized like in other examples
\end_layout

\begin_layout Plain Layout

configuration.saveApplicationToString = 1;
\end_layout

\begin_layout Plain Layout

//configuration.loadApplicationFromString = 1; //choose one to save/load
 binary file
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

if (configuration.loadApplicationFromString) {
\end_layout

\begin_layout Plain Layout

	FILE* kernelCache;
\end_layout

\begin_layout Plain Layout

	uint64_t str_len;
\end_layout

\begin_layout Plain Layout

#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
\end_layout

\begin_layout Plain Layout

	kernelCache = fopen("VkFFT_binary", "rb"); //Vulkan and HIP backends load
 data as a uint32_t sequence
\end_layout

\begin_layout Plain Layout

#else
\end_layout

\begin_layout Plain Layout

	kernelCache = fopen("VkFFT_binary", "r"); 
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

	fseek(kernelCache, 0, SEEK_END);
\end_layout

\begin_layout Plain Layout

	str_len = ftell(kernelCache);
\end_layout

\begin_layout Plain Layout

	fseek(kernelCache, 0, SEEK_SET);
\end_layout

\begin_layout Plain Layout

	configuration.loadApplicationString = malloc(str_len);
\end_layout

\begin_layout Plain Layout

	fread(configuration.loadApplicationString, str_len, 1, kernelCache);
\end_layout

\begin_layout Plain Layout

	fclose(kernelCache);
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

resFFT = initializeVkFFT(&app, configuration);
\end_layout

\begin_layout Plain Layout

if (resFFT != VKFFT_SUCCESS) return resFFT;
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

if (configuration.loadApplicationFromString)
\end_layout

\begin_layout Plain Layout

	free(configuration.loadApplicationString);
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

if (configuration.saveApplicationToString) {
\end_layout

\begin_layout Plain Layout

	FILE* kernelCache;
\end_layout

\begin_layout Plain Layout

#if((VKFFT_BACKEND==0) || (VKFFT_BACKEND==2) || (VKFFT_BACKEND==4))
\end_layout

\begin_layout Plain Layout

	kernelCache = fopen("VkFFT_binary", "wb"); //Vulkan and HIP backends save
 data as a uint32_t sequence
\end_layout

\begin_layout Plain Layout

#else
\end_layout

\begin_layout Plain Layout

	kernelCache = fopen("VkFFT_binary", "w"); 
\end_layout

\begin_layout Plain Layout

#endif
\end_layout

\begin_layout Plain Layout

	fwrite(app.saveApplicationString, app.applicationStringSize, 1, kernelCache);
\end_layout

\begin_layout Plain Layout

	fclose(kernelCache);
\end_layout

\begin_layout Plain Layout

}
\end_layout

\begin_layout Plain Layout

\end_layout

\begin_layout Plain Layout

//application is launched like in other examples
\end_layout

\begin_layout Plain Layout


\backslash
end{minted}
\end_layout

\begin_layout Plain Layout


\backslash
end{mdframed} 
\end_layout

\end_inset


\end_layout

\end_body
\end_document