Compare commits

...

20 Commits

Author SHA1 Message Date
mazx 89398a1c57 update fused kernels. 2022-11-15 15:13:50 +08:00
mazx 6e3f0bbf9a update reduce kernel 2022-11-02 20:02:22 +08:00
mazx 2aadcb6e9c add: onnx ok. 2022-11-02 17:49:37 +08:00
mazx 3046dd5901 add: graph for bert. 2022-11-02 17:49:37 +08:00
mazx 5ed540be6e add: optimization pass for metaGraph. 2022-11-02 17:49:37 +08:00
mazx ec58c85505 add: add metaGaph for codegen. 2022-11-02 17:49:37 +08:00
mazx 18b79903ee add: codegen for all metaOps. 2022-11-02 17:49:37 +08:00
mazx 2c8bd3729b add: generate transpose, unary, and binary. 2022-11-02 17:49:37 +08:00
mazx 254d23b3c0 Add: transpose operator 2022-11-02 17:49:37 +08:00
mazx 05d39439db add: init power fusion. 2022-11-02 17:49:37 +08:00
mazx 48e986d377 add: fix bert to transpose. 2022-11-02 17:49:37 +08:00
mazx 09365c81f4 add: graph build for pf. 2022-11-02 17:49:37 +08:00
Pairshoe 94faefb0ef Add: pytest for import_onnx 2022-11-02 17:49:37 +08:00
Pairshoe 970c77d0f4 Update: Rename GraphFactory -> GraphBuilder && Remove unnecessary outputs 2022-11-02 17:49:37 +08:00
Pairshoe 7cf2d8f78f Add: python interfaced for importing onnx 2022-11-02 17:49:37 +08:00
Pairshoe ff90c4c7d5 Add: test for class GraphFactoryObj 2022-11-02 17:49:37 +08:00
Pairshoe 9e45e51279 Add: class GraphFactory and pybind11 interfaces 2022-11-02 17:49:37 +08:00
Liyan Zheng 1b1fc2585b Add: save optime result 2022-11-02 17:38:08 +08:00
Liyan Zheng eb993f7829 Add: evaluate onnx script 2022-11-02 16:51:33 +08:00
Liyan Zheng 63e5df4227 Add: fused conv 2022-11-02 16:39:12 +08:00
85 changed files with 6129 additions and 53 deletions

CMakeLists.txt
View File

@ -12,10 +12,11 @@ option(BUILD_TEST "Build tests" ON)
cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
cmake_dependent_option(BUILD_TEST_PFUSION "Build tests for PFUSION" ON BUILD_TEST OFF)
set(DEFAULT_BUILD_TYPE "RelWithDebInfo")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")
@ -74,7 +75,7 @@ if(BUILD_TEST)
endif()
# Source files
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/pfusion/*.cc src/operators/*.cc src/utils/*.cc)
if(USE_CUDA)
file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)
@ -114,7 +115,7 @@ if(USE_CUDA)
${CMAKE_CXX_COMPILER}
CACHE STRING "Set cuda host compiler path")
# CMP0104 requires CUDA_ARCHITECTURES
set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70;80")
set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70")
enable_language(CUDA)
find_package(CUDAToolkit) # For nvrtc and cuda driver
target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
@ -196,4 +197,7 @@ if(BUILD_TEST)
if(BUILD_TEST_EINNET)
build_test(test/nnet/test_*.cc)
endif()
if(BUILD_TEST_PFUSION)
build_test(test/pfusion/test_*.cc)
endif()
endif()

89 eval_pfusion/cuda_utils.h Normal file
View File

@ -0,0 +1,89 @@
#pragma once
#include <iostream>
#include <cublas.h>
#include <curand.h>
#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaSafeCall() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
return;
}
inline void __cudaCheckError(const char *file, const int line) {
auto err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
#ifdef DEBUG
// This checking will affect performance.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() with sync failed. "
<< cudaGetErrorString(err) << std::endl;
exit(-1);
}
#endif
return;
}
inline const char *cublasGetErrorString(cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
const int line) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": cublasSafeCall() failed. " << cublasGetErrorString(err)
<< std::endl;
exit(-1);
}
}
inline void __curandSafeCall(curandStatus_t err, const char *file,
const int line) {
if (err != CURAND_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": curandSafeCall() failed. " << err << std::endl;
exit(-1);
}
}
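A minimal usage sketch for these helpers (the kernel, names, and sizes below are illustrative, not part of this commit): wrap runtime API calls in cudaSafeCall and follow each kernel launch with cudaCheckError.

#include "cuda_utils.h"
__global__ void scale(float *p, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        p[i] *= s;
}
int main() {
    const int n = 1 << 20;
    float *p;
    cudaSafeCall(cudaMalloc((void **)&p, n * sizeof(float)));
    scale<<<(n + 255) / 256, 256>>>(p, 2.0f, n);
    cudaCheckError(); // catches launch errors; also syncs when DEBUG is defined
    cudaSafeCall(cudaFree(p));
    return 0;
}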

View File

@ -0,0 +1,21 @@
import os
def eval(filename, kernel, shape):
with open("../eval_pfusion/eval_kernel.tmp", "r") as f:
code = f.read()
code = code.replace("%%invoke_func%%", kernel)
code = code.replace("%%shape%%", shape)
with open("../generated_code/tmp.cu", "w") as f:
f.write(code)
# os.system("make -j && ./test_bias")
os.system(
"nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
os.system("./tmp")
if __name__ == "__main__":
eval("bias_0.cu", "invoke_func_0", "{28 * 28, 24}")
eval("bias_1.cu", "invoke_func_1", "{28 * 28, 58}")
eval("bias_2.cu", "invoke_func_2", "{14 * 14, 116}")
eval("bias_3.cu", "invoke_func_3", "{7 * 7, 232}")

41 eval_pfusion/eval_kernel.tmp Normal file
View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void %%invoke_func%%(float *tensor_ptr_0, float *tensor_ptr_1,
float *tensor_ptr_2);
int main() {
std::vector<int> shape = %%shape%%;
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
// double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
// std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}
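Each harness launches the kernel cnt times untimed to warm up, then times cnt more launches between two CUDA events and divides by cnt for the per-launch average. The same pattern as a reusable helper (a sketch; the helper name is mine, not from this commit):

#include <cuda_runtime.h>
template <class F> float timePerLaunchMs(F launch, int cnt = 128) {
    for (int t = 0; t < cnt; t++)
        launch(); // warm-up launches, excluded from the measurement
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++)
        launch();
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(ed); // wait until the timed launches finish
    float ms = 0;
    cudaEventElapsedTime(&ms, st, ed);
    cudaEventDestroy(st);
    cudaEventDestroy(ed);
    return ms / cnt; // average milliseconds per launch
}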

View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4);
int main() {
std::vector<int> shape = {1, 64, 512, 512};
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_2(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_2(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

View File

@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_5(float *tensor_ptr_2, float *tensor_ptr_3,
float *tensor_ptr_4);
int main() {
std::vector<int> shape = {1, 1, 512, 512};
float *t0, *t1, *t2;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_5(t0, t1, t2);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_5(t0, t1, t2);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

8 eval_pfusion/eval_sardrn.sh Executable file
View File

@ -0,0 +1,8 @@
# rm ./eval_sar_drn_0 ./eval_sar_drn_1 ./eval_sar_drn_2
make -j && ./test_sar_drn
nvcc ../eval_pfusion/eval_sar_drn_0.cu ../generated_code/sar_drn_0.cu -I ../eval_pfusion -o eval_sar_drn_0
nvcc ../eval_pfusion/eval_sar_drn_1.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_1
# nvcc ../eval_pfusion/eval_sar_drn_2.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_2
./eval_sar_drn_0
./eval_sar_drn_1
# ./eval_sar_drn_2

84 eval_pfusion/eval_transpose.cu Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_0(float *src, float *dst);
int main() {
std::vector<int> shape = {31, 32, 32, 33};
std::vector<int> perm = {2, 0, 3, 1};
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(4), stride_dst(4);
stride_dst[0] = 1;
for (int i = 1; i < 4; i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
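// perm[j] names the source dimension feeding output dimension j; the double
// loop above derives the source strides, with dimension 0 fastest-varying.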
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
invoke_func_0(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < 4; j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_0(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_0(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

View File

@ -0,0 +1,21 @@
import os
def eval(filename, kernel, shape, perm):
with open("../eval_pfusion/eval_transpose.tmp", "r") as f:
code = f.read()
code = code.replace("%%invoke_func%%", kernel)
code = code.replace("%%shape%%", shape)
code = code.replace("%%perm%%", perm)
with open("../generated_code/tmp.cu", "w") as f:
f.write(code)
# os.system("make -j && ./test_bias")
os.system(
"nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
os.system("./tmp")
if __name__ == "__main__":
eval("transpose_0.cu", "invoke_func_0", "{28 * 28, 58, 2}", "{0, 2, 1}")
eval("transpose_1.cu", "invoke_func_1", "{14 * 14, 116, 2}", "{0, 2, 1}")
eval("transpose_2.cu", "invoke_func_2", "{7 * 7, 232, 2}", "{0, 2, 1}")

4 eval_pfusion/eval_transpose.sh Executable file
View File

@ -0,0 +1,4 @@
rm ./eval_transpose
make -j && ./test_transpose
nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
./eval_transpose

84 eval_pfusion/eval_transpose.tmp Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void %%invoke_func%%(float *src, float *dst);
int main() {
std::vector<int> shape = %%shape%%;
std::vector<int> perm = %%perm%%;
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
stride_dst[0] = 1;
for (int i = 1; i < shape.size(); i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < shape.size(); i++) {
for (int j = 0; j < shape.size(); j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
%%invoke_func%%(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < shape.size(); j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
%%invoke_func%%(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

148 generated_code/bert_0.cu Normal file
View File

@ -0,0 +1,148 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
float *tensor_ptr_4) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 15627264; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 15627264 * 32;
offset_src_buf /= 15627264;
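// NOTE: offsets are computed here, but no loads or stores follow; the body
// of this fused kernel (and of kernel_func_7 below) is empty in this revision.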
}
}
// Kernel
__global__ void kernel_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
float *tensor_ptr_6, float *tensor_ptr_7,
float *tensor_ptr_8) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[48];
for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 256 * 256;
offset_src_buf /= 256;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_7[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_8[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
// Kernel
__global__ void kernel_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
float *tensor_ptr_10) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[48];
for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 128 * 512;
offset_src_buf /= 128;
}
}
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
float *tensor_ptr_12) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 256 * 256;
offset_src_buf /= 256;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
float *tensor_ptr_4) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_3, tensor_ptr_2,
tensor_ptr_4);
cudaCheckError();
}
void invoke_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
float *tensor_ptr_6, float *tensor_ptr_7,
float *tensor_ptr_8) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_6<<<gridDim, blockDim>>>(
tensor_ptr_4, tensor_ptr_5, tensor_ptr_6, tensor_ptr_7, tensor_ptr_8);
cudaCheckError();
}
void invoke_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
float *tensor_ptr_10) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_7<<<gridDim, blockDim>>>(tensor_ptr_8, tensor_ptr_9,
tensor_ptr_10);
cudaCheckError();
}
void invoke_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
float *tensor_ptr_12) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_10, tensor_ptr_11,
tensor_ptr_12);
cudaCheckError();
}

42 generated_code/bias_0.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 144; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 24 * 784;
offset_input_buf /= 24;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 24 * 24;
offset_bias_buf /= 24;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_0(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_0<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}
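The generated kernels flatten the iteration space into a single loop_idx and recover each tensor's offset by repeated mod/div against its layout. A host-side mirror of the index math in kernel_func_0 above (illustrative only):

#include <cstdio>
int main() {
    for (int loop_idx = 0; loop_idx < 144; loop_idx++) {
        int b = loop_idx;
        int offset_input = b % 7 * 128; // 128-float chunk within the 28*28 plane
        b /= 7;
        offset_input += b % 24 * 784; // channel index times the 28*28 plane stride
        if (loop_idx < 3)
            printf("loop_idx=%d -> offset_input=%d\n", loop_idx, offset_input);
    }
    return 0;
}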

42 generated_code/bias_1.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 348; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 58 * 784;
offset_input_buf /= 58;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 58 * 58;
offset_bias_buf /= 58;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_1(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_1<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

42 generated_code/bias_2.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 116; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 2 * 128;
offset_input_buf /= 2;
offset_input += offset_input_buf % 116 * 196;
offset_input_buf /= 116;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 116 * 116;
offset_bias_buf /= 116;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_2(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_2<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

42 generated_code/bias_3.cu Normal file
View File

@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_3(float *input, float *bias, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[5]; // buf[0..3] hold the data lanes; buf[4] holds the broadcast bias
for (int loop_idx = parallel_idx; loop_idx < 0; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 1 * 128;
offset_input_buf /= 1;
offset_input += offset_input_buf % 232 * 49;
offset_input_buf /= 232;
int offset_bias = 0;
int offset_bias_buf = loop_idx;
offset_bias += offset_bias_buf % 232 * 232;
offset_bias_buf /= 232;
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[4] = bias[0 + offset_bias];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] = buf[inst_idx] + buf[4];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
}
}
}
void invoke_func_3(float *input, float *bias, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_3<<<gridDim, blockDim>>>(input, bias, output);
cudaCheckError();
}

View File

@ -0,0 +1,20 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 2 + warp_id;
float buf[24];
for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 128 * 512;
offset_src_buf /= 128;
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(64, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}

View File

@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
float *tensor_ptr_5) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 65536 * 256;
offset_src_buf /= 65536;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
float *tensor_ptr_5) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_4,
tensor_ptr_5);
cudaCheckError();
}

View File

@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
float *tensor_ptr_12) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 1024 * 256;
offset_src_buf /= 1024;
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] =
tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 8] =
tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
buf[inst_idx + 16];
}
}
}
void invoke_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
float *tensor_ptr_12) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_11,
tensor_ptr_12);
cudaCheckError();
}

24 generated_code/test.cu Normal file
View File

@ -0,0 +1,24 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 0 + warp_id;
float buf[0];
__shared__ float smem[0];
for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 0) {
if (lane_id < 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id] =
inst_idx;
}
}
}
}
void invoke_func(float *src, float *dst) {
dim3 gridDim(0, 1);
dim3 blockDim(0, 1);
kernel_func<<<gridDim, blockDim>>>(src, dst);
cudaCheckError();
}

84 generated_code/tmp.cu Normal file
View File

@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"
#include <vector>
void invoke_func_2(float *src, float *dst);
int main() {
std::vector<int> shape = {7 * 7, 232, 2};
std::vector<int> perm = {0, 2, 1};
float *src, *dst;
size_t size = 1;
for (auto x : shape) {
size *= x;
}
std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
stride_dst[0] = 1;
for (int i = 1; i < shape.size(); i++) {
stride_dst[i] = stride_dst[i-1] * shape[i-1];
}
size_t this_stride = 1;
for (int i = 0; i < shape.size(); i++) {
for (int j = 0; j < shape.size(); j++) {
if (perm[j] == i) {
stride_src[i] = this_stride;
this_stride *= shape[j];
}
}
}
cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));
float *src_host, *dst_host;
src_host = (float *)malloc(size * sizeof(float));
dst_host = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; i++) {
src_host[i] = i;
}
cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
invoke_func_2(src, dst);
cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
bool flag = 0;
for (size_t i = 0; i < size; i++) {
size_t base = i;
size_t offset_src = 0;
for (int j = 0; j < shape.size(); j++) {
offset_src += base % shape[j] * stride_src[perm[j]];
base /= shape[j];
}
if (dst_host[i] != src_host[offset_src]) {
flag = 1;
std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
break;
}
}
if (!flag) {
std::cout << "[INFO] transpose correct." << std::endl;
} else {
std::cout << "[ERROR] transpose incorrect." << std::endl;
}
float duration = 0;
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
int cnt = 128;
for (int t = 0; t < cnt; t++) {
invoke_func_2(src, dst);
}
cudaEventRecord(st, 0);
for (int t = 0; t < cnt; t++) {
invoke_func_2(src, dst);
}
cudaEventRecord(ed, 0);
cudaEventSynchronize(st);
cudaEventSynchronize(ed);
cudaEventElapsedTime(&duration, st, ed);
std::cout << "[INFO] time: " << duration / cnt << std::endl;
double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
std::cout << "[Exit] successful." << std::endl;
}

52 generated_code/transpose.cu Normal file
View File

@ -0,0 +1,52 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 8 + warp_id;
float buf[32];
__shared__ float smem[8448];
for (int loop_idx = parallel_idx; loop_idx < 1056; loop_idx += 864) {
int offset_src = 0;
int offset_src_buf = loop_idx;
offset_src += offset_src_buf % 32 * 32736;
offset_src_buf /= 32;
offset_src += offset_src_buf % 33 * 33;
offset_src_buf /= 33;
int offset_dst = 0;
int offset_dst_buf = loop_idx;
offset_dst += offset_dst_buf % 32 * 1024;
offset_dst_buf /= 32;
offset_dst += offset_dst_buf % 33 * 33792;
offset_dst_buf /= 33;
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
buf[inst_idx] =
tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
}
#pragma unroll
for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
smem[warp_id * 32 * 33 + inst_idx * 33 + lane_id] = buf[inst_idx];
}
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
buf[inst_idx] =
smem[warp_id * 32 * 33 + lane_id * 33 + inst_idx];
}
}
if (lane_id < 31) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
buf[inst_idx];
}
}
}
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
dim3 gridDim(108, 1);
dim3 blockDim(256, 1);
kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
cudaCheckError();
}
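The 33-column pitch of the shared-memory tile here (32 x 33 floats per warp, 8 warps per 256-thread block = 8448 floats) is the standard padding trick for transposes: offsetting each row by one element keeps the column-wise reads in distinct shared-memory banks.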

56 generated_code/transpose_0.cu Normal file
View File

@ -0,0 +1,56 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 812; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 7 * 128;
offset_input_buf /= 7;
offset_input += offset_input_buf % 58 * 1568;
offset_input_buf /= 58;
offset_input += offset_input_buf % 2 * 784;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 7 * 128;
offset_output_buf /= 7;
offset_output += offset_output_buf % 58 * 784;
offset_output_buf /= 58;
offset_output += offset_output_buf % 2 * 45472;
offset_output_buf /= 2;
if (loop_idx % 7 == 6) {
if (lane_id < 16) {
buf[0] = input[0 + offset_input + 0 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 7 == 6) {
if (lane_id < 16) {
output[0 + offset_output + 0 * 32 + lane_id] = buf[0];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_0(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_0<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

66 generated_code/transpose_1.cu Normal file
View File

@ -0,0 +1,66 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 2 * 128;
offset_input_buf /= 2;
offset_input += offset_input_buf % 116 * 392;
offset_input_buf /= 116;
offset_input += offset_input_buf % 2 * 196;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 2 * 128;
offset_output_buf /= 2;
offset_output += offset_output_buf % 116 * 196;
offset_output_buf /= 116;
offset_output += offset_output_buf % 2 * 22736;
offset_output_buf /= 2;
if (loop_idx % 2 == 1) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
if (lane_id < 4) {
buf[2] = input[0 + offset_input + 2 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 2 == 1) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
if (lane_id < 4) {
output[0 + offset_output + 2 * 32 + lane_id] = buf[2];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_1(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_1<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

62 generated_code/transpose_2.cu Normal file
View File

@ -0,0 +1,62 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *output) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float buf[4];
for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
int offset_input = 0;
int offset_input_buf = loop_idx;
offset_input += offset_input_buf % 232 * 98;
offset_input_buf /= 232;
offset_input += offset_input_buf % 2 * 49;
offset_input_buf /= 2;
int offset_output = 0;
int offset_output_buf = loop_idx;
offset_output += offset_output_buf % 232 * 49;
offset_output_buf /= 232;
offset_output += offset_output_buf % 2 * 11368;
offset_output_buf /= 2;
if (loop_idx % 1 == 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
if (lane_id < 17) {
buf[1] = input[0 + offset_input + 1 * 32 + lane_id];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
buf[inst_idx] =
input[0 + offset_input + inst_idx * 32 + lane_id];
}
}
// test
if (loop_idx % 1 == 0) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
if (lane_id < 17) {
output[0 + offset_output + 1 * 32 + lane_id] = buf[1];
}
} else {
#pragma unroll
for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
output[0 + offset_output + inst_idx * 32 + lane_id] =
buf[inst_idx];
}
}
// test
}
}
void invoke_func_2(float *input, float *output) {
dim3 gridDim(80, 1);
dim3 blockDim(128, 1);
kernel_func_2<<<gridDim, blockDim>>>(input, output);
cudaCheckError();
}

View File

@ -0,0 +1,169 @@
#pragma once
#include "core/common.h"
#include "core/graph.h"
#include "core/operator.h"
#include "core/tensor.h"
#include "operators/G2BMM.h"
#include "operators/GBMM.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
#include "operators/reshape.h"
#include "operators/slice.h"
#include "operators/split.h"
#include "operators/transpose.h"
#include "operators/unary.h"
namespace infini {
class GraphBuilderObj {
private:
Graph g;
public:
GraphBuilderObj(Runtime runtime) : g(make_ref<GraphObj>(runtime)) {}
// tensors
Tensor tensor(Shape dim, const std::string &dtype);
// operators
// conv op
Operator conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
int sw = 1, int dh = 1, int dw = 1, Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, Tensor bias = nullptr);
Operator conv(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
Tensor bias = nullptr);
// matmul op
Operator matmul(Tensor A, Tensor B, Tensor C, bool transA = false,
bool transB = false);
Operator matmul(Tensor A, Tensor B, bool transA = false,
bool transB = false);
// conv trans op
Operator convTrans(Tensor input, Tensor weight, Tensor output, int ph,
int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
int oph = 0, int opw = 0, int group = 1,
Tensor bias = nullptr, ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
int sw = 1, int dh = 1, int dw = 1, int oph = 0,
int opw = 0, int group = 1, Tensor bias = nullptr,
ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
int dh = 1, int dw = 1, int oph = 0, int opw = 0,
int group = 1, Tensor bias = nullptr,
ActType act = ActType::None);
Operator convTrans(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
int sh = 1, int sw = 1, int dh = 1, int dw = 1,
int oph = 0, int opw = 0, int group = 1,
Tensor bias = nullptr, ActType act = ActType::None);
// g2bmm op
Operator g2bmm(Tensor A, Tensor B, Tensor C, const int width,
const int dilation, Tensor bias = nullptr,
ActType act = ActType::None);
Operator g2bmm(Tensor A, Tensor B, const int width, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
// gbmm-like op
Operator gbmml(Tensor A, Tensor B, Tensor C, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
Operator gbmml(Tensor A, Tensor B, const int dilation,
Tensor bias = nullptr, ActType act = ActType::None);
// pad op
Operator pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<const vector<int>> &axis);
Operator pad(Tensor input, const vector<int> &pads,
const optional<const vector<int>> &axis);
// slice op
Operator slice(Tensor input, Tensor output, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps);
Operator slice(Tensor input, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps);
// concat op
Operator concat(TensorVec inputs, Tensor output, int dim);
Operator concat(TensorVec inputs, int dim);
// split op
Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
int num);
Operator split(Tensor input, int dim, int num);
Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
const vector<int> &ratio);
Operator split(Tensor input, int dim, const vector<int> &ratio);
// transpose op
Operator transpose(Tensor input, Tensor output, const Shape &perm);
// TODO
// extend op
Operator extend(Tensor input, Tensor output, int dim, int num);
Operator extend(Tensor input, int dim, int num);
// max pool op
Operator maxpool(Tensor input, Tensor output, int kh, int kw, int dh,
int dw, int ph, int pw, int sh, int sw);
Operator maxpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
int pw, int sh, int sw);
// average pool op
Operator avgpool(Tensor input, Tensor output, int kh, int kw, int dh,
int dw, int ph, int pw, int sh, int sw);
Operator avgpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
int pw, int sh, int sw);
// element wise op
Operator add(Tensor input0, Tensor input1, Tensor output);
Operator add(Tensor input0, Tensor input1);
Operator sub(Tensor input0, Tensor input1, Tensor output);
Operator sub(Tensor input0, Tensor input1);
Operator mul(Tensor input0, Tensor input1, Tensor output);
Operator mul(Tensor input0, Tensor input1);
Operator div(Tensor input0, Tensor input1, Tensor output);
Operator div(Tensor input0, Tensor input1);
Operator pow(Tensor input0, Tensor input1, Tensor output);
Operator pow(Tensor input0, Tensor input1);
// gather op
Operator gather(Tensor input, Tensor index, Tensor output, int axis);
Operator gather(Tensor input, Tensor index, int axis);
// reduce mean op
// TODO
// reshape op
Operator reshape(Tensor input, Tensor output, const Shape &dims);
Operator reshape(Tensor input, const Shape &dims);
Operator flatten(Tensor input, Tensor output);
Operator flatten(Tensor input);
Operator identity(Tensor input, Tensor output);
Operator identity(Tensor input);
// unary op
// TODO: batch norm
Operator softmax(Tensor input, Tensor output);
Operator softmax(Tensor input);
// TODO: activation
Operator relu(Tensor input, Tensor output);
Operator relu(Tensor input);
Operator sigmoid(Tensor input, Tensor output);
Operator sigmoid(Tensor input);
Operator tanh(Tensor input, Tensor output);
Operator tanh(Tensor input);
Operator abs(Tensor input, Tensor output);
Operator abs(Tensor input);
Operator reduceMean(Tensor input, Tensor Output, int axis);
Operator erf(Tensor input, Tensor output);
// resize op
// TODO
// membound op
Operator memBound(const TensorVec &inputs, const TensorVec &outputs,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time, std::string hint = {});
};
} // namespace infini
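A sketch of the intended GraphBuilderObj usage (hypothetical snippet; the header path, dtype string, and getOutput() accessor are assumptions, not confirmed by this diff):

#include "core/graph_builder.h" // assumed header location
using namespace infini;
void buildToyGraph(Runtime runtime) {
    GraphBuilderObj builder(runtime);
    Tensor a = builder.tensor({1, 512}, "FLOAT"); // dtype string assumed
    Tensor b = builder.tensor({1, 512}, "FLOAT");
    Operator added = builder.add(a, b);  // output tensor created internally
    builder.relu(added->getOutput());    // getOutput() assumed on OperatorObj
}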

View File

@ -38,6 +38,7 @@ enum class OpType {
Tanh,
Abs,
Resize,
Erf,
//
MemBound = 300,
};

View File

@ -10,6 +10,7 @@ class TensorBaseObj;
class TensorObj;
class OperatorObj;
class GraphObj;
class GraphBuilderObj;
class RuntimeObj;
class BlobObj;
@ -17,6 +18,7 @@ using TensorBase = Ref<TensorBaseObj>;
using Tensor = Ref<TensorObj>;
using Operator = Ref<OperatorObj>;
using Graph = Ref<GraphObj>;
using GraphBuilder = Ref<GraphBuilderObj>;
using Runtime = Ref<RuntimeObj>;
using Blob = Ref<BlobObj>;
enum class OpType;

View File

@ -1,5 +1,5 @@
#pragma once
#include "core/common.h"
// #include "core/common.h"
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_profiler_api.h>

View File

@ -1,10 +1,16 @@
#pragma once
#include "core/operator.h"
namespace infini {
namespace opTimer {
double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name);
int dilationh, int dilationw, int group);
double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
bool bias, string act);
double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
int s, int padh, int padw, int strideh,

View File

@ -25,19 +25,22 @@ class ConvBaseObj : public OperatorObj {
int h, w; // input shape (same for conv2d and convTranposed2d)
int f; // output/input channel for conv2d/convTransposed2d
int r, s; // weight shape
ActType act;
public:
// Constructors for explicitly setting padding size
ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output, int ph, int pw,
int sh, int sw, int dh, int dw, const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD);
const Tensor &weightInConvFWD, const ActType act);
ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
PaddingMode mode, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD, const Tensor &weightInConvFWD);
const Tensor &inputInConvFWD, const Tensor &weightInConvFWD,
const ActType act);
std::string toString() const override;
int numInputs() const override { return 2; }
int numInputs() const override { return inputs.size(); }
int numOutputs() const override { return 1; }
bool hasBias() const { return inputs.size() == 3; }
Tensor getBias() const { return inputs[2]; }
PaddingMode getPaddingMode() const { return padding; }
@ -53,6 +56,7 @@ class ConvBaseObj : public OperatorObj {
auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
int getChannelPerGroup() const { return inputs[1]->getDims()[1]; }
virtual int getNumGroups() const = 0;
ActType getAct() const { return act; }
private:
vector<int> getWorkloadVector() const override;
@ -65,8 +69,6 @@ class ConvBaseObj : public OperatorObj {
};
class ConvObj : public ConvBaseObj {
private:
ActType act;
public:
ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph,
@ -79,7 +81,6 @@ class ConvObj : public ConvBaseObj {
ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; }
int getNumGroups() const override { return c / getChannelPerGroup(); }
private:
@ -90,7 +91,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
private:
int oph, opw;
int group;
ActType act;
public:
ConvTransposed2dObj(GraphObj *graph, Tensor input, Tensor weight,
@ -106,7 +106,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
Tensor bias = nullptr, ActType act = ActType::None);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
ActType getAct() const { return act; }
int getNumGroups() const override { return group; }
private:

View File

@ -0,0 +1,25 @@
#pragma once
#include "core/operator.h"
namespace infini {
class TransposeObj : public OperatorObj {
Shape perm;
public:
TransposeObj(GraphObj *graph, Tensor input, Tensor output,
const Shape &dims);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
Shape getPerm() const { return perm; }
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -28,4 +28,5 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
DEFINE_UNARY_OBJ(Abs, OpType::Abs)
DEFINE_UNARY_OBJ(Erf, OpType::Erf)
}; // namespace infini

71 include/pfusion/common.h Normal file
View File

@ -0,0 +1,71 @@
#pragma once
#include "core/common.h"
#include <iostream>
#include <memory>
#include <string>
#include <vector>
namespace memb {
enum OpType {
NONE = 0,
EMPTY = 1,
READ,
WRITE,
RELU,
ADD,
SUB,
REDUCEMEAN,
REDUCESUM,
GATHER,
TRANSPOSE,
BROADCAST,
};
enum MemType {
DRAM = 1,
SRAM,
REG,
};
inline std::string getName(OpType opType) {
switch (opType) {
case (OpType::EMPTY):
return "EMPTY";
case (OpType::READ):
return "READ";
case (OpType::WRITE):
return "WRITE";
case (OpType::RELU):
return "RELU";
case (OpType::ADD):
return "ADD";
case (OpType::SUB):
return "SUB";
default:
IT_ASSERT(false);
}
return "";
}
inline std::string getName(MemType memType) {
switch (memType) {
case (MemType::DRAM):
return "DRAM";
case (MemType::SRAM):
return "SRAM";
case (MemType::REG):
return "REG";
default:
IT_ASSERT(false);
}
return "";
}
inline size_t hashAppend(size_t a, size_t b) {
return (a * 10000019 + b * 10000079) % 2147483647;
}
} // namespace memb
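hashAppend combines two hash values modulo 2147483647 (the Mersenne prime 2^31 - 1); TensorMapping::getHash in meta_op.h folds whole vectors through it. The same folding for a standalone shape (a sketch):

#include "pfusion/common.h"
#include <functional>
#include <vector>
size_t hashShape(const std::vector<size_t> &shape) {
    std::hash<size_t> hasher;
    size_t ret = hasher(shape.size());
    for (auto x : shape)
        ret = memb::hashAppend(ret, hasher(x)); // order-sensitive fold
    return ret;
}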

88 include/pfusion/cuda_utils.h Normal file
View File

@ -0,0 +1,88 @@
#pragma once
#include <cublas.h>
#include <curand.h>
#include <iostream>
#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaSafeCall() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
return;
}
inline void __cudaCheckError(const char *file, const int line) {
auto err = cudaGetLastError();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() failed. " << cudaGetErrorString(err)
<< std::endl;
exit(-1);
}
#ifdef DEBUG
// This checking will affect performance.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
std::cout << "[ERROR] " << file << "::" << line
<< ": cudaCheckError() with sync failed. "
<< cudaGetErrorString(err) << std::endl;
exit(-1);
}
#endif
return;
}
inline const char *cublasGetErrorString(cublasStatus_t err) {
switch (err) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
const int line) {
if (err != CUBLAS_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": cublasSafeCall() failed. " << cublasGetErrorString(err)
<< std::endl;
exit(-1);
}
}
inline void __curandSafeCall(curandStatus_t err, const char *file,
const int line) {
if (err != CURAND_STATUS_SUCCESS) {
std::cout << "[ERROR]" << file << "::" << line
<< ": curandSafeCall() failed. " << err << std::endl;
exit(-1);
}
}

View File

@ -0,0 +1,32 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/meta_op.h"
namespace memb {
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape);
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape);
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape, const std::vector<size_t> &perm);
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis);
std::vector<std::shared_ptr<MetaOp>> instantiateBroadcast(
const OpType opType, const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis, const size_t num);
} // namespace memb

View File

@ -0,0 +1,29 @@
#pragma once
#include "core/runtime.h"
namespace infini {
class MemoryCodegen {
private:
std::string generateGraph(Graph graph);
std::string generateBias(const std::vector<size_t> &shape);
std::string generateTranspose(const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
public:
MemoryCodegen() {}
~MemoryCodegen() {}
void exportGraph(Graph graph, std::string filename);
void exportBert_LN(const std::string &filename);
void exportBert_SM(const std::string &filename);
void exportBert_GELU(const std::string &filename);
void exportViT_LN(const std::string &filename);
void exportViT_SM(const std::string &filename);
void exportViT_GELU(const std::string &filename);
void exportBias(const std::string &filename,
const std::vector<size_t> &shape);
void exportTranspose(const std::string &filename,
const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
};
} // namespace infini
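A sketch tying these exports to the eval harnesses earlier in this diff (the include path is assumed; the shapes match the bias and transpose eval scripts above):

#include "pfusion/memory_codegen.h" // assumed header path
int main() {
    infini::MemoryCodegen codegen;
    codegen.exportBias("bias_0.cu", {28 * 28, 24});
    codegen.exportTranspose("transpose_0.cu", {28 * 28, 58, 2}, {0, 2, 1});
    return 0;
}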

View File

@ -0,0 +1,33 @@
#pragma once
#include "pfusion/meta_op.h"
namespace memb {
class MetaGraph {
private:
std::vector<std::shared_ptr<MetaOp>> metaOps;
std::vector<std::pair<size_t, size_t>> edges;
std::unordered_map<size_t, size_t> metaOpMap;
public:
MetaGraph() {}
~MetaGraph() {}
inline void addOp(std::shared_ptr<MetaOp> op) {
IT_ASSERT(metaOpMap.find(op->id) == metaOpMap.end());
metaOpMap[op->id] = metaOps.size();
metaOps.emplace_back(op);
}
inline void addEdge(std::shared_ptr<MetaOp> op1,
std::shared_ptr<MetaOp> op2) {
IT_ASSERT(metaOpMap.find(op1->id) != metaOpMap.end());
IT_ASSERT(metaOpMap.find(op2->id) != metaOpMap.end());
edges.emplace_back(metaOpMap[op1->id], metaOpMap[op2->id]);
}
void print();
void optimize();
std::string genHeader();
std::string genKernelFuncs();
std::string genInvokeFuncs();
};
} // namespace memb
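addEdge asserts that both endpoints were registered via addOp first; minimal wiring (a sketch against the API declared above):

#include "pfusion/meta_graph.h" // assumed header path
void wireTwoOps() {
    memb::MetaGraph graph;
    auto producer = std::make_shared<memb::MetaOp>();
    auto consumer = std::make_shared<memb::MetaOp>();
    graph.addOp(producer); // registers producer->id in metaOpMap
    graph.addOp(consumer);
    graph.addEdge(producer, consumer); // records the dependency by index
}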

111 include/pfusion/meta_op.h Normal file
View File

@ -0,0 +1,111 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/micro_op.h"
#include "pfusion/pointer.h"
namespace memb {
class TensorMapping {
private:
std::vector<size_t> shape, stride, map;
std::string name;
public:
TensorMapping(const std::string &_name, const std::vector<size_t> &_shape,
const std::vector<size_t> &_stride,
const std::vector<size_t> &_map) {
name = "offset_" + _name;
IT_ASSERT(_shape.size() > 0 && _shape.size() < 10);
for (auto x : _shape) {
shape.emplace_back(x);
}
IT_ASSERT(_stride.size() > 0 && _stride.size() < 10);
for (auto x : _stride) {
stride.emplace_back(x);
}
IT_ASSERT(_map.size() > 0 && _map.size() < 10);
for (auto x : _map) {
map.emplace_back(x);
}
}
~TensorMapping() {}
static inline std::shared_ptr<TensorMapping>
buildWithMap(const std::string &name, const std::vector<size_t> &shape,
const std::vector<size_t> &map) {
std::vector<size_t> stride(shape.size());
stride[0] = 1;
for (size_t i = 1; i < stride.size(); i++) {
stride[i] = shape[i] * stride[i - 1];
}
return std::make_shared<TensorMapping>(name, shape, stride, map);
}
static inline std::shared_ptr<TensorMapping>
build(const std::string &name, const std::vector<size_t> &shape,
const std::vector<size_t> &stride, const std::vector<size_t> &map) {
return std::make_shared<TensorMapping>(name, shape, stride, map);
}
inline std::string offset() { return name; }
inline size_t getHash() {
std::hash<size_t> hasher;
std::hash<std::string> stringHasher;
size_t ret = stringHasher(name);
ret = hashAppend(ret, hasher(shape.size()));
for (auto x : shape) {
ret = hashAppend(ret, hasher(x));
}
ret = hashAppend(ret, hasher(map.size()));
for (auto x : map) {
ret = hashAppend(ret, hasher(x));
}
return ret;
}
std::string genOffset();
};
class MetaOp {
public:
int id;
int main_loop_st, main_loop_ed, numBlocks, numGroups, numReg, numSmem,
numLanes;
std::vector<std::shared_ptr<MicroOp>> microOps;
std::vector<std::shared_ptr<Pointer>> ptrs;
std::vector<std::shared_ptr<TensorMapping>> mappings;
MetaOp() {
static int metaOpId = 0;
id = metaOpId++;
}
~MetaOp() {}
inline void setLoopSt(int _main_loop_st) { main_loop_st = _main_loop_st; }
inline void setLoopEd(int _main_loop_ed) { main_loop_ed = _main_loop_ed; }
inline int getLoopSt() { return main_loop_st; }
inline int getLoopEd() { return main_loop_ed; }
void optimize();
std::string genKernelFunc();
std::string genInvokeFunc();
static std::shared_ptr<MetaOp> merge(std::shared_ptr<MetaOp> metaOp0,
std::shared_ptr<MetaOp> metaOp1);
inline void print() {
std::cout << "MetaOp: " << id << std::endl;
for (auto microOp : microOps) {
microOp->print();
}
}
bool checkValid() {
// TODO: check valid
return true;
};
static std::shared_ptr<MetaOp>
buildBiasOp(const std::vector<size_t> &shape);
static std::shared_ptr<MetaOp>
buildTransposeOp(const std::vector<size_t> &shape,
const std::vector<size_t> &perm);
};
} // namespace memb

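TensorMapping::getHash folds the mapping name, the shape vector, and the map vector into a single value through hashAppend, which lives in pfusion/common.h and is not shown in this diff. The sketch below substitutes a boost-style combiner, so it models the scheme rather than reproducing the exact function:

    # Model of getHash with a boost-style combiner standing in for hashAppend.
    def hash_append(seed: int, value: int) -> int:
        mask = (1 << 64) - 1
        return (seed ^ (value + 0x9E3779B97F4A7C15
                        + ((seed << 6) & mask) + (seed >> 2))) & mask

    def mapping_hash(name, shape, mapping):
        h = hash(name) & ((1 << 64) - 1)
        for xs in (shape, mapping):               # hash length, then elements
            h = hash_append(h, len(xs))
            for x in xs:
                h = hash_append(h, x)
        return h

    # Mappings that differ only in `map` hash differently.
    assert mapping_hash("offset_a", [2, 3], [0, 1]) != \
           mapping_hash("offset_a", [2, 3], [1, 0])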

@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class BinaryOp : public MicroOp {
private:
size_t num, width;
public:
BinaryOp(OpType _opType, std::shared_ptr<Pointer> _pSrc0,
std::shared_ptr<Pointer> _pSrc1, std::shared_ptr<Pointer> _pDst,
size_t _num, size_t _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_pSrc0, _pSrc1, _pDst};
}
~BinaryOp() {}
std::shared_ptr<Pointer> getSrc0() { return ptrs[0]; }
std::shared_ptr<Pointer> getSrc1() { return ptrs[1]; }
std::shared_ptr<Pointer> getDst() { return ptrs[2]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,17 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class EmptyOp : public MicroOp {
public:
EmptyOp() { opType = EMPTY; }
~EmptyOp() {}
// bool checkValid() override;
std::string generate() override { return ""; };
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,55 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class MemoryOp : public MicroOp {
private:
size_t num, width;
public:
MemoryOp(const OpType _opType, const std::shared_ptr<Pointer> _src,
const std::shared_ptr<Pointer> _dst, const size_t _num,
const size_t _width, const std::vector<size_t> &_cond)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_src, _dst};
cond = _cond;
}
// bool checkValid() override;
~MemoryOp() {}
static inline std::shared_ptr<MicroOp>
build(const OpType opType, const std::shared_ptr<Pointer> src,
const std::shared_ptr<Pointer> dst, const size_t num,
const size_t width) {
return std::make_shared<MemoryOp>(opType, src, dst, num, width,
std::vector<size_t>({}));
}
static inline std::shared_ptr<MicroOp>
build(const OpType opType, const std::shared_ptr<Pointer> src,
const std::shared_ptr<Pointer> dst, const size_t num,
const size_t width, const std::vector<size_t> &cond) {
return std::make_shared<MemoryOp>(opType, src, dst, num, width, cond);
}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
std::string generate() override;
std::string generateWithCond();
inline void print() override {
if (opType == READ) {
std::cout << id << " " << getName(opType) << " "
<< getName(getSrc()->getType()) << " "
<< getSrc()->getHash() << std::endl;
} else if (opType == WRITE) {
std::cout << id << " " << getName(opType) << " "
<< getName(getDst()->getType()) << " "
<< getDst()->getHash() << std::endl;
} else {
IT_ASSERT(false);
}
}
};
} // namespace memb


@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class ReduceOp : public MicroOp {
private:
size_t num, width;
public:
ReduceOp(OpType _opType, std::shared_ptr<Pointer> _pSrc,
std::shared_ptr<Pointer> _pDst, std::shared_ptr<Pointer> _pBuf,
size_t _num, size_t _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_pSrc, _pDst, _pBuf};
}
~ReduceOp() {}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
std::shared_ptr<Pointer> getBuf() { return ptrs[2]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,29 @@
#pragma once
#include "pfusion/micro_op.h"
namespace memb {
class UnaryOp : public MicroOp {
private:
const int num, width;
public:
UnaryOp(OpType _opType, std::shared_ptr<Pointer> _src,
std::shared_ptr<Pointer> _dst, int _num, int _width)
: num(_num), width(_width) {
opType = _opType;
ptrs = {_src, _dst};
}
~UnaryOp() {}
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
// bool checkValid() override;
std::string generate() override;
inline void print() override {
std::cout << id << " " << getName(opType) << std::endl;
}
};
} // namespace memb


@ -0,0 +1,43 @@
#pragma once
#include "pfusion/common.h"
#include "pfusion/pointer.h"
namespace memb {
class MicroOp {
protected:
size_t id;
OpType opType;
std::vector<std::shared_ptr<Pointer>> ptrs;
std::vector<size_t> cond;
public:
MicroOp() : opType(NONE), cond(0) {
static int microOpId = 0;
id = microOpId++;
}
virtual ~MicroOp() {}
inline OpType getType() { return opType; }
inline bool isMemoryOp() { return opType == READ || opType == WRITE; }
inline std::vector<std::shared_ptr<Pointer>> getPtrs() { return ptrs; }
// virtual bool checkValid() = 0;
virtual std::string generate() = 0;
virtual void print() = 0;
static std::shared_ptr<MicroOp> merge(std::shared_ptr<MicroOp> op0,
std::shared_ptr<MicroOp> op1);
};
class MicroGraph {
public:
MicroGraph() {}
~MicroGraph() {}
private:
std::vector<std::shared_ptr<MicroOp>> microOps;
std::vector<std::pair<int, int>> edges;
};
} // namespace memb

include/pfusion/pointer.h

@ -0,0 +1,71 @@
#pragma once
#include "pfusion/common.h"
namespace memb {
class Pointer {
private:
const MemType memType;
const std::string name, offset;
public:
Pointer(MemType _memType, std::string _name, std::string _offset)
: memType(_memType), name(_name), offset(_offset) {}
~Pointer() {}
static inline std::shared_ptr<Pointer> buildPtr(MemType memType,
std::string name) {
return std::make_shared<Pointer>(memType, name, "0");
}
static inline std::shared_ptr<Pointer>
buildPtr(MemType memType, std::string name, std::string offset) {
return std::make_shared<Pointer>(memType, name, offset);
}
static inline std::shared_ptr<Pointer>
buildPtr(std::shared_ptr<Pointer> ptr) {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset());
}
static inline std::shared_ptr<Pointer>
buildPtr(std::shared_ptr<Pointer> ptr, std::string offset) {
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
ptr->getOffset() + " + " + offset);
}
static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
return std::make_shared<Pointer>(
MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
}
inline const MemType getType() { return memType; }
inline const std::string getName() { return name; }
inline const std::string getOffset() { return offset; }
inline const std::string generate() { return name + "[" + offset + "]"; }
inline const std::string generateWithInstIdx(std::string idx) {
std::string code = generate();
size_t pos = 0, lengthA = 8, lengthB = idx.size(); // lengthA == strlen("inst_idx")
while ((pos = code.find("inst_idx", pos)) != std::string::npos) {
code.replace(pos, lengthA, idx);
pos += lengthB;
}
std::cout << "[INFO] " << idx << " " << lengthB << " " << code
<< std::endl;
return code;
}
inline bool equal(std::shared_ptr<Pointer> ptr) {
if (name == ptr->getName() && offset == ptr->getOffset()) {
IT_ASSERT(memType == ptr->getType());
return true;
}
return false;
}
inline const size_t getHash() {
std::hash<MemType> memTypeHash;
std::hash<std::string> stringHash;
size_t ret = memTypeHash(memType);
ret = hashAppend(ret, stringHash(name));
ret = hashAppend(ret, stringHash(offset));
return ret;
}
};
} // namespace memb

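Pointer::generateWithInstIdx is a plain substring substitution: every occurrence of the token inst_idx inside the generated name[offset] expression is replaced by the supplied index string (the [INFO] line is debug output). The same loop in Python, for quick reference:

    # Python equivalent of the inst_idx substitution loop.
    def with_inst_idx(code: str, idx: str) -> str:
        token, pos = "inst_idx", 0
        while (pos := code.find(token, pos)) != -1:
            code = code[:pos] + idx + code[pos + len(token):]
            pos += len(idx)                       # skip past the replacement
        return code

    assert with_inst_idx("buf[inst_idx * 32 + lane]", "5") == "buf[5 * 32 + lane]"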

@ -0,0 +1,39 @@
#pragma once
#include "pfusion/meta_graph.h"
#include "pfusion/meta_op.h"
namespace memb {
class SearchGraph {
private:
class Node {
public:
int id;
std::vector<std::shared_ptr<MetaOp>> metaOps;
std::vector<int> pred;
std::vector<int> succ;
};
// each node is a vector of metaOps.
std::vector<Node> nodes;
std::vector<std::pair<int, int>> edges;
public:
SearchGraph() {}
~SearchGraph() {}
inline void addNode(std::vector<std::shared_ptr<MetaOp>> metaOps) {
Node node;
node.id = nodes.size();
for (auto metaOp : metaOps) {
node.metaOps.emplace_back(metaOp);
}
nodes.emplace_back(node);
}
inline void addEdge(int i, int j) {
edges.emplace_back(i, j);
nodes[i].succ.emplace_back(j);
nodes[j].pred.emplace_back(i);
}
std::shared_ptr<MetaGraph> exportFirstMetaGraph();
};
} // namespace memb

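Each SearchGraph node holds a vector of candidate MetaOps, and exportFirstMetaGraph, implemented elsewhere in the diff, presumably materializes a MetaGraph from every node's first candidate. A hypothetical Python sketch of that selection, under the assumption that edges are simply re-emitted between the chosen ops:

    # Hypothetical sketch: choose each node's first candidate, keep the edges.
    def export_first(candidates_by_node, edges):
        chosen = {nid: cands[0] for nid, cands in candidates_by_node.items()}
        return chosen, [(chosen[i], chosen[j]) for i, j in edges]

    ops, wires = export_first({0: ["opA0", "opA1"], 1: ["opB0"]}, [(0, 1)])
    assert wires == [("opA0", "opB0")]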

@ -0,0 +1 @@
from .import_onnx import import_onnx


@ -0,0 +1,483 @@
import functools
import numpy as np
import onnx
import onnx.checker
import onnx.numpy_helper
import onnx.shape_inference
from rules import conv_transposed2d_rules, conv_rules, print_result
def _add_value_info_for_constants(model: onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def _parse_attribute(attributes, defaults=dict()):
atts = defaults
for att in attributes:
if att.type == onnx.AttributeProto.INT:
atts[att.name] = att.i
elif att.type == onnx.AttributeProto.INTS:
atts[att.name] = att.ints
elif att.type == onnx.AttributeProto.FLOAT:
atts[att.name] = att.f
elif att.type == onnx.AttributeProto.STRING:
atts[att.name] = att.s
elif att.type == onnx.AttributeProto.TENSOR:
atts[att.name] = att.t
else:
assert False, "Unsupported Attribute Type: {}".format(att.type)
return atts
def _onnx_datatype_tostring(dtype):
if dtype == 0:
return 'UNDEFINED'
elif dtype == 1:
return 'FLOAT'
elif dtype == 2:
return 'UINT8'
elif dtype == 3:
return 'INT8'
elif dtype == 4:
return 'UINT16'
elif dtype == 5:
return 'INT16'
elif dtype == 6:
return 'INT32'
elif dtype == 7:
return 'INT64'
elif dtype == 8:
return 'STRING'
elif dtype == 9:
return 'BOOL'
elif dtype == 10:
return 'FLOAT16'
elif dtype == 11:
return 'DOUBLE'
elif dtype == 12:
return 'UINT32'
elif dtype == 13:
return 'UINT64'
elif dtype == 14:
return 'COMPLEX64'
elif dtype == 15:
return 'COMPLEX128'
elif dtype == 16:
return 'BFLOAT16'
else:
assert False, 'Unknown onnx datatype'
def import_onnx(model_path: str, bs: int):
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
model = onnx.load(model_path)
# Tensor_input
for input in model.graph.input:
if input.name not in ts:
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
# ts[input.name] = g.tensor(dims, _onnx_datatype_tostring(input.type.tensor_type.elem_type))
ds[input.name] = dims
# Tensor_weight
for weight in model.graph.initializer:
if weight.name not in ts:
# ts[weight.name] = g.tensor(weight.dims, _onnx_datatype_tostring(weight.data_type))
ds[weight.name] = weight.dims
# Tensor_inference
_add_value_info_for_constants(model)
infered_model = onnx.shape_inference.infer_shapes(model)
for v in infered_model.graph.value_info:
if v.name not in ts:
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
# ts[v.name] = g.tensor(dims, _onnx_datatype_tostring(v.type.tensor_type.elem_type))
ds[v.name] = dims
# Tensor_output
for output in model.graph.output:
if output.name not in ts:
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
# ts[output.name] = g.tensor(dims, _onnx_datatype_tostring(output.type.tensor_type.elem_type))
ds[output.name] = dims
# Op
for node in model.graph.node:
# if node.op_type == 'Add':
# assert len(node.output) == 1
# g.add([ts[item] for item in node.input], ts[node.output[0]])
# elif node.op_type == 'Cast':
# assert len(node.input) == 1
# assert len(node.output) == 1
# # Ignore for now (TODO)
# g.identity(ts[node.input[0]], ts[node.output[0]])
if node.op_type == 'Conv':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
assert ds[node.input[0]][1] % ds[node.input[1]][1] == 0
n, c, h, w = ds[node.input[0]][0], ds[node.input[0]][1], ds[node.input[0]][2], ds[node.input[0]][3]
f, r, s = ds[node.input[1]][0], ds[node.input[1]][2], ds[node.input[1]][3]
ph, pw = attrs["pads"][0], attrs["pads"][1]
sh, sw = attrs["strides"][0], attrs["strides"][1]
dh, dw = attrs["dilations"][0], attrs["dilations"][1]
group = ds[node.input[0]][1] // ds[node.input[1]][1]
# t = getPerfConv(n, c, h, w, f, r, s, ph, pw,
# sh, sw, dh, dw, group, "")
# print(node.name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group, f'{t:.3f}')
n = n*bs
for rule in conv_rules:
rule(node.name, n, c, h, w, f, r, s,
ph, pw, sh, sw, dh, dw, group)
elif node.op_type == 'ConvTranspose':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"group": 1})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
n, f, h, w = ds[node.input[0]]
_, c, r, s = ds[node.input[1]]
ph, pw, sh, sw, dh, dw = attrs["pads"][0], attrs["pads"][1], attrs["strides"][
0], attrs["strides"][1], attrs["dilations"][0], attrs["dilations"][1]
oph, opw = 0, 0
if "output_padding" in attrs:
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
assert attrs["output_padding"][0] == attrs["output_padding"][1]
group = attrs["group"]
n = n*bs
for rule in conv_transposed2d_rules:
rule(node.name, n, c, h, w, f, r, s, ph,
pw, sh, sw, dh, dw, oph, opw, group)
elif node.op_type == 'MatMul':
# MatMul timing is skipped in this script; everything in this branch
# below the `continue` is unreachable dead code.
print(f'{node.name} skipped')
continue
assert len(node.input) == 2
assert len(node.output) == 1
dimA = list(ds[node.input[0]])
dimB = list(ds[node.input[1]])
dimO = list(ds[node.output[0]])
# if len(dimA) == 2 and len(dimB) == 2:
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
# else:
# assert len(dimO) >= 3
# batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
# if len(dimA) == 3:
# tmpI0 = ts[node.input[0]]
# else:
# tmpI0 = g.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
# g.reshape(ts[node.input[0]], tmpI0)
# if len(dimB) == 3:
# tmpI1 = ts[node.input[1]]
# else:
# tmpI1 = g.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
# g.reshape(ts[node.input[1]], tmpI1)
# if len(dimO) == 3:
# tmpO = ts[node.output[0]]
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# else:
# tmpO = g.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
# g.reshape(tmpO, ts[node.output[0]])
# elif node.op_type == 'Concat':
# assert len(node.output) == 1
# attrs = _parse_attribute(node.attribute, {})
# g.concat([ts[item] for item in node.input], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Constant':
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.output) == 1
# c = onnx.numpy_helper.to_array(attrs["value"])
# if c.ndim == 0:
# c = c[()]
# consts[node.output[0]] = c
# elif node.op_type == 'Flatten':
# attrs = _parse_attribute(node.attribute, {"axis": 1})
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.flatten(ts[node.input[0]], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Gather':
# attrs = _parse_attribute(node.attribute, {"axis": 0})
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.gather(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]], attrs["axis"])
# elif node.op_type == 'Gemm':
# attrs = _parse_attribute(node.attribute, {
# "alpha": 1.0,
# "beta": 1.0,
# "transA": 0,
# "transB": 0})
# assert len(node.input) == 2 or len(node.input) == 3
# assert len(node.output) == 1
# assert attrs["alpha"] == 1.0
# assert attrs["beta"] == 1.0 or len(node.input) == 2
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
# g.matmul(tmpI0, tmpI1, tmpO,
# attrs["transA"], attrs["transB"],
# None if len(node.input) == 2 else ts[node.input[2]])
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
# elif node.op_type == 'Mul':
# assert len(node.output) == 1
# g.mul([ts[x] for x in node.input], ts[node.output[0]])
# elif node.op_type == 'GlobalAveragePool':
# assert len(node.input) == 1
# assert len(node.output) == 1
# dims = ds[node.input[0]]
# if len(dims) > 0:
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
# else:
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'MaxPool':
# attrs = _parse_attribute(node.attribute, {
# "auto_pad": "NOTSET",
# "dilations": [1, 1],
# "pads": [0, 0, 0, 0],
# "strides": [1, 1]})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert len(attrs["kernel_shape"]) == 2
# assert len(attrs["pads"]) == 4
# assert len(attrs["strides"]) == 2
# assert len(attrs["dilations"]) == 2
# assert attrs["pads"][0] == attrs["pads"][2]
# assert attrs["pads"][1] == attrs["pads"][3]
# g.maxpool(ts[node.input[0]], ts[node.output[0]],
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
# attrs["dilations"][0], attrs["dilations"][1],
# attrs["pads"][0], attrs["pads"][1],
# attrs["strides"][0], attrs["strides"][1])
# elif node.op_type == 'AveragePool':
# attrs = _parse_attribute(node.attribute, {
# "auto_pad": "NOTSET",
# "count_include_pad": 0,
# "pads": [0, 0, 0, 0],
# "strides": [1, 1]})
# # No dilation in ONNX
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert attrs["count_include_pad"] == 0 # To be consistent with operator.cc
# assert len(attrs["kernel_shape"]) == 2
# assert len(attrs["pads"]) == 4
# assert len(attrs["strides"]) == 2
# assert attrs["pads"][0] == attrs["pads"][2]
# assert attrs["pads"][1] == attrs["pads"][3]
# g.avgpool(ts[node.input[0]], ts[node.output[0]],
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
# attrs["pads"][0], attrs["pads"][1],
# attrs["strides"][0], attrs["strides"][1])
# elif node.op_type == 'Pad':
# attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
# assert attrs["mode"].decode("ascii") == "constant"
# assert len(attrs["pads"]) % 2 == 0
# assert attrs["value"] == 0
# nDim = len(attrs["pads"]) // 2
# begin = attrs["pads"][:nDim]
# end = attrs["pads"][nDim:]
# g.pad(ts[node.input[0]], ts[node.output[0]], begin, end)
# elif node.op_type == 'ReduceMean':
# attrs = _parse_attribute(node.attribute, {'keepdims': 1})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert len(attrs["axes"]) == 1
# axis = attrs["axes"][0]
# if axis < 0:
# axis = len(ds[node.input[0]]) - axis
# g.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
# elif node.op_type == 'Softmax':
# attrs = _parse_attribute(node.attribute)
# assert len(node.input) == 1
# assert len(node.output) == 1
# axis = attrs["axis"]
# if axis < 0:
# axis = len(ds[node.input[0]]) - axis
# g.softmax(ts[node.input[0]], ts[node.output[0]], axis)
# elif node.op_type == 'Reshape':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.reshape(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Relu':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.relu(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Tanh':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.tanh(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Sigmoid':
# assert len(node.input) == 1
# assert len(node.output) == 1
# g.sigmoid(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == 'Shape':
# # Ignore for now, and no need to output anything (TODO)
# pass
# elif node.op_type == 'Sub':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.sub(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]])
# elif node.op_type == 'Transpose':
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 1
# assert len(node.output) == 1
# assert "perm" in attrs
# g.transpose(ts[node.input[0]], ts[node.output[0]], -1,
# Perm([PermItem(x) for x in attrs["perm"]]), 0)
# elif node.op_type == 'Unsqueeze':
# assert len(node.input) == 2
# assert len(node.output) == 1
# g.reshape(ts[node.input[0]], ts[node.output[0]])
# elif node.op_type == "BatchNormalization":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 5
# assert len(node.output) == 1
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
# ts[node.input[2]], ts[node.input[3]],
# ts[node.input[4]], ts[node.output[0]],
# epsilon, momentum)
# elif node.op_type == "Split":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 1
# assert len(node.output) > 1
# axis = attrs['axis']
# split = attrs['split']
# g.split(ts[node.input[0]], [ts[t] for t in node.output], axis, split)
# elif node.op_type == "Slice":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# g.slice(ts[node.input[0]], ts[node.output[0]],
# ts[node.input[1]], ts[node.input[2]])
# elif node.op_type == "Resize":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
# [], 'FLOAT')
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
# else:
# assert False, "Unsupported op: " + node.op_type
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("model", help="ONNX model file")
parser.add_argument("bs", help="batch size", type=int, default=1)
# parser.add_argument("--output", help="Output file")
args = parser.parse_args()
import_onnx(args.model, args.bs)
print_result(args.model)

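A note on _parse_attribute, which both importer scripts define identically: it merges a node's ONNX attributes over the defaults dict it is handed and mutates that dict in place, which is safe here only because every call site passes a fresh literal. A self-contained usage sketch (onnx.helper.make_node is the standard ONNX helper; _parse_attribute is assumed importable from the script above):

    # Usage sketch for _parse_attribute with a hand-built Conv node.
    import onnx.helper

    node = onnx.helper.make_node(
        "Conv", inputs=["x", "w"], outputs=["y"],
        strides=[2, 2], pads=[1, 1, 1, 1])

    attrs = _parse_attribute(node.attribute, {
        "auto_pad": "NOTSET",
        "dilations": [1, 1],
        "pads": [0, 0, 0, 0],
        "strides": [1, 1]})
    assert list(attrs["strides"]) == [2, 2]       # overridden by the node
    assert list(attrs["dilations"]) == [1, 1]     # default kept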

@ -0,0 +1,588 @@
from pyinfinitensor import *
import functools
import numpy as np
import onnx
import onnx.checker
import onnx.numpy_helper
import onnx.shape_inference
def _add_value_info_for_constants(model: onnx.ModelProto):
"""
Currently onnx.shape_inference doesn't use the shape of initializers, so add
that info explicitly as ValueInfoProtos.
Mutates the model.
Args:
model: The ModelProto to update.
"""
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
if model.ir_version < 4:
return
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
inputs = {i.name for i in graph.input}
existing_info = {vi.name: vi for vi in graph.value_info}
for init in graph.initializer:
# Check it really is a constant, not an input
if init.name in inputs:
continue
# The details we want to add
elem_type = init.data_type
shape = init.dims
# Get existing or create new value info for this constant
vi = existing_info.get(init.name)
if vi is None:
vi = graph.value_info.add()
vi.name = init.name
# Even though it would be weird, we will not overwrite info even if it doesn't match
tt = vi.type.tensor_type
if tt.elem_type == onnx.TensorProto.UNDEFINED:
tt.elem_type = elem_type
if not tt.HasField("shape"):
# Ensure we set an empty list if the const is scalar (zero dims)
tt.shape.dim.extend([])
for dim in shape:
tt.shape.dim.add().dim_value = dim
# Handle subgraphs
for node in graph.node:
for attr in node.attribute:
# Ref attrs refer to other attrs, so we don't need to do anything
if attr.ref_attr_name != "":
continue
if attr.type == onnx.AttributeProto.GRAPH:
add_const_value_infos_to_graph(attr.g)
if attr.type == onnx.AttributeProto.GRAPHS:
for g in attr.graphs:
add_const_value_infos_to_graph(g)
return add_const_value_infos_to_graph(model.graph)
def _parse_attribute(attributes, defaults=dict()):
atts = defaults
for att in attributes:
if att.type == onnx.AttributeProto.INT:
atts[att.name] = att.i
elif att.type == onnx.AttributeProto.INTS:
atts[att.name] = att.ints
elif att.type == onnx.AttributeProto.FLOAT:
atts[att.name] = att.f
elif att.type == onnx.AttributeProto.STRING:
atts[att.name] = att.s
elif att.type == onnx.AttributeProto.TENSOR:
atts[att.name] = att.t
else:
assert False, "Unsupported Attribute Type: {}".format(att.type)
return atts
def _onnx_datatype_tostring(dtype):
if dtype == 0:
return 'UNDEFINED'
elif dtype == 1:
return 'FLOAT'
elif dtype == 2:
return 'UINT8'
elif dtype == 3:
return 'INT8'
elif dtype == 4:
return 'UINT16'
elif dtype == 5:
return 'INT16'
elif dtype == 6:
return 'INT32'
elif dtype == 7:
return 'INT64'
elif dtype == 8:
return 'STRING'
elif dtype == 9:
return 'BOOL'
elif dtype == 10:
return 'FLOAT16'
elif dtype == 11:
return 'DOUBLE'
elif dtype == 12:
return 'UINT32'
elif dtype == 13:
return 'UINT64'
elif dtype == 14:
return 'COMPLEX64'
elif dtype == 15:
return 'COMPLEX128'
elif dtype == 16:
return 'BFLOAT16'
else:
assert False, 'Unknown onnx datatype'
def import_onnx(gf: GraphBuilder, net: str):
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
model = onnx.load(net)
# Tensor_input
for input in model.graph.input:
if input.name not in ts:
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
ts[input.name] = gf.tensor(dims, _onnx_datatype_tostring(
input.type.tensor_type.elem_type))
ds[input.name] = dims
# Tensor_weight
for weight in model.graph.initializer:
if weight.name not in ts:
ts[weight.name] = gf.tensor(
weight.dims, _onnx_datatype_tostring(weight.data_type))
ds[weight.name] = weight.dims
# Tensor_inference
_add_value_info_for_constants(model)
infered_model = onnx.shape_inference.infer_shapes(model)
for v in infered_model.graph.value_info:
if v.name not in ts:
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
ts[v.name] = gf.tensor(dims, _onnx_datatype_tostring(
v.type.tensor_type.elem_type))
ds[v.name] = dims
# Tensor_output
for output in model.graph.output:
if output.name not in ts:
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
ts[output.name] = gf.tensor(dims, _onnx_datatype_tostring(
output.type.tensor_type.elem_type))
ds[output.name] = dims
# Op
for node in model.graph.node:
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Conv
if node.op_type == 'Conv':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
# assert len(node.input) == 2 # bias is not implemented yet
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
gf.conv(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1],
attrs["dilations"][0], attrs["dilations"][1],
None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MatMul
elif node.op_type == 'MatMul':
assert len(node.input) == 2
assert len(node.output) == 1
dimA = list(ds[node.input[0]])
dimB = list(ds[node.input[1]])
dimO = list(ds[node.output[0]])
if len(dimA) == 2 and len(dimB) == 2:
tmpI0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
tmpI1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
tmpO = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
gf.reshape(ts[node.input[0]], tmpI0, [1] + list(ds[node.input[0]]))
gf.reshape(ts[node.input[1]], tmpI1, [1] + list(ds[node.input[1]]))
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
gf.reshape(tmpO, ts[node.output[0]], ds[node.output[0]])
else:
assert len(dimO) >= 3
batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
if len(dimA) == 3:
tmpI0 = ts[node.input[0]]
else:
tmpI0 = gf.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
gf.reshape(ts[node.input[0]], tmpI0, [
batch, dimA[-2], dimA[-1]])
if len(dimB) == 3:
tmpI1 = ts[node.input[1]]
else:
tmpI1 = gf.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
gf.reshape(ts[node.input[1]], tmpI1, [
batch, dimB[-2], dimB[-1]])
if len(dimO) == 3:
tmpO = ts[node.output[0]]
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
else:
tmpO = gf.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
gf.reshape(tmpO, ts[node.output[0]], ds[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose
elif node.op_type == 'ConvTranspose':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1],
"group": 1})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["auto_pad"] == "NOTSET"
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
oph, opw = 0, 0
if "output_padding" in attrs:
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
assert attrs["output_padding"][0] == attrs["output_padding"][1]
group = attrs["group"]
gf.convTrans(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1],
attrs["dilations"][0], attrs["dilations"][1],
oph, opw, group,
None if len(node.input) == 2 else ts[node.input[2]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad
elif node.op_type == 'Pad':
attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
assert attrs["mode"].decode("ascii") == "constant"
assert len(attrs["pads"]) % 2 == 0
if "constant_value" in attrs:
assert attrs["constant_value"] == 0
gf.pad(ts[node.input[0]], ts[node.output[0]], attrs["pads"],
attrs["axes"] if axes in attrs else None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Slice
elif node.op_type == "Slice":
assert 3 <= len(node.input) <= 5
assert len(node.output) == 1
gf.slice(ts[node.input[0]], ts[node.output[0]],
ts[node.input[1]], ts[node.input[2]],
ts[node.input[3]] if len(node.input) == 4 else None,
ts[node.input[4]] if len(node.input) == 5 else None)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Concat
elif node.op_type == 'Concat':
attrs = _parse_attribute(node.attribute, {})
assert len(node.output) == 1
gf.concat([ts[item] for item in node.input],
ts[node.output[0]], attrs["axis"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Split
elif node.op_type == "Split":
attrs = _parse_attribute(node.attribute, {'axis': 0})
assert len(node.input) == 1
assert len(node.output) > 1
dim = attrs['axis']
num = attrs['num_outputs']
gf.split(ts[node.input[0]], [ts[t] for t in node.output], dim, num)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
elif node.op_type == 'MaxPool':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"dilations": [1, 1],
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 1
assert len(node.output) == 1
assert len(attrs["kernel_shape"]) == 2
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert len(attrs["dilations"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
gf.maxpool(ts[node.input[0]], ts[node.output[0]],
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
attrs["dilations"][0], attrs["dilations"][1],
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool
# No dilation in ONNX
elif node.op_type == 'AveragePool':
attrs = _parse_attribute(node.attribute, {
"auto_pad": "NOTSET",
"count_include_pad": 0,
"pads": [0, 0, 0, 0],
"strides": [1, 1]})
assert len(node.input) == 1
assert len(node.output) == 1
# To be consistent with operator.cc
assert attrs["count_include_pad"] == 0
assert len(attrs["kernel_shape"]) == 2
assert len(attrs["pads"]) == 4
assert len(attrs["strides"]) == 2
assert attrs["pads"][0] == attrs["pads"][2]
assert attrs["pads"][1] == attrs["pads"][3]
dh, dw = 1, 1
gf.avgpool(ts[node.input[0]], ts[node.output[0]],
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
dh, dw,
attrs["pads"][0], attrs["pads"][1],
attrs["strides"][0], attrs["strides"][1])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add
elif node.op_type == 'Add':
print(ds[node.input[0]], ds[node.input[1]], ds[node.output[0]])
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.add(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.add(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.add(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub
elif node.op_type == 'Sub':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.sub(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.sub(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.sub(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Mul
elif node.op_type == 'Mul':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.mul(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.mul(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.mul(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Div
elif node.op_type == 'Div':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.div(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.div(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.div(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pow
elif node.op_type == 'Pow':
assert len(node.input) == 2
assert len(node.output) == 1
if ds[node.input[0]] == ds[node.input[1]]:
gf.pow(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]])
elif ds[node.input[0]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.pow(ts[node.input[0]], tmp, ts[node.output[0]])
elif ds[node.input[1]] == ds[node.output[0]]:
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
gf.pow(tmp, ts[node.input[1]], ts[node.output[0]])
else:
assert False
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather
elif node.op_type == 'Gather':
attrs = _parse_attribute(node.attribute, {"axis": 0})
assert len(node.input) == 2
assert len(node.output) == 1
gf.gather(ts[node.input[0]], ts[node.input[1]],
ts[node.output[0]], attrs["axis"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape
elif node.op_type == 'Reshape':
attrs = _parse_attribute(node.attribute, {"allowzero": 0})
assert len(node.input) == 2
assert len(node.output) == 1
gf.reshape(ts[node.input[0]],
ts[node.output[0]], ds[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten
# Output is 2D in ONNX
elif node.op_type == 'Flatten':
assert len(node.input) == 1
assert len(node.output) == 1
gf.flatten(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity
elif node.op_type == 'Identity':
assert len(node.input) == 1
assert len(node.output) == 1
gf.identity(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Softmax
elif node.op_type == 'Softmax':
attrs = _parse_attribute(node.attribute, {"axis": -1})
assert len(node.input) == 1
assert len(node.output) == 1
gf.softmax(ts[node.input[0]], ts[node.output[0]])  # TODO: the parsed axis is currently ignored
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu
elif node.op_type == 'Relu':
assert len(node.input) == 1
assert len(node.output) == 1
gf.relu(ts[node.input[0]], ts[node.output[0]])
# TODO: Sqrt is not implemented yet; Relu is emitted as a placeholder.
elif node.op_type == 'Sqrt':
assert len(node.input) == 1
assert len(node.output) == 1
gf.relu(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid
elif node.op_type == 'Sigmoid':
assert len(node.input) == 1
assert len(node.output) == 1
gf.sigmoid(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tanh
elif node.op_type == 'Tanh':
assert len(node.input) == 1
assert len(node.output) == 1
gf.tanh(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Abs
elif node.op_type == 'Abs':
assert len(node.input) == 1
assert len(node.output) == 1
gf.abs(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cast
# Ignore for now (TODO)
elif node.op_type == 'Cast':
assert len(node.input) == 1
assert len(node.output) == 1
gf.identity(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Constant
elif node.op_type == 'Constant':
attrs = _parse_attribute(node.attribute, {})
assert len(node.output) == 1
c = onnx.numpy_helper.to_array(attrs["value"])
if c.ndim == 0:
c = c[()]
consts[node.output[0]] = c
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm
elif node.op_type == 'Gemm':
attrs = _parse_attribute(node.attribute, {
"alpha": 1.0,
"beta": 1.0,
"transA": 0,
"transB": 0})
assert len(node.input) == 2 or len(node.input) == 3
assert len(node.output) == 1
assert attrs["alpha"] == 1.0
assert attrs["beta"] == 1.0 or len(node.input) == 2
i0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
i1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
o0 = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
gf.reshape(ts[node.input[0]], i0, [1] + list(ds[node.input[0]]))
gf.reshape(ts[node.input[1]], i1, [1] + list(ds[node.input[1]]))
gf.matmul(i0, i1, o0, attrs["transA"], attrs["transB"])
o1 = gf.tensor(ds[node.output[0]], "FLOAT")
a0 = gf.tensor(ds[node.output[0]], "FLOAT")  # placeholder standing in for the optional bias input
gf.reshape(o0, o1, ds[node.output[0]])
gf.add(o1, a0, ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#GlobalAveragePool
# elif node.op_type == 'GlobalAveragePool':
# assert len(node.input) == 1
# assert len(node.output) == 1
# dims = ds[node.input[0]]
# if len(dims) > 0:
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
# else:
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMean
elif node.op_type == 'ReduceMean':
attrs = _parse_attribute(node.attribute, {'keepdims': 1})
assert len(node.input) == 1
assert len(node.output) == 1
assert len(attrs["axes"]) == 1
axis = attrs["axes"][0]
print(axis, len(ds[node.input[0]]))
if axis < 0:
axis = len(ds[node.input[0]]) + axis
gf.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape
# Ignore for now, and no need to output anything (TODO)
# elif node.op_type == 'Shape':
# pass
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose
elif node.op_type == 'Transpose':
attrs = _parse_attribute(node.attribute, {})
assert len(node.input) == 1
assert len(node.output) == 1
assert "perm" in attrs
gf.transpose(ts[node.input[0]], ts[node.output[0]], attrs["perm"])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze
elif node.op_type == 'Unsqueeze':
assert len(node.input) == 2
assert len(node.output) == 1
gf.reshape(ts[node.input[0]],
ts[node.output[0]], ds[node.output[0]])
# TODO
elif node.op_type == 'Erf':
assert len(node.input) == 1
assert len(node.output) == 1
gf.erf(ts[node.input[0]], ts[node.output[0]])
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#BatchNormalization
# elif node.op_type == "BatchNormalization":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 5
# assert len(node.output) == 1
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
# ts[node.input[2]], ts[node.input[3]],
# ts[node.input[4]], ts[node.output[0]],
# epsilon, momentum)
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
# elif node.op_type == "Resize":
# attrs = _parse_attribute(node.attribute, {})
# assert len(node.input) == 4
# assert len(node.output) == 1
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
# [], 'FLOAT')
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
else:
assert False, "Unsupported op: " + node.op_type

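The Add/Sub/Mul/Div/Pow branches above share one pattern: if the two operand shapes match, emit the op directly; otherwise allocate a temporary with the output shape to stand in for the operand that needs broadcasting. A hypothetical refactoring sketch of that pattern (emit stands for gf.add, gf.sub, and friends; this helper is not part of the source):

    # Hypothetical helper factoring the repeated elementwise/broadcast pattern.
    def binary_broadcast(gf, emit, ts, ds, a, b, out):
        if ds[a] == ds[b]:
            emit(ts[a], ts[b], ts[out])
        elif ds[a] == ds[out]:                    # b broadcast up to out's shape
            emit(ts[a], gf.tensor(ds[out], "FLOAT"), ts[out])
        elif ds[b] == ds[out]:                    # a broadcast up to out's shape
            emit(gf.tensor(ds[out], "FLOAT"), ts[b], ts[out])
        else:
            assert False, "unsupported broadcast"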

@ -1,10 +1,11 @@
import argparse
import pyinfinitensor # import getPerfConv, getPerfMatmul
def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group):
return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
strideh, stridew, dilationh, dilationw, group)
def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
@ -13,3 +14,43 @@ def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, strid
def getPerfMatmul(b, m, n, k, name=""):
return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
def getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias: bool, act="None"):
return pyinfinitensor.getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw,
strideh, stridew, dilationh, dilationw, group, bias, act)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('op', metavar='operator', type=str)
parser.add_argument('shape', nargs='+')
parser.add_argument('--pad', type=int, default=0)
parser.add_argument('--stride', type=int, default=1)
parser.add_argument('--dilation', type=int, default=1)
parser.add_argument('--group', type=int, default=1)
parser.add_argument('--bias', type=bool, default=False)  # note: type=bool treats any non-empty string as True
parser.add_argument('--act', type=str, default="None")
args = parser.parse_args()
print(args)
if args.op == 'gemm':
t = getPerfMatmul(int(args.shape[0]), int(
args.shape[1]), int(args.shape[2]), int(args.shape[3]))
print(
f'time {t:.3f} ms, {2*int(args.shape[0])*int(args.shape[1])*int(args.shape[2])*int(args.shape[3])/t/1e9:.3f} TFLOPS')
elif args.op == 'conv':
assert len(args.shape) == 7
n, c, h, w, f, r, s = [int(v) for v in args.shape]
padh = padw = int(args.pad)
strideh = stridew = int(args.stride)
dilationh = dilationw = int(args.dilation)
group = int(args.group)
bias = int(args.bias)
act = args.act
assert group == 1, "Unsupported"
t = pyinfinitensor.getPerfConvBiasActCudnn(
n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias, act)
print(
f'time {t:.3f} ms, {n*c*h*w*f*r*s/strideh/stridew*2/t/10**9:.3f} TFLOPS')
else:
assert False, "Not supported"

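The TFLOPS figures above count 2 FLOPs (one multiply, one add) per conv MAC, i.e. 2*n*c*h*w*f*r*s/(sh*sw) in total, then divide by the runtime in milliseconds and by 1e9, since FLOPs per millisecond divided by 1e9 is exactly TFLOPS. A standalone check of the unit conversion, with made-up shape and timing values:

    # Unit check: FLOPs / t_ms / 1e9 == TFLOPS.
    n, c, h, w, f, r, s = 1, 64, 56, 56, 64, 3, 3
    strideh = stridew = 1
    t_ms = 0.1                                    # hypothetical measured time
    flops = 2 * n * c * h * w * f * r * s / (strideh * stridew)
    print(f"{flops / t_ms / 1e9:.3f} TFLOPS")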

@ -0,0 +1,84 @@
import pandas as pd
import numpy as np
from operator_timer import *
from datetime import datetime
pd.options.display.float_format = '{:,.3f}'.format
df = pd.DataFrame(columns=['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group'])
def conv_original(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group
df.loc[name, 't_original'] = getPerfConv(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group)
df.loc[name, 't_bias'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group, bias=True)
df.loc[name, 't_bias_relu'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group, bias=True, act="Relu")
def conv_rule_5x5_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_5x5_to_3x3'
if r == 5 and s == 5:
df.loc[name, col] = getPerfConv(n, c, h, w, f*4, 3, 3, ph, pw,
sh, sw, dh, dw, group)
else:
df.loc[name, col] = np.inf
def conv_rule_9x9_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_9x9_to_3x3'
if r == 9 and s == 9:
df.loc[name, col] = getPerfConv(n, c, h, w, f*9, r//3, s//3, ph, pw,
sh, sw, dh, dw, group)
else:
df.loc[name, col] = np.inf
bandwidth = 200*10**6  # 200 GB/s, expressed in bytes per millisecond
def conv_rule_conv2gemm(name, n, c, h, w, f, r, s, ph, pw,
sh, sw, dh, dw, group):
col = 't_conv2gemm'
if [sh, sw, dh, dw, group] == [1] * 5:
# b = group
# m = batch_size * input_height * input_width
# n = output_channel * kernel_height * kernel_width
# k = input_channel // group
t_reduce= group*n*h*w*f*r*s*4/bandwidth if r>1 or s>1 else 0
df.loc[name, '_'+col+'_mem'] = t_reduce
df.loc[name, col] = getPerfMatmul(group, n*h*w, f*r*s, c//group) + t_reduce
else:
df.loc[name, col] = np.inf
# conv_rules=[conv_original, conv_rule_9x9_to_3x3, conv_rule_5x5_to_3x3, conv_rule_conv2gemm]
conv_rules=[conv_original]
def conv_transposed2d_original(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group
df.loc[name, 't_original'] = getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group)
def conv_transposed2d_togemm(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
col = 't_conv2gemm'
if [dh, dw, group] == [1] * 3:
# ConvTransose2gemm
# b = 1
# m = batch_size * input_height*input_width
# n = output_channel*kernel_height*kernel_width
# k = input_channel
t_reduce= n*h*w*c*r*s*4/bandwidth if r>1 or s>1 else 0
df.loc[name, '_'+col+'_mem'] = t_reduce
print('t_conv2gemm', group, n*h*w, c*r*s, f)
df.loc[name, col] = getPerfMatmul(group, n*h*w, c*r*s, f) + t_reduce
else:
df.loc[name, col] = np.inf
conv_transposed2d_rules = [conv_transposed2d_original, conv_transposed2d_togemm]
def print_result(model_fn):
pd.set_option('display.max_rows', 500)
df['t_min'] = df.filter(regex=("^t_.*")).min(axis=1)
print(df)
print(f'Origin: {df["t_original"].sum():.3f} ms')
print(f'Min: {df["t_min"].sum():.3f} ms')
print(f'Speedup: {df["t_original"].sum()/df["t_min"].sum():.3f} x')
df.to_pickle(f'optime_{model_fn.split("/")[-1]}_{datetime.now().strftime("%m_%d_%H_%M_%S")}.pkl')

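conv_rule_conv2gemm charges the post-GEMM reduction purely as memory traffic: group*n*h*w*f*r*s float32 partial sums at 4 bytes each, streamed at bandwidth bytes per millisecond, so the quotient is directly a time in milliseconds. A standalone check with made-up dimensions:

    # Units: bytes / (bytes per ms) = ms.
    bandwidth = 200 * 10**6                       # 200 GB/s in bytes per ms
    group, n, h, w, f, r, s = 1, 1, 56, 56, 64, 3, 3
    t_reduce_ms = group * n * h * w * f * r * s * 4 / bandwidth
    print(f"reduction traffic: {t_reduce_ms:.3f} ms")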
python/test/run_test.py

@ -0,0 +1,5 @@
import sys
import pytest
if __name__ == "__main__":
sys.exit(pytest.main())


@ -0,0 +1,16 @@
from pyinfinitensor import *
from infinitensor import import_onnx
class Test_ImportOnnx:
def test_Netname(self):
runtime = CpuRuntimeObj.getInstance()
graphBuilder = GraphBuilderObj(runtime)
import_onnx(graphBuilder, '/home/mazx/git/pf-models/bert.bs1.onnx')
class Test_SARDRN:
def test_Netname(self):
runtime = CpuRuntimeObj.getInstance()
graphBuilder = GraphBuilderObj(runtime)
import_onnx(graphBuilder, '/home/mazx/git/pf-models/sardrn.bs1.onnx')

src/core/graph_builder.cc

@ -0,0 +1,545 @@
#include "core/graph_builder.h"
namespace infini {
Tensor GraphBuilderObj::tensor(Shape dim, const std::string &dtype) {
if (dtype == "FLOAT") {
return g->addTensor(dim, DataType::Float32);
}
if (dtype == "INT32") {
return g->addTensor(dim, DataType::UInt32);
}
if (dtype == "INT64") {
return g->addTensor(dim, DataType::UInt32);
}
IT_TODO_HALT_MSG("Unsupported data type");
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw,
Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ConvObj>(i0, w0, o0, ph, ph, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, int ph, int pw,
int sh, int sw, int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvObj>(i0, w0, nullptr, ph, ph, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ConvObj>(i0, w0, o0, pm, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::conv(Tensor input, Tensor weight,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, Tensor bias) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvObj>(i0, w0, nullptr, pm, sh, sw, dh, dw, bias);
return op;
}
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, Tensor C, bool transA,
bool transB) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op = g->addOpWithOutputs<MatmulObj>(i0, i1, o0, transA, transB);
return op;
}
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, bool transA, bool transB) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<MatmulObj>(i0, i1, nullptr, transA, transB);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh,
int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
i0, w0, o0, ph, pw, sh, sw, dh, dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, int ph, int pw,
int sh, int sw, int dh, int dw, int oph,
int opw, int group, Tensor bias,
ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, ph, pw, sh, sw, dh,
dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
i0, w0, o0, pm, sh, sw, dh, dw, oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight,
ConvBaseObj::PaddingMode pm, int sh, int sw,
int dh, int dw, int oph, int opw, int group,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, pm, sh, sw, dh, dw,
oph, opw, group, bias, act);
return op;
}
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, Tensor C, const int width,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op =
g->addOpWithOutputs<G2BMMObj>(i0, i1, o0, width, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, const int width,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<G2BMMObj>(i0, i1, nullptr, width, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, Tensor C,
const int dilation, Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
auto op = g->addOpWithOutputs<GBMMObj>(i0, i1, o0, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, const int dilation,
Tensor bias, ActType act) {
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
auto op = g->addOp<GBMMObj>(i0, i1, nullptr, dilation, bias, act);
return op;
}
Operator GraphBuilderObj::pad(Tensor input, Tensor output,
const vector<int> &pads,
const optional<const vector<int>> &axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<PadObj>(i0, o0, pads, axis);
return op;
}
Operator GraphBuilderObj::pad(Tensor input, const vector<int> &pads,
const optional<const vector<int>> &axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<PadObj>(i0, nullptr, pads, axis);
return op;
}
Operator GraphBuilderObj::slice(Tensor input, Tensor output,
const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SliceObj>(i0, o0, starts, ends, axis, steps);
return op;
}
Operator GraphBuilderObj::slice(Tensor input, const vector<int> &starts,
const vector<int> &ends,
const optional<const vector<int>> &axis,
const optional<const vector<int>> &steps) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SliceObj>(i0, nullptr, starts, ends, axis, steps);
return op;
}
Operator GraphBuilderObj::concat(TensorVec inputs, Tensor output, int dim) {
TensorVec is;
for (auto input : inputs) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
Tensor o = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ConcatObj>(is, o, dim);
return op;
}
Operator GraphBuilderObj::concat(TensorVec inputs, int dim) {
TensorVec is;
for (auto input : inputs) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
auto op = g->addOp<ConcatObj>(is, nullptr, dim);
return op;
}
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
int dim, int num) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
if (outputs.has_value()) {
TensorVec os;
for (auto output : outputs.value()) {
Tensor o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, num);
return op;
} else {
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
return op;
}
}
Operator GraphBuilderObj::split(Tensor input, int dim, int num) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
return op;
}
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
int dim, const vector<int> &ratio) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
if (outputs.has_value()) {
TensorVec os;
for (auto output : outputs.value()) {
Tensor o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, ratio);
return op;
} else {
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
return op;
}
}
Operator GraphBuilderObj::split(Tensor input, int dim,
const vector<int> &ratio) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
return op;
}
Operator GraphBuilderObj::transpose(Tensor input, Tensor output,
const Shape &perm) {
Tensor i = g->addTensor(input->getDims(), input->getDType());
Tensor o = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<TransposeObj>(i, o, perm);
return op;
}
Operator GraphBuilderObj::extend(Tensor input, Tensor output, int dim,
int num) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ExtendObj>(i0, o0, dim, num);
return op;
}
Operator GraphBuilderObj::extend(Tensor input, int dim, int num) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ExtendObj>(i0, nullptr, dim, num);
return op;
}
Operator GraphBuilderObj::maxpool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<MaxPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::maxpool(Tensor input, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<MaxPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::avgpool(Tensor input, Tensor output, int kh, int kw,
int dh, int dw, int ph, int pw, int sh,
int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<AvgPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::avgpool(Tensor input, int kh, int kw, int dh, int dw,
int ph, int pw, int sh, int sw) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<AvgPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
return op;
}
Operator GraphBuilderObj::add(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<AddObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::add(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<AddObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1, Tensor output) {
std::cout << "Sub1" << std::endl;
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SubObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1) {
std::cout << "Sub2" << std::endl;
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<SubObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<MulObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<MulObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::div(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<DivObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::div(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<DivObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1, Tensor output) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<PowObj>(i0, i1, o0);
return op;
}
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1) {
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
auto op = g->addOp<PowObj>(i0, i1, nullptr);
return op;
}
Operator GraphBuilderObj::gather(Tensor input, Tensor index, Tensor output,
int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<GatherObj>(i0, index, o0, axis);
return op;
}
Operator GraphBuilderObj::gather(Tensor input, Tensor index, int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<GatherObj>(i0, index, nullptr, axis);
return op;
}
Operator GraphBuilderObj::reshape(Tensor input, Tensor output,
const Shape &dims) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ReshapeObj>(i0, o0, dims);
return op;
}
Operator GraphBuilderObj::reshape(Tensor input, const Shape &dims) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ReshapeObj>(i0, nullptr, dims);
return op;
}
Operator GraphBuilderObj::flatten(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<FlattenObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::flatten(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<FlattenObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::identity(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<IdentityObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::identity(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<IdentityObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::softmax(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SoftmaxObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::softmax(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SoftmaxObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::relu(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ReluObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::relu(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<ReluObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::sigmoid(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<SigmoidObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::sigmoid(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<SigmoidObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::tanh(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<TanhObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::tanh(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<TanhObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::abs(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<AbsObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::abs(Tensor input) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
auto op = g->addOp<AbsObj>(i0, nullptr);
return op;
}
Operator GraphBuilderObj::reduceMean(Tensor input, Tensor output, int axis) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op =
g->addOpWithOutputs<ReduceMeanObj>(i0, o0, std::vector<int>({axis}));
return op;
}
Operator GraphBuilderObj::erf(Tensor input, Tensor output) {
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
auto op = g->addOpWithOutputs<ErfObj>(i0, o0);
return op;
}
Operator GraphBuilderObj::memBound(const TensorVec &inputs,
const TensorVec &outputs,
const std::vector<nnet::Tensor> &nnetInputs,
nnet::Expr expr, double exec_time,
std::string hint) {
TensorVec is;
for (auto input : inputs) {
auto i = g->addTensor(input->getDims(), input->getDType());
is.push_back(i);
}
TensorVec os;
for (auto output : outputs) {
auto o = g->addTensor(output->getDims(), output->getDType());
os.push_back(o);
}
auto op = g->addOpWithOutputs<MemBoundObj>(is, os, nnetInputs, expr,
exec_time, hint);
return op;
}
} // namespace infini
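Taken together, these overloads let a caller chain operators without pre-allocating output tensors. A minimal usage sketch follows (hypothetical shapes; the dtype string and make_ref construction are assumptions, not part of this diff):
Runtime cpu = CpuRuntimeObj::getInstance();
GraphBuilder builder = make_ref<GraphBuilderObj>(cpu);
Tensor a = builder->tensor({1, 64, 128}, "FLOAT"); // dtype name assumed
Tensor b = builder->tensor({1, 64, 128}, "FLOAT");
Operator sum = builder->add(a, b);              // output tensor inferred
Operator act = builder->relu(sum->getOutput()); // chain via the op's output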

View File

@ -1,3 +1,4 @@
#include "cuda/operator_timer.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
@ -12,8 +13,23 @@ namespace opTimer {
double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
const char *name) {
int dilationh, int dilationw, int group) {
return getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh,
stridew, dilationh, dilationw, group, false,
"None");
}
double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
int padh, int padw, int strideh, int stridew,
int dilationh, int dilationw, int group,
bool bias, string actName) {
ActType act = ActType::None;
if (actName == "None")
act = ActType::None;
else if (actName == "Relu")
act = ActType::Relu;
else
IT_ASSERT(false, "Unsupported activation");
// const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
// dilationh, dilationw, group] =
// tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
@ -25,17 +41,27 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
IT_ASSERT(c % group == 0);
Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
Tensor b0Cpu = gCpu->addTensor({f}, DataType::Float32);
// Malloc data for all tensors in a graph. Do we need implicit allocation?
gCpu->dataMalloc();
i0Cpu->setData(IncrementalGenerator());
w0Cpu->setData(IncrementalGenerator());
b0Cpu->setData(IncrementalGenerator());
// Copy input tensors from CPU to CUDA
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
Tensor b0Cuda = gCuda->cloneTensor(b0Cpu);
// Build CUDA graph
auto conv = gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw,
strideh, stridew, dilationh, dilationw);
if (!bias) {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
stridew, dilationh, dilationw);
} else {
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
stridew, dilationh, dilationw, b0Cuda, act);
}
// allocate CUDA memory
gCuda->dataMalloc();
// Execute on CUDA

View File

@ -2,6 +2,7 @@
#ifdef USE_CUDA
#include "cuda/operator_timer.h"
#endif
#include "core/graph_builder.h"
namespace py = pybind11;
namespace infini {
@ -13,11 +14,193 @@ void register_operator_timer(py::module &m) {
#ifdef USE_CUDA
using namespace opTimer;
m.def("getPerfConvCudnn", &getPerfConvCudnn);
m.def("getPerfConvBiasActCudnn", &getPerfConvBiasActCudnn);
m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
#endif
}
void init_graph_builder(py::module &m) {
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "RuntimeObj");
py::class_<CpuRuntimeObj, std::shared_ptr<CpuRuntimeObj>, RuntimeObj>(
m, "CpuRuntimeObj")
.def(py::init<>())
.def("getInstance", py::overload_cast<>(&CpuRuntimeObj::getInstance),
policy::reference_internal);
py::class_<Shape>(m, "Shape");
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "TensorObj");
py::class_<Tensor>(m, "Tensor");
py::class_<TensorVec>(m, "TensorVec");
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "OperatorObj");
py::class_<Operator>(m, "Operator");
py::class_<ActType>(m, "ActType");
py::class_<ConvObj, std::shared_ptr<ConvObj>, OperatorObj>(m, "ConvObj");
py::class_<MatmulObj, std::shared_ptr<MatmulObj>, OperatorObj>(m,
"MatmulObj");
py::class_<ConvTransposed2dObj, std::shared_ptr<ConvTransposed2dObj>,
OperatorObj>(m, "ConvTransposed2dObj");
py::class_<G2BMMObj, std::shared_ptr<G2BMMObj>, OperatorObj>(m, "G2BMMObj");
py::class_<GBMMObj, std::shared_ptr<GBMMObj>, OperatorObj>(m, "GBMMObj");
py::class_<PadObj, std::shared_ptr<PadObj>, OperatorObj>(m, "PadObj");
py::class_<SliceObj, std::shared_ptr<SliceObj>, OperatorObj>(m, "SliceObj");
py::class_<ConcatObj, std::shared_ptr<ConcatObj>, OperatorObj>(m,
"ConcatObj");
py::class_<SplitObj, std::shared_ptr<SplitObj>, OperatorObj>(m, "SplitObj");
py::class_<TransposeObj, std::shared_ptr<TransposeObj>, OperatorObj>(
m, "TransposeObj");
py::class_<ExtendObj, std::shared_ptr<ExtendObj>, OperatorObj>(m,
"ExtendObj");
py::class_<MaxPoolObj, std::shared_ptr<MaxPoolObj>, OperatorObj>(
m, "MaxPoolObj");
py::class_<AvgPoolObj, std::shared_ptr<AvgPoolObj>, OperatorObj>(
m, "AvgPoolObj");
py::class_<AddObj, std::shared_ptr<AddObj>, OperatorObj>(m, "AddObj");
py::class_<SubObj, std::shared_ptr<SubObj>, OperatorObj>(m, "SubObj");
py::class_<MulObj, std::shared_ptr<MulObj>, OperatorObj>(m, "MulObj");
py::class_<DivObj, std::shared_ptr<DivObj>, OperatorObj>(m, "DivObj");
py::class_<PowObj, std::shared_ptr<PowObj>, OperatorObj>(m, "PowObj");
py::class_<GatherObj, std::shared_ptr<GatherObj>, OperatorObj>(m,
"GatherObj");
py::class_<ReshapeObj, std::shared_ptr<ReshapeObj>, OperatorObj>(
m, "ReshapeObj");
py::class_<FlattenObj, std::shared_ptr<FlattenObj>, OperatorObj>(
m, "FlattenObj");
py::class_<IdentityObj, std::shared_ptr<IdentityObj>, OperatorObj>(
m, "IdentityObj");
py::class_<SoftmaxObj, std::shared_ptr<SoftmaxObj>, OperatorObj>(
m, "SoftmaxObj");
py::class_<ReluObj, std::shared_ptr<ReluObj>, OperatorObj>(m, "ReluObj");
py::class_<SigmoidObj, std::shared_ptr<SigmoidObj>, OperatorObj>(
m, "SigmoidObj");
py::class_<TanhObj, std::shared_ptr<TanhObj>, OperatorObj>(m, "TanhObj");
py::class_<AbsObj, std::shared_ptr<AbsObj>, OperatorObj>(m, "AbsObj");
py::class_<ReduceMeanObj, std::shared_ptr<ReduceMeanObj>, OperatorObj>(
m, "ReduceMeanObj");
py::class_<MemBoundObj, std::shared_ptr<MemBoundObj>, OperatorObj>(
m, "MemBoundObj");
py::class_<GraphBuilder>(m, "GraphBuilder");
py::class_<GraphBuilderObj>(m, "GraphBuilderObj")
.def(py::init<Runtime>())
.def("tensor",
py::overload_cast<Shape, const std::string &>(
&GraphBuilderObj::tensor),
policy::reference_internal)
.def("conv",
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
int, Tensor>(&GraphBuilderObj::conv),
policy::reference_internal)
.def("matmul",
py::overload_cast<Tensor, Tensor, Tensor, bool, bool>(
&GraphBuilderObj::matmul),
policy::reference_internal)
.def("convTrans",
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
int, int, int, int, Tensor, ActType>(
&GraphBuilderObj::convTrans),
policy::reference_internal)
.def("g2bmm",
py::overload_cast<Tensor, Tensor, Tensor, const int, const int,
Tensor, ActType>(&GraphBuilderObj::g2bmm),
policy::reference_internal)
.def("gbmml",
py::overload_cast<Tensor, Tensor, Tensor, const int, Tensor,
ActType>(&GraphBuilderObj::gbmml),
policy::reference_internal)
.def("pad",
py::overload_cast<Tensor, Tensor, const vector<int> &,
const optional<const vector<int>> &>(
&GraphBuilderObj::pad),
policy::reference_internal)
.def("slice",
py::overload_cast<Tensor, Tensor, const vector<int> &,
const vector<int> &,
const optional<const vector<int>> &,
const optional<const vector<int>> &>(
&GraphBuilderObj::slice),
policy::reference_internal)
.def(
"concat",
py::overload_cast<TensorVec, Tensor, int>(&GraphBuilderObj::concat),
policy::reference_internal)
.def("split",
py::overload_cast<Tensor, std::optional<TensorVec>, int, int>(
&GraphBuilderObj::split),
policy::reference_internal)
.def("transpose",
py::overload_cast<Tensor, Tensor, const vector<int> &>(
&GraphBuilderObj::transpose),
policy::reference_internal)
.def("extend",
py::overload_cast<Tensor, Tensor, int, int>(
&GraphBuilderObj::extend),
policy::reference_internal)
.def("maxpool",
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
int, int>(&GraphBuilderObj::maxpool),
policy::reference_internal)
.def("avgpool",
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
int, int>(&GraphBuilderObj::avgpool),
policy::reference_internal)
.def("add",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::add),
policy::reference_internal)
.def("sub",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::sub),
policy::reference_internal)
.def("mul",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::mul),
policy::reference_internal)
.def("div",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::div),
policy::reference_internal)
.def("pow",
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::pow),
policy::reference_internal)
.def("gather",
py::overload_cast<Tensor, Tensor, Tensor, int>(
&GraphBuilderObj::gather),
policy::reference_internal)
.def("reshape",
py::overload_cast<Tensor, Tensor, const Shape &>(
&GraphBuilderObj::reshape),
policy::reference_internal)
.def("flatten",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::flatten),
policy::reference_internal)
.def("identity",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::identity),
policy::reference_internal)
.def("softmax",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::softmax),
policy::reference_internal)
.def("relu", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::relu),
policy::reference_internal)
.def("sigmoid",
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::sigmoid),
policy::reference_internal)
.def("tanh", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::tanh),
policy::reference_internal)
.def("abs", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::abs),
policy::reference_internal)
.def("reduceMean",
py::overload_cast<Tensor, Tensor, int>(
&GraphBuilderObj::reduceMean),
policy::reference_internal)
.def("erf", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::erf),
policy::reference_internal)
.def("memBound",
py::overload_cast<const TensorVec &, const TensorVec &,
const std::vector<nnet::Tensor> &, nnet::Expr,
double, std::string>(&GraphBuilderObj::memBound),
policy::reference_internal);
}
} // namespace infini
PYBIND11_MODULE(pyinfinitensor, m) { infini::register_operator_timer(m); }
PYBIND11_MODULE(pyinfinitensor, m) {
infini::register_operator_timer(m);
infini::init_graph_builder(m);
}

View File

@ -8,6 +8,8 @@
namespace infini {
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
int kernel =
0; // 0 cudnnConvolutionForward, 1 cudnnConvolutionBiasActivationForward
int algo = 0; // cudnnConvolutionFwdAlgo_t
int mode = 1;
size_t workspaceSize = 100000;
@ -56,8 +58,6 @@ class convCudnn : public Kernel {
const ConvCuDnnPerfRecord &record) const {
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
if (op->getInputs().size() > 2) // Bias is not supported yet
IT_TODO_HALT();
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
@ -209,6 +209,36 @@ class convCudnn : public Kernel {
return true;
}
bool cuDNNfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
const CudaRuntimeObj *context) const {
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, record);
size_t wsSize = record->workspaceSize;
CudaPtr wsData = context->getWorkspace(wsSize);
float alpha = 1.f, beta = 0.f;
// w/ bias & act; the fused path assumes a bias tensor is present,
// matching tuneFused below
void *const biasData = op->getBias()->getRawDataPtr<void *>();
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
convDesc, ALGOS[record->algo], wsData, wsSize, &beta, outDesc,
outData, biasDesc, biasData, actDesc, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
return false;
// Destroying descriptors in CUDA does not require a sync, but cuDNN does
// not state whether a sync is required before destruction.
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
return true;
}
void compute(const Operator &op, const RuntimeObj *context) const override {
auto record = make_ref<ConvCuDnnPerfRecordObj>(); // with parameters in
// default ctor
@ -217,10 +247,88 @@ class convCudnn : public Kernel {
PerfRecord tune(const Operator &_op,
const RuntimeObj *_context) const override {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
auto op = as<ConvObj>(_op);
printf("%s\n", op->toString().c_str());
if (op->hasBias() || op->getAct() != ActType::None)
return tuneFused(op, context);
else
return tuneUnfused(op, context);
}
PerfRecord tuneFused(const Ref<ConvObj> &op,
const CudaRuntimeObj *context) const {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
for (int algo = 0; algo < N_ALGO; algo++) {
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
auto &record = *recordRef;
record.mode = mode;
record.algo = algo;
cudnnStatus_t stat;
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
convDesc, actDesc, outDesc] =
createCuDNNDescriptor(op, recordRef);
void *biasData = op->getBias()->getRawDataPtr<void *>();
// get workspace
stat = cudnnGetConvolutionForwardWorkspaceSize(
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
ALGOS[record.algo], &record.workspaceSize);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
if (record.workspaceSize > context->getWorkspaceSize())
continue;
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
float alpha = 1.f, beta = 0.f;
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
knData, convDesc, ALGOS[record.algo], wsData,
record.workspaceSize, &beta, outDesc, outData, biasDesc,
biasData, actDesc, outDesc, outData);
if (stat != CUDNN_STATUS_SUCCESS)
continue;
record.time = timeit(
[&]() {
stat = cudnnConvolutionBiasActivationForward(
context->cudnnHandle(), &alpha, inDesc, inData,
knDesc, knData, convDesc, ALGOS[record.algo],
wsData, record.workspaceSize, &beta, outDesc,
outData, biasDesc, biasData, actDesc, outDesc,
outData);
},
[&]() { context->sync(); });
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
record.time);
// Update the tune result
if (ret.time > record.time)
ret = record;
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
}
}
printf("tuneFused: the best algo is %d, the best conv mode is %d\n",
ret.algo, ret.mode);
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");
return make_ref<ConvCuDnnPerfRecordObj>(ret);
}
PerfRecord tuneUnfused(const Ref<ConvObj> &op,
const CudaRuntimeObj *context) const {
ConvCuDnnPerfRecordObj ret;
ret.time = std::numeric_limits<double>::max();
// Both modes have the same performance. Only run cross-correlation.
for (int mode = 1; mode < 2; mode++) {
// Try every possible algorithm of convolution
@ -260,7 +368,8 @@ class convCudnn : public Kernel {
&beta, outDesc, outData);
},
[&]() { context->sync(); });
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
record.time);
// Update the tune result
if (ret.time > record.time)
@ -273,8 +382,8 @@ class convCudnn : public Kernel {
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
}
}
// printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
// ret.mode);
printf("tuneUnfused: the best algo is %d, the best conv mode is %d\n",
ret.algo, ret.mode);
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
"algorithm "
"found");

View File

@ -1,4 +1,3 @@
#include "core/common.h"
#include "core/constants.h"
#include "cuda/cuda_common.h"
#include <math.h>

View File

@ -5,15 +5,15 @@ namespace infini {
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
int ph, int pw, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD)
const Tensor &weightInConvFWD, const ActType act)
: OperatorObj(opType, inputs, {output}), ph(ph), pw(pw), sh(sh), sw(sw),
dh(dh), dw(dw), padding(PaddingMode::Other) {}
dh(dh), dw(dw), padding(PaddingMode::Other), act(act) {}
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
PaddingMode mode, int sh, int sw, int dh, int dw,
const Tensor &inputInConvFWD,
const Tensor &weightInConvFWD)
const Tensor &weightInConvFWD, const ActType act)
: OperatorObj(opType, inputs, {output}), ph(-1), pw(-1), sh(sh), sw(sw),
dh(dh), dw(dw), padding(mode) {
dh(dh), dw(dw), padding(mode), act(act) {
IT_ASSERT(mode != PaddingMode::Other);
}
@ -21,28 +21,60 @@ string ConvBaseObj::toString() const {
std::ostringstream os;
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
os << "(";
if (inputs.size() == 2) {
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
os << vecToString(inputs[0]->getDims()) << ",";
os << vecToString(inputs[1]->getDims()) << ",";
if (inputs.size() > 2) {
os << vecToString(inputs[2]->getDims()) << ",";
}
os << "p=[" << ph << "," << pw << "],";
os << "s=[" << sh << "," << sw << "],";
os << "d=[" << dh << "," << dw << "],";
os << "act=" << enum_to_underlying(getAct()) << ",";
// os << "act=" << enum_to_underlying(act) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "weight=" << inputs[1]->getGuid() << ",";
os << "bias="
<< ((inputs.size() == 2) ? "nullptr"
: std::to_string(inputs[2]->getGuid()))
<< ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> ConvBaseObj::getWorkloadVector() const {
return {
enum_to_underlying(type), n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
return {enum_to_underlying(type),
n,
c,
h,
w,
f,
r,
s,
ph,
pw,
sh,
sw,
dh,
dw,
hasBias(),
enum_to_underlying(getAct())};
}
vector<int> ConvBaseObj::getOpAttrVector() const {
IT_TODO_HALT(); // should padding mode / ph+pw be in attrs?
return {enum_to_underlying(type), c, f, r, s, ph, pw, sh, sw, dh, dw};
return {enum_to_underlying(type),
c,
f,
r,
s,
ph,
pw,
sh,
sw,
dh,
dw,
hasBias(),
enum_to_underlying(getAct())};
}
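// Including hasBias() and the activation in both the workload and attribute
// keys keeps fused convs (bias and/or act) from sharing perf records or
// kernel matches with a plain conv of the same shape.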
void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
@ -64,11 +96,10 @@ void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::Conv, {input, weight}, output, ph, pw, sh, sw, dh, dw,
input, weight),
act(act) {
if (bias)
IT_TODO_HALT();
: ConvBaseObj(OpType::Conv,
((bias) ? (TensorVec{input, weight, bias})
: (TensorVec{input, weight})),
output, ph, pw, sh, sw, dh, dw, input, weight, act) {
setAuxilaryAttributes(PaddingMode::Other);
IT_ASSERT(checkValid(graph));
}
@ -76,11 +107,10 @@ ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
ActType act)
: ConvBaseObj(OpType::Conv, {input, weight}, output, mode, sh, sw, dh, dw,
input, weight),
act(act) {
if (bias)
IT_TODO_HALT();
: ConvBaseObj(OpType::Conv,
((bias) ? (TensorVec{input, weight, bias})
: (TensorVec{input, weight})),
output, mode, sh, sw, dh, dw, input, weight, act) {
setAuxilaryAttributes(mode);
IT_ASSERT(checkValid(graph));
}
@ -98,6 +128,11 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
// For NCHW+FCRS layout, C of input is divisible by C of weight
if (input->getDims()[1] % weight->getDims()[1] != 0)
return {};
// check bias shape
if (inputs.size() == 3) {
if (inputs[2]->size() != (size_t)f)
return {};
}
// Set padding size
if (padding == PaddingMode::Other) {
oh = (h - (r - sh) * dh + ph * 2) / sh;
@ -122,8 +157,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
int oph, int opw, int group,
Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, ph, pw, sh, sw,
dh, dw, output, weight),
oph(oph), opw(opw), group(group), act(act) {
dh, dw, output, weight, act),
oph(oph), opw(opw), group(group) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(PaddingMode::Other);
@ -136,8 +171,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
int dh, int dw, int oph, int opw,
int group, Tensor bias, ActType act)
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, mode, sh, sw, dh,
dw, output, weight),
oph(oph), opw(opw), group(group), act(act) {
dw, output, weight, act),
oph(oph), opw(opw), group(group) {
if (bias)
IT_TODO_HALT();
setAuxilaryAttributes(mode);
@ -156,6 +191,8 @@ ConvTransposed2dObj::inferShape(const TensorVec &inputs) const {
auto s = weight->getDims()[3];
if (f != weight->getDims()[0])
return {};
if (inputs.size() != 2)
IT_TODO_HALT();
int on = n, oc = c * group;
int oh = 0, ow = 0;

View File

@ -11,6 +11,7 @@ optional<vector<Shape>>
ElementWiseObj::inferShape(const TensorVec &inputs) const {
// For now, we only process identical dims here; broadcast will be
// considered in the opt layer.
const auto A = inputs[0], B = inputs[1];
if (A->getDims().size() != B->getDims().size() ||
A->getDims() != B->getDims())

View File

@ -12,6 +12,7 @@ ReduceMeanObj::ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
int idx = (*_axis)[j];
if (idx < 0)
IT_TODO_HALT();
IT_ASSERT((size_t)idx < input->getDims().size());
axis.emplace(idx);
}

View File

@ -0,0 +1,52 @@
#include "operators/transpose.h"
namespace infini {
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
const Shape &perm)
: OperatorObj(OpType::Transpose, {input}, {output}), perm(perm) {
IT_ASSERT(checkValid(graph));
}
optional<vector<Shape>>
TransposeObj::inferShape(const TensorVec &inputs) const {
const Shape &dimsIn = inputs[0]->getDims();
Shape dimsOut;
std::unordered_set<size_t> dimSet;
for (size_t i = 0; i < perm.size(); ++i) {
if (size_t(perm[i]) >= dimsIn.size() ||
dimSet.find(perm[i]) != dimSet.end()) {
// reject out-of-range or duplicated axes in perm
return {};
}
dimsOut.emplace_back(dimsIn[perm[i]]);
dimSet.emplace(perm[i]);
}
std::cout << "transpose Ok" << std::endl;
return {{dimsOut}};
}
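// Example: with perm = {0, 2, 3, 1}, an input of dims {1, 64, 28, 28}
// infers output dims {1, 28, 28, 64}, since dimsOut[i] = dimsIn[perm[i]].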
std::string TransposeObj::toString() const {
std::ostringstream os;
os << "Transpose[" << getGuid() << "]";
os << "(";
os << vecToString(inputs[0]->getDims()) << ",";
os << "perm=" << vecToString(perm) << ",";
os << "input=" << inputs[0]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ")";
return os.str();
}
vector<int> TransposeObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.insert(ret.end(), perm.begin(), perm.end());
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
vector<int> TransposeObj::getOpAttrVector() const {
vector<int> ret = perm;
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
} // namespace infini

View File

@ -0,0 +1,6 @@
project(PowerFusion)
aux_source_directory(. SRC_CXX)
aux_source_directory(micro_kernel/. SRC_CXX)
cuda_add_library(pf_base SHARED ${SRC_CXX})
set_target_properties(pf_base PROPERTIES CUDA_ARCHITECTURES "80")

27
src/pfusion/code.cpp Normal file
View File

@ -0,0 +1,27 @@
#include <iostream>
#include "code.h"
#include "memory_operator.h"
namespace memb {
std::string MemoryOperator::generate() {
CodeBuffer buf;
std::string inst;
buf.emit("#pragma unroll");
buf.emit("for (int inst_idx = 0; inst_idx < " + num + "; inst_idx++) {");
if (opType == READ) {
inst = "reg[inst_idx] = " + ptr.base_ptr + "[" + ptr.offset + " + " +
offset + "];";
buf.emit(inst);
} else if (opType == WRITE) {
inst = ptr.base_ptr + "[" + ptr.offset + " + " + offset +
"] = " + "reg[inst_idx];";
buf.emit(inst);
} else {
std::cout << "[ERROR]" << std::endl;
exit(-1);
}
buf.emit("}");
return buf.toString();
}
} // namespace memb
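For reference, a READ operator with num = "8", base pointer "src", pointer offset "off", and offset "inst_idx * 32 + lane_id" emits roughly the following text (a sketch; exact whitespace depends on CodeBuffer):
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
reg[inst_idx] = src[off + inst_idx * 32 + lane_id];
}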

90
src/pfusion/gen_relu.cpp Normal file
View File

@ -0,0 +1,90 @@
#include "codegen.h"
#include "memory_operator.h"
#include "micro_kernel/element.h"
#include "micro_kernel/transpose.h"
namespace memb {
std::string gen_relu(int dimn, int dimx, int dimy, int nblocks, int nthreads) {
std::string code = "\n";
MicroKernelElement relu("relu", "return element > 0 ? element : 0");
code += relu.gen_func();
code += "\n";
code += "// Kernel\n";
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
" + warp_id;\n";
code += "float reg[32];\n";
code += "__shared__ float smem[32 * 32 * 2 * " +
std::to_string(nthreads / 32) + "];\n";
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
"; loop_idx++) {\n";
MemoryOperator dram_read;
dram_read.memoryType = MemoryOperator::DRAM;
dram_read.opType = MemoryOperator::READ;
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
dram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_read.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_read.reg = "inst_idx";
code += dram_read.generate();
MemoryOperator sram_write;
sram_write.memoryType = MemoryOperator::SRAM;
sram_write.opType = MemoryOperator::WRITE;
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
sram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_write.offset = "inst_idx * 32 + lane_id";
sram_write.reg = "inst_idx";
code += sram_write.generate();
code += relu.gen_kernel(Ptr("smem", "warp_id * 32 * 32 * 2"),
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"), 32,
32, "32");
MemoryOperator sram_read;
sram_read.memoryType = MemoryOperator::SRAM;
sram_read.opType = MemoryOperator::READ;
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
sram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_read.offset = "inst_idx * 32 + lane_id";
sram_read.reg = "inst_idx";
code += sram_read.generate();
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
code += "}\n";
code += "}\n\n";
code += "void transpose(float *src, float *dst) {\n";
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
} // namespace memb
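Concretely, for dimx = dimy = 32 and nthreads = 128 the generator above produces a kernel of roughly this shape (a condensed sketch; the real emission routes every stage through the MemoryOperator and MicroKernelElement helpers):
__global__ void kernel_tmp(float *src, float *dst) {
int lane_id = threadIdx.x % 32;
int warp_id = threadIdx.x / 32;
int parallel_idx = blockIdx.x * 4 + warp_id;
float reg[32];
__shared__ float smem[32 * 32 * 2 * 4];
// One 32x32 tile per iteration: DRAM -> reg -> smem, apply relu,
// then smem -> reg -> DRAM.
for (int loop_idx = 0; loop_idx < 1; loop_idx++) {
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
reg[inst_idx] = src[parallel_idx * 32 * 32 + inst_idx * 32 + lane_id];
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id] =
reg[inst_idx] > 0 ? reg[inst_idx] : 0;
#pragma unroll
for (int inst_idx = 0; inst_idx < 32; inst_idx++)
dst[parallel_idx * 32 * 32 + inst_idx * 32 + lane_id] =
smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id];
}
}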

View File

@ -0,0 +1,93 @@
#include "codegen.h"
#include "memory_operator.h"
#include "micro_kernel/transpose.h"
namespace memb {
std::string header =
std::string("// Header code\n") + std::string("#include \"cuda.h\"\n") +
std::string("#include \"cuda_utils.h\"\n\n") +
std::string("#define ROUND_UP(n, b) (((n) - 1) / (b) + 1)\n") +
std::string("#define INDEX(x, y, n) ((x) * (n) + (y))\n") +
std::string("#define MIN(x, y) ((x) < (y) ? (x) : (y))\n\n");
std::string gen_lowest_basic(int dimn, int dimx, int dimy, int nblocks,
int nthreads) {
std::string code = header;
code += "// Kernel\n";
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
code += "int lane_id = threadIdx.x % 32;\n";
code += "int warp_id = threadIdx.x / 32;\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
" + warp_id;\n";
code += "float reg[32];\n";
code += "__shared__ float smem[32 * 32 * 2 * " +
std::to_string(nthreads / 32) + "];\n";
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
"; loop_idx++) {\n";
MemoryOperator dram_read;
dram_read.memoryType = MemoryOperator::DRAM;
dram_read.opType = MemoryOperator::READ;
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
dram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_read.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_read.reg = "inst_idx";
code += dram_read.generate();
MemoryOperator sram_write;
sram_write.memoryType = MemoryOperator::SRAM;
sram_write.opType = MemoryOperator::WRITE;
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
sram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_write.offset = "inst_idx * 32 + lane_id";
sram_write.reg = "inst_idx";
code += sram_write.generate();
MicroKernelTranspose transpose;
code += transpose.generate(Ptr("smem", "warp_id * 32 * 32 * 2"),
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"),
32, "32", 32, "32");
MemoryOperator sram_read;
sram_read.memoryType = MemoryOperator::SRAM;
sram_read.opType = MemoryOperator::READ;
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
sram_read.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
sram_read.offset = "inst_idx * 32 + lane_id";
sram_read.reg = "inst_idx";
code += sram_read.generate();
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
code += "}\n";
code += "}\n\n";
code += "void transpose(float *src, float *dst) {\n";
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
} // namespace memb

302
src/pfusion/instantiate.cc Normal file
View File

@ -0,0 +1,302 @@
#include "pfusion/instantiate.h"
#include "pfusion/meta_op.h"
#include "pfusion/micro_kernel/binary.h"
#include "pfusion/micro_kernel/memory.h"
#include "pfusion/micro_kernel/unary.h"
#include "pfusion/micro_op.h"
namespace memb {
size_t getSize(const std::vector<size_t> &shape) {
size_t size = 1;
for (auto x : shape) {
size *= x;
}
return size;
}
size_t min(size_t a, size_t b) { return (a < b) ? a : b; }
std::vector<std::shared_ptr<MetaOp>>
instantiateUnary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = size / 32 / 8;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
metaOp->numReg = 8;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"), buf,
8, 32));
metaOp->microOps.emplace_back(
std::make_shared<UnaryOp>(opType, buf, buf, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf,
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"), 8,
32));
metaOps.emplace_back(metaOp);
return metaOps;
}
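// Launch-shape note: each warp (32 lanes) handles 8 elements per lane per
// iteration, so main_loop_ed = size / 32 / 8 assumes size is a multiple of
// 256; numBlocks = 108 matches the SM count of an A100 (sm_80), the
// architecture pf_base targets in the pfusion CMakeLists.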
std::vector<std::shared_ptr<MetaOp>>
instantiateBinary(const OpType opType,
std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t size = getSize(shape);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = size / 32 / 8;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"),
buf0, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"),
buf1, 8, 32));
metaOp->microOps.emplace_back(
std::make_shared<BinaryOp>(opType, buf0, buf1, buf2, 8, 32));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf2,
Pointer::buildPtr(ptrs[2], "offset_src + inst_idx * 32 + lane_id"), 8,
32));
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
const std::vector<size_t> &shape, const std::vector<size_t> &perm) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
auto metaOp = std::make_shared<MetaOp>();
std::vector<size_t> srcShape(shape.size());
std::vector<size_t> srcMap;
size_t parallelSize = 1;
for (size_t i = 0; i < shape.size(); i++) {
srcShape[perm[i]] = shape[i];
if (i != 0 && perm[i] != 0) {
srcMap.emplace_back(perm[i]);
parallelSize *= shape[i];
}
}
metaOp->mappings.emplace_back(
TensorMapping::buildWithMap("src", srcShape, srcMap));
std::vector<size_t> dstMap;
for (size_t i = 0; i < shape.size(); i++) {
if (i != 0 && perm[i] != 0) {
dstMap.emplace_back(i);
}
}
metaOp->mappings.emplace_back(
TensorMapping::buildWithMap("dst", shape, dstMap));
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = parallelSize;
metaOp->numBlocks = 108;
metaOp->numGroups = 8;
metaOp->numLanes = 32;
int numTileA = (shape[perm[0]] - 1) / 32 + 1;
int numTileB = (shape[0] - 1) / 32 + 1;
// std::cout << numTileA << " " << numTileB << std::endl;
metaOp->numReg = 32;
metaOp->numSmem = 32 * 33;
size_t stride_src = 1, stride_dst = 1;
for (size_t i = 0; i < shape.size(); i++) {
if (perm[i] < perm[0]) {
stride_src *= shape[i];
}
}
for (size_t i = 0; perm[i] != 0; i++) {
stride_dst *= shape[i];
}
// TODO: tiling is a metaOp or microOps?
metaOp->ptrs = ptrs;
auto smem =
Pointer::buildPtr(SRAM, "smem",
"group_id * " + std::to_string(metaOp->numLanes) +
" * " + std::to_string(metaOp->numLanes + 1));
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
for (int i = 0; i < numTileA; i++) {
for (int j = 0; j < numTileB; j++) {
auto src_ptr = Pointer::buildPtr(
ptrs[0], "offset_src + " +
std::to_string(j * 32 * stride_src + i * 32) +
"+" + "inst_idx * " + std::to_string(stride_src) +
" + lane_id");
metaOp->microOps.emplace_back(
MemoryOp::build(READ, src_ptr, buf, min(32u, shape[0]),
min(32, shape[perm[0]])));
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf, Pointer::buildPtr(smem, "inst_idx * 33 + lane_id"),
min(32, shape[0]), min(32, shape[perm[0]])));
metaOp->microOps.emplace_back(MemoryOp::build(
READ, Pointer::buildPtr(smem, "lane_id * 33 + inst_idx"), buf,
min(32, shape[perm[0]]), min(32, shape[0])));
auto dst_ptr = Pointer::buildPtr(
ptrs[1], "offset_dst + " +
std::to_string(i * 32 * stride_dst + j * 32) +
"+" + "inst_idx * " + std::to_string(stride_dst) +
" + lane_id");
metaOp->microOps.emplace_back(
MemoryOp::build(WRITE, buf, dst_ptr, min(32, shape[perm[0]]),
min(32, shape[0])));
}
}
metaOps.emplace_back(metaOp);
return metaOps;
}
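// The shared-memory staging tile is padded to a row stride of 33 floats
// (numSmem = 32 * 33) so that the column read "lane_id * 33 + inst_idx"
// maps each lane to a distinct bank, avoiding bank conflicts on the
// transposed access.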
std::vector<std::shared_ptr<MetaOp>>
instantiateGather(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape,
const std::vector<size_t> &indexShape,
const std::vector<size_t> &outputShape, const size_t axis) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = 1;
for (size_t i = 0; i < outputShape.size() - 1; i++) {
par_size *= inputShape[i];
}
size_t seq_size = inputShape[outputShape.size() - 1];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 2;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateReduce(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis) {
IT_ASSERT(axis == 0);
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = 1;
for (size_t i = 0; i < inputShape.size(); i++) {
if (i != axis) {
par_size *= inputShape[i];
}
}
size_t seq_size = inputShape[axis];
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 1;
metaOp->numLanes = 128;
metaOp->numReg = inputShape[0] / 128;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
std::vector<std::shared_ptr<MetaOp>>
instantiateBroadcast(const OpType opType,
const std::vector<std::shared_ptr<Pointer>> &ptrs,
const std::vector<size_t> &inputShape, const size_t axis,
const size_t num) {
std::vector<std::shared_ptr<MetaOp>> metaOps;
size_t par_size = getSize(inputShape);
size_t seq_size = num;
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = par_size;
metaOp->numBlocks = 108;
metaOp->numGroups = 2;
metaOp->numLanes = 32;
metaOp->numReg = 24;
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("src"), std::vector<size_t>({seq_size, par_size}),
std::vector<size_t>({1})));
metaOp->ptrs = ptrs;
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
metaOps.emplace_back(metaOp);
return metaOps;
}
} // namespace memb

View File

@ -0,0 +1,257 @@
#include "core/graph.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "pfusion/instantiate.h"
#include "pfusion/memory_codegen.h"
#include "pfusion/pointer.h"
#include "pfusion/search_graph.h"
#include <cstdlib>
#include <filesystem>
#include <fstream>
void exportCode(const std::string &filename, const std::string &code) {
// check dir
if (std::filesystem::exists("../generated_code")) {
assert(std::filesystem::is_directory("../generated_code"));
} else {
auto ok = std::filesystem::create_directory("../generated_code");
assert(ok);
}
std::string dir = "../generated_code/" + filename;
std::ofstream fout(dir);
assert(fout.is_open());
fout << code;
fout.close();
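// Assumes clang-format is available on PATH; the return value of
// system() is ignored, so formatting failures are silent.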
system(std::string("clang-format -i " + dir).c_str());
}
void infini::MemoryCodegen::exportGraph(Graph graph, std::string filename) {
std::string code = generateGraph(graph);
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_LN(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_SM(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBert_GELU(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_LN(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_SM(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportViT_GELU(const std::string &filename) {
std::string code = "";
exportCode(filename, code);
}
void infini::MemoryCodegen::exportBias(const std::string &filename,
const std::vector<size_t> &shape) {
std::string code = generateBias(shape);
exportCode(filename, code);
}
void infini::MemoryCodegen::exportTranspose(const std::string &filename,
const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
std::string code = generateTranspose(shape, perm);
exportCode(filename, code);
}
std::vector<size_t> convertShape(const std::vector<int> &_shape) {
std::vector<size_t> shape;
for (int i = int(_shape.size()); i > 0; i--) {
shape.emplace_back(_shape[i - 1]);
}
return shape;
}
size_t convertIndex(const size_t idx, const size_t size) {
// Axes are reversed by convertShape, so axis idx maps to size - idx - 1,
// matching convertPerm below.
return size - idx - 1;
}
std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
std::vector<size_t> perm;
for (int i = int(_perm.size()); i > 0; i--) {
perm.emplace_back(_perm.size() - _perm[i - 1] - 1);
}
return perm;
}
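// Worked example: the ONNX-order shape {2, 3, 4} becomes the codegen-order
// shape {4, 3, 2}; a perm of {0, 2, 1} over three dims becomes {1, 0, 2}
// after both the axis positions and the axis values are reversed.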
void convertTranspose(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
searchGraph->addNode(memb::instantiateTranspose(
memb::TRANSPOSE,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims()),
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
}
void convertUnary(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
searchGraph->addNode(memb::instantiateUnary(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
}
void convertBinary(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
searchGraph->addNode(memb::instantiateBinary(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getOutputs()[0]->getDims())));
}
void convertGather(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
searchGraph->addNode(memb::instantiateGather(
memb::GATHER,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()),
convertShape(op->getInputs()[1]->getDims()),
convertShape(op->getOutputs()[0]->getDims()),
convertIndex(infini::as<infini::GatherObj>(op)->getAxis(),
op->getInputs()[0]->getDims().size())));
}
void convertReduce(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op, memb::OpType opType) {
auto reduceMeanOp = infini::as<infini::ReduceMeanObj>(op);
int axis = -1;
int dimSize = int(reduceMeanOp->getInputs()[0]->getDims().size());
for (int i = 0; i < dimSize; i++) {
if (reduceMeanOp->isReduced(i)) {
IT_ASSERT(axis == -1); // only a single reduced axis is supported
axis = dimSize - i - 1;
}
}
IT_ASSERT(axis != -1);
searchGraph->addNode(memb::instantiateReduce(
opType,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()), axis));
}
void convertBroadcast(std::shared_ptr<memb::SearchGraph> searchGraph,
infini::Operator op) {
auto extendOp = infini::as<infini::ExtendObj>(op);
IT_ASSERT(op->getInputs()[0]->getDims()[extendOp->getDim()] == 1);
searchGraph->addNode(memb::instantiateBroadcast(
memb::BROADCAST,
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
convertShape(op->getInputs()[0]->getDims()), extendOp->getDim(),
extendOp->getNum() + 1));
}
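// Lower every InfiniTensor operator to a memb search-graph node, then
// mirror the operator successor relations as edges between those nodes.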
std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
auto searchGraph = std::make_shared<memb::SearchGraph>();
std::unordered_map<int, int> opMap;
int id = 0;
for (auto op : graph->getOperators()) {
switch (op->getOpType()) {
case infini::OpType::Transpose:
convertTranspose(searchGraph, op);
break;
case infini::OpType::Relu:
convertUnary(searchGraph, op, memb::RELU);
break;
case infini::OpType::Add:
convertBinary(searchGraph, op, memb::ADD);
break;
case infini::OpType::Sub:
convertBinary(searchGraph, op, memb::SUB);
break;
case infini::OpType::Gather:
convertGather(searchGraph, op);
break;
case infini::OpType::ReduceMean:
convertReduce(searchGraph, op, memb::REDUCEMEAN);
break;
case infini::OpType::Extend:
convertBroadcast(searchGraph, op);
break;
default:
std::cout << int(op->getOpType()) << std::endl;
IT_ASSERT(false);
}
IT_ASSERT(opMap.find(op->getGuid()) == opMap.end());
opMap[op->getGuid()] = id;
id++;
}
for (auto op : graph->getOperators()) {
for (auto nextOp : op->getSuccessors()) {
assert(opMap.find(op->getGuid()) != opMap.end());
assert(opMap.find(nextOp->getGuid()) != opMap.end());
searchGraph->addEdge(opMap[op->getGuid()],
opMap[nextOp->getGuid()]);
}
}
return searchGraph;
}
std::string infini::MemoryCodegen::generateGraph(Graph graph) {
auto searchGraph = instantiateGraph(graph);
auto metaGraph = searchGraph->exportFirstMetaGraph();
std::string code = "";
std::cout << "[INFO] before opt." << std::endl;
metaGraph->print();
metaGraph->optimize();
std::cout << "[INFO] after opt." << std::endl;
metaGraph->print();
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}
std::string
infini::MemoryCodegen::generateBias(const std::vector<size_t> &shape) {
auto metaGraph = std::make_shared<memb::MetaGraph>();
metaGraph->addOp(memb::MetaOp::buildBiasOp(shape));
metaGraph->print();
std::string code = "";
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}
std::string
infini::MemoryCodegen::generateTranspose(const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
auto metaGraph = std::make_shared<memb::MetaGraph>();
metaGraph->addOp(memb::MetaOp::buildTransposeOp(shape, perm));
metaGraph->print();
std::string code = "";
code += metaGraph->genHeader();
code += metaGraph->genKernelFuncs();
code += metaGraph->genInvokeFuncs();
return code;
}

54
src/pfusion/meta_graph.cc Normal file
View File

@ -0,0 +1,54 @@
#include "pfusion/meta_graph.h"
namespace memb {
void MetaGraph::print() {
for (auto op : metaOps) {
op->print();
}
}
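// Greedy pairwise fusion: try to merge the current meta-op with the next
// one; a successful merge becomes the new current op, otherwise the
// current op is emitted and scanning resumes from the unmerged op.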
void MetaGraph::optimize() {
std::vector<std::shared_ptr<MetaOp>> ops;
int numOp = metaOps.size();
int cur = 0;
for (int i = 1; i < numOp; i++) {
auto next = MetaOp::merge(metaOps[cur], metaOps[i]);
if (next == nullptr) {
ops.emplace_back(metaOps[cur]);
cur = i;
} else {
cur = metaOps.size();
metaOps.emplace_back(next);
}
}
ops.emplace_back(metaOps[cur]);
metaOps.clear();
for (auto op : ops) {
op->optimize();
metaOps.emplace_back(op);
}
}
std::string MetaGraph::genHeader() {
std::string code = "#include \"cuda_utils.h\"\n";
return code;
}
std::string MetaGraph::genKernelFuncs() {
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genKernelFunc();
}
return code;
}
std::string MetaGraph::genInvokeFuncs() {
std::string code = "";
for (auto metaOp : metaOps) {
code += metaOp->genInvokeFunc();
}
return code;
}
} // namespace memb

303
src/pfusion/meta_op.cc Normal file
View File

@ -0,0 +1,303 @@
#include "pfusion/meta_op.h"
#include "pfusion/micro_kernel/binary.h"
#include "pfusion/micro_kernel/memory.h"
namespace memb {
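// Emits code that peels the flat loop_idx into the mapped dimensions
// (innermost first) and accumulates the tensor's strided element offset.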
std::string TensorMapping::genOffset() {
std::string code = "int " + offset() + " = 0;\n";
std::string bufName = name + "_buf";
code += "int " + bufName + " = loop_idx;\n";
for (size_t i = 0; i < map.size(); i++) {
code += offset() + " += " + bufName + " % " +
std::to_string(shape[map[i]]) + " * " +
std::to_string(stride[map[i]]) + ";\n";
code += bufName + " /= " + std::to_string(shape[map[i]]) + ";\n";
}
return code;
}
void MetaOp::optimize() {
if (microOps.size() == 0)
return;
std::vector<std::shared_ptr<MicroOp>> ops;
int numOp = microOps.size();
int cur = 0;
for (int i = 1; i < numOp; i++) {
auto next = MicroOp::merge(microOps[cur], microOps[i]);
if (next == nullptr) {
ops.emplace_back(microOps[cur]);
cur = i;
} else {
cur = microOps.size();
microOps.emplace_back(next);
}
}
ops.emplace_back(microOps[cur]);
microOps.clear();
std::unordered_set<std::string> ptrSet;
for (auto op : ops) {
for (auto ptr : op->getPtrs()) {
ptrSet.emplace(ptr->getName());
}
if (op->getType() != EMPTY) {
microOps.emplace_back(op);
}
}
std::vector<std::shared_ptr<Pointer>> newPtrs;
for (auto ptr : ptrs) {
if (ptrSet.find(ptr->getName()) != ptrSet.end()) {
newPtrs.emplace_back(ptr);
}
}
ptrs.clear();
for (auto ptr : newPtrs) {
ptrs.emplace_back(ptr);
}
}
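// Kernel skeleton: threads split into numGroups warps of numLanes lanes;
// each (block, warp) pair walks the flattened loop space with a
// grid-stride step of numBlocks * numGroups per iteration.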
std::string MetaOp::genKernelFunc() {
std::string code = "";
code += "// Kernel\n";
code += "__global__ void kernel_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
code += "int lane_id = threadIdx.x % " + std::to_string(numLanes) + ";\n";
code += "int warp_id = threadIdx.x / " + std::to_string(numLanes) + ";\n";
code += "int parallel_idx = blockIdx.x * " + std::to_string(numGroups) +
" + warp_id;\n";
if (numReg != 0) {
code += "float buf[" + std::to_string(numReg) + "];\n";
}
if (numSmem != 0) {
code += "__shared__ float smem[" + std::to_string(numSmem * numGroups) +
"];\n";
}
code += "for (int loop_idx = parallel_idx; loop_idx < " +
std::to_string(main_loop_ed) +
"; loop_idx += " + std::to_string(numBlocks * numGroups) + ") {\n";
// gen offset_src
for (auto mapping : mappings) {
code += mapping->genOffset();
}
for (auto microOp : microOps) {
code += microOp->generate();
}
code += "}\n}\n";
return code;
}
std::string MetaOp::genInvokeFunc() {
std::string code = "";
code += "void invoke_func_" + std::to_string(id) + "(";
IT_ASSERT(ptrs.size() > 0);
code += "float *" + ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", float *" + ptrs[i]->getName();
}
code += ") {\n";
int numThreads = numGroups * numLanes;
code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
IT_ASSERT(ptrs.size() > 0);
code += ptrs[0]->getName();
for (size_t i = 1; i < ptrs.size(); i++) {
code += ", " + ptrs[i]->getName();
}
code += ");\n";
code += "cudaCheckError();\n";
code += "}\n";
return code;
}
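// For illustration: with id == 0, two pointers, numBlocks == 80,
// numGroups == 4, and numLanes == 32, the emitted wrapper reads
//   void invoke_func_0(float *input, float *output) {
//     dim3 gridDim(80, 1);
//     dim3 blockDim(128, 1);
//     kernel_func_0<<<gridDim, blockDim>>>(input, output);
//     cudaCheckError();
//   }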
std::shared_ptr<MetaOp> MetaOp::merge(std::shared_ptr<MetaOp> metaOp0,
std::shared_ptr<MetaOp> metaOp1) {
IT_ASSERT(metaOp0->checkValid());
IT_ASSERT(metaOp1->checkValid());
// Check unmergeable
if (metaOp0->main_loop_st != metaOp1->main_loop_st ||
metaOp0->main_loop_ed != metaOp1->main_loop_ed ||
metaOp0->numBlocks != metaOp1->numBlocks ||
metaOp0->numGroups != metaOp1->numGroups ||
metaOp0->numLanes != metaOp1->numLanes) {
return nullptr;
}
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = metaOp0->main_loop_st;
metaOp->main_loop_ed = metaOp0->main_loop_ed;
metaOp->numBlocks = metaOp0->numBlocks;
metaOp->numGroups = metaOp0->numGroups;
metaOp->numLanes = metaOp0->numLanes;
metaOp->numReg = metaOp0->numReg + metaOp1->numReg;
metaOp->numSmem = metaOp0->numSmem + metaOp1->numSmem;
// Merge ptr
std::unordered_set<size_t> ptrSet;
for (auto ptr : metaOp0->ptrs) {
IT_ASSERT(ptrSet.find(ptr->getHash()) == ptrSet.end());
metaOp->ptrs.emplace_back(ptr);
ptrSet.emplace(ptr->getHash());
}
for (auto ptr : metaOp1->ptrs) {
if (ptrSet.find(ptr->getHash()) == ptrSet.end()) {
metaOp->ptrs.emplace_back(ptr);
ptrSet.emplace(ptr->getHash());
}
}
// Merge mapping
std::unordered_set<size_t> mappingSet;
for (auto mapping : metaOp0->mappings) {
IT_ASSERT(mappingSet.find(mapping->getHash()) == mappingSet.end());
metaOp->mappings.emplace_back(mapping);
mappingSet.emplace(mapping->getHash());
}
for (auto mapping : metaOp1->mappings) {
if (mappingSet.find(mapping->getHash()) == mappingSet.end()) {
metaOp->mappings.emplace_back(mapping);
mappingSet.emplace(mapping->getHash());
}
}
// Merge microOps.
// TODO: make it a graph.
for (auto microOp : metaOp0->microOps) {
metaOp->microOps.emplace_back(microOp);
}
for (auto microOp : metaOp1->microOps) {
metaOp->microOps.emplace_back(microOp);
}
for (auto microOp : metaOp->microOps) {
microOp->print();
}
// TODO: elimiate microOps.
return metaOp;
}
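// Bias over a column-major (shape[0], shape[1]) buffer: each warp covers
// 4 x 32 consecutive elements of a column per iteration and adds the
// column's bias scalar, which is staged in the fifth register (buf[4]).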
std::shared_ptr<MetaOp> MetaOp::buildBiasOp(const std::vector<size_t> &shape) {
IT_ASSERT(shape.size() == 2);
auto metaOp = std::make_shared<MetaOp>();
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = shape[1] * ((shape[0] - 1) / (32 * 4) + 1);
metaOp->numBlocks = 80;
metaOp->numGroups = 4;
metaOp->numLanes = 32;
metaOp->numReg = 5; // buf[0..3] hold data, buf[4] holds the bias scalar
metaOp->numSmem = 0;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>({32 * 4, (shape[0] - 1) / (32 * 4) + 1, shape[1]}),
std::vector<size_t>({1, 32 * 4, shape[0]}),
std::vector<size_t>({1, 2})));
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
std::string("bias"), std::vector<size_t>({shape[0], shape[1]}),
std::vector<size_t>({1})));
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "bias"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf_bias = Pointer::buildPtr(REG, "buf", "4");
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
std::vector<size_t> cond = {shape[0], (shape[0] - 1) / (32 * 4) + 1,
32 * 4};
metaOp->microOps.emplace_back(MemoryOp::build(
READ,
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id"),
buf_input, 4, 32, cond));
metaOp->microOps.emplace_back(MemoryOp::build(
READ, Pointer::buildPtr(ptrs[1], "offset_bias"), buf_bias, 1, 32));
metaOp->microOps.emplace_back(std::make_shared<BinaryOp>(
ADD, buf_input, buf_bias, buf_output, 4, 32));
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf_output,
Pointer::buildPtr(ptrs[2], "offset_input + inst_idx * 32 + lane_id"), 4,
32, cond));
return metaOp;
}
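// Transpose keeps the innermost dimension in place (perm[0] == 0): when
// it fits in 4 x 32 lanes a single warp-group covers it per iteration,
// otherwise it is tiled into 128-element chunks via an extra loop dim.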
std::shared_ptr<MetaOp>
MetaOp::buildTransposeOp(const std::vector<size_t> &shape,
const std::vector<size_t> &perm) {
IT_ASSERT(perm[0] == 0 && shape[0] >= 32);
IT_ASSERT(shape.size() == 3);
auto metaOp = std::make_shared<MetaOp>();
size_t numInst, extraDim;
if (shape[0] <= 4 * 32) {
numInst = (shape[0] - 1) / 32 + 1;
extraDim = 1;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>({shape[0], shape[perm[1]], shape[perm[2]]}),
std::vector<size_t>({1, shape[0], shape[0] * shape[perm[1]]}),
std::vector<size_t>({perm[1], perm[2]})));
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("output"),
std::vector<size_t>({shape[0], shape[1], shape[2]}),
std::vector<size_t>({1, shape[0], shape[0] * shape[1]}),
std::vector<size_t>({1, 2})));
// cond: local_id < shape[0];
} else {
numInst = 4;
extraDim = (shape[0] - 1) / 128 + 1;
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("input"),
std::vector<size_t>(
{128, extraDim, shape[perm[1]], shape[perm[2]]}),
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[perm[1]]}),
std::vector<size_t>({1, perm[1] + 1, perm[2] + 1})));
metaOp->mappings.emplace_back(TensorMapping::build(
std::string("output"),
std::vector<size_t>({128, extraDim, shape[1], shape[2]}),
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[1]}),
std::vector<size_t>({1, 2, 3})));
// cond loop_idx % extraDim * 128 + local_id < shape[0];
}
metaOp->main_loop_st = 0;
metaOp->main_loop_ed = shape[1] * shape[2] * extraDim;
metaOp->numBlocks = 80;
metaOp->numGroups = 4;
metaOp->numLanes = 32;
metaOp->numReg = 4;
metaOp->numSmem = 0;
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
auto &ptrs = metaOp->ptrs;
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
std::vector<size_t> cond = {shape[0], extraDim, 128};
auto inPtr =
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id");
auto opRead = MemoryOp::build(READ, inPtr, buf_input, numInst, 32, cond);
auto outPtr =
Pointer::buildPtr(ptrs[1], "offset_output + inst_idx * 32 + lane_id");
auto opWrite =
MemoryOp::build(WRITE, buf_output, outPtr, numInst, 32, cond);
metaOp->microOps = std::vector<std::shared_ptr<MicroOp>>({opRead, opWrite});
return metaOp;
}
} // namespace memb

33
src/pfusion/micro_kernel/binary.cc Normal file
View File

@ -0,0 +1,33 @@
#include "pfusion/micro_kernel/binary.h"
namespace memb {
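// Emits an unrolled per-register binary op; lanes beyond `width` are
// masked out with a guard so narrow tiles stay within bounds.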
std::string BinaryOp::generate() {
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
std::string opFunc = getDst()->generate() + " = " + getSrc0()->generate();
if (opType == ADD) {
opFunc += " + ";
} else if (opType == SUB) {
opFunc += " - ";
} else {
IT_ASSERT(false);
}
opFunc += getSrc1()->generate() + ";\n";
code += opFunc;
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

99
src/pfusion/micro_kernel/memory.cc Normal file
View File

@ -0,0 +1,99 @@
#include "pfusion/micro_kernel/memory.h"
namespace memb {
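// cond = {total, chunks, chunkWidth}: only the last of `chunks` tiles
// along the guarded dimension is partial, holding total % chunkWidth
// elements, copied as full width-lane instructions (edge_num) plus one
// masked instruction of edge_width lanes.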
std::string MemoryOp::generateWithCond() {
IT_ASSERT(cond.size() == 3);
std::string code = "";
int edge_length = cond[0] % cond[2];
int edge_num = edge_length / width;
int edge_width = edge_length % width;
if (edge_num > 0 || edge_width > 0) {
code += "if (loop_idx % " + std::to_string(cond[1]) +
" == " + std::to_string(cond[1] - 1) + ") {\n";
}
if (edge_num > 0) {
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " +
std::to_string(edge_num) + "; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
}
if (edge_width > 0) {
code += "if (lane_id < " + std::to_string(edge_width) + ") {";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generateWithInstIdx(std::to_string(edge_num)) +
" = " +
getSrc()->generateWithInstIdx(std::to_string(edge_num)) +
";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
}
if (edge_num > 0 || edge_width > 0) {
code += "} else {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (edge_num > 0 || edge_width > 0) {
code += "}\n";
}
code += "// test\n";
return code;
}
std::string MemoryOp::generate() {
if (cond.size() != 0) {
return generateWithCond();
}
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
getDst()->getType() == MemType::REG) ||
(opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
getDst()->getType() != MemType::REG)) {
code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

30
src/pfusion/micro_kernel/reduce.cc Normal file
View File

@ -0,0 +1,30 @@
#include "pfusion/micro_kernel/reduce.h"
namespace memb {
std::string ReduceOp::generate() {
std::string code;
IT_ASSERT(width == 32);
code += getBuf()->generate() + " = 0;\n";
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
std::string opFunc = getBuf()->generate() + " = " + getBuf()->generate();
if (opType == REDUCEMEAN) {
opFunc += " + ";
} else {
IT_ASSERT(false);
}
opFunc += getSrc()->generate() + ";\n";
code += opFunc;
code += "}\n";
if (opType == REDUCEMEAN) {
code += getBuf()->generate() + " = " + getBuf()->generate() +
" / float(" + std::to_string(num * width) + ");\n";
}
return code;
}
} // namespace memb

29
src/pfusion/micro_kernel/unary.cc Normal file
View File

@ -0,0 +1,29 @@
#include "pfusion/micro_kernel/unary.h"
namespace memb {
std::string UnaryOp::generate() {
std::string code;
if (width < 32) {
code += "if (lane_id < " + std::to_string(width) + ") {\n";
}
code += "#pragma unroll\n";
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
"; inst_idx++) {\n";
if (opType == RELU) {
code += getDst()->generate() + " = (" + getSrc()->generate() +
" > 0) ? " + getSrc()->generate() + " : 0;\n";
} else {
IT_ASSERT(false);
}
code += "}\n";
if (width < 32) {
code += "}\n";
}
return code;
}
} // namespace memb

22
src/pfusion/micro_op.cc Normal file
View File

@ -0,0 +1,22 @@
#include "pfusion/micro_op.h"
#include "pfusion/micro_kernel/empty.h"
#include "pfusion/micro_kernel/memory.h"
namespace memb {
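// A WRITE immediately followed by a READ through the same src/dst pair
// round-trips a value that is already live in registers, so the pair
// collapses into an EmptyOp that the caller filters out later.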
std::shared_ptr<MicroOp> MicroOp::merge(std::shared_ptr<MicroOp> op0,
std::shared_ptr<MicroOp> op1) {
if (op0->getType() == WRITE && op1->getType() == READ) {
auto memOp0 = std::dynamic_pointer_cast<MemoryOp>(op0);
auto memOp1 = std::dynamic_pointer_cast<MemoryOp>(op1);
if (memOp0->getDst()->getHash() == memOp1->getSrc()->getHash()) {
if (memOp0->getSrc()->getHash() == memOp1->getDst()->getHash()) {
return std::make_shared<EmptyOp>();
} else {
// TODO: gen reg to reg.
// IT_ASSERT(false);
}
}
}
return nullptr;
}
} // namespace memb

15
src/pfusion/search_graph.cc Normal file
View File

@ -0,0 +1,15 @@
#include "pfusion/search_graph.h"
namespace memb {
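// Export the trivial schedule: take the first candidate meta-op of each
// node and mirror the node edges; no search over alternatives yet.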
std::shared_ptr<MetaGraph> SearchGraph::exportFirstMetaGraph() {
auto metaGraph = std::make_shared<MetaGraph>();
for (auto node : nodes) {
metaGraph->addOp(node.metaOps[0]);
}
for (auto edge : edges) {
metaGraph->addEdge(nodes[edge.first].metaOps[0],
nodes[edge.second].metaOps[0]);
}
return metaGraph;
}
} // namespace memb

View File

@ -0,0 +1,265 @@
#include "core/graph_builder.h"
#include "test.h"
namespace infini {
TEST(GraphBuilder, ops) {
Runtime runtime = CpuRuntimeObj::getInstance();
{ // conv without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto weight =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
auto conv = gf->conv(input, weight, 1, 1);
EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
}
{ // conv with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto weight =
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{1, 2, 4, 4}, DataType::UInt32, runtime);
auto conv = gf->conv(input, weight, output, 1, 1);
}
{ // matmul without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
auto matmul = gf->matmul(A, B);
EXPECT_EQ(matmul->getOutput()->getDims(), (Shape{1, 3, 2}));
}
{ // matmul with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
auto C = make_ref<TensorObj>(Shape{1, 3, 2}, DataType::UInt32, runtime);
auto matmul = gf->matmul(A, B, C);
}
{ // convtrans without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
DataType::UInt32, runtime);
auto convtrans = gf->convTrans(input, weight, 0, 0);
EXPECT_EQ(convtrans->getOutput()->getDims(), (Shape{1, 448, 2, 2}));
}
{ // convtrans with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{1, 448, 2, 2}, DataType::UInt32, runtime);
auto convtrans = gf->convTrans(input, weight, output, 0, 0);
}
{ // pad without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
auto pad = gf->pad(input, pads, std::nullopt);
EXPECT_EQ(pad->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
}
{ // pad with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{3, 84, 164, 172},
DataType::UInt32, runtime);
vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
auto pad = gf->pad(input, output, pads, std::nullopt);
}
{ // slice without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
DataType::UInt32, runtime);
vector<int> starts = {2, 10, 1, 5};
vector<int> ends = {3, 10, 100, 100};
auto slice = gf->slice(input, starts, ends, std::nullopt, std::nullopt);
EXPECT_EQ(slice->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
}
{ // slice with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{2, 1, 100, 96},
DataType::UInt32, runtime);
vector<int> starts = {2, 10, 1, 5};
vector<int> ends = {3, 10, 100, 100};
auto slice =
gf->slice(input, output, starts, ends, std::nullopt, std::nullopt);
}
{ // concat without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto t1 =
make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
auto t2 =
make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
auto concat = gf->concat(TensorVec{t1, t2}, 3);
EXPECT_EQ(concat->getOutput()->getDims(), (Shape{1, 3, 2, 9}));
}
{ // concat with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto t1 =
make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
auto t2 =
make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
auto o0 =
make_ref<TensorObj>(Shape{1, 3, 2, 9}, DataType::Float32, runtime);
auto concat = gf->concat(TensorVec{t1, t2}, o0, 3);
}
{ // split without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
auto split = gf->split(input, 3, 4);
EXPECT_EQ(split->numOutputs(), 4);
EXPECT_EQ(split->getOutputs().size(), (size_t)4);
EXPECT_EQ(split->getOutput(0)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(1)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(2)->getDims(), (Shape{1, 3, 2, 3}));
EXPECT_EQ(split->getOutput(3)->getDims(), (Shape{1, 3, 2, 6}));
}
{ // split with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
auto output0 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output1 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output2 =
make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
auto output3 =
make_ref<TensorObj>(Shape{1, 3, 2, 6}, DataType::Float32, runtime);
auto split = gf->split(
input, TensorVec{output0, output1, output2, output3}, 3, 4);
}
{ // extend without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto extend = gf->extend(input, 2, 1);
EXPECT_EQ(extend->getOutput()->getDims(), (Shape{2, 3, 6, 4}));
}
{ // extend with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 6, 4}, DataType::UInt32, runtime);
auto extend = gf->extend(input, output, 2, 1);
}
{ // maxpool without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
sw = 2;
auto maxpool = gf->maxpool(input, kh, kw, dh, dw, ph, pw, sh, sw);
EXPECT_EQ(maxpool->getOutput()->getDims(), (Shape{1, 64, 80, 80}));
}
{ // maxpool with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{1, 64, 80, 80},
DataType::UInt32, runtime);
const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
sw = 2;
auto maxpool =
gf->maxpool(input, output, kh, kw, dh, dw, ph, pw, sh, sw);
}
{ // add without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input0 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto input1 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto add = gf->add(input0, input1);
EXPECT_EQ(add->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
}
{ // add with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input0 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto input1 =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
auto add = gf->add(input0, input1, output);
}
{ // gather without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto index =
make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
auto gather = gf->gather(input, index, 1);
EXPECT_EQ(gather->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
}
{ // gather with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
auto index =
make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
auto output = make_ref<TensorObj>(Shape{1, 2, 1, 2, 4, 4},
DataType::UInt32, runtime);
auto gather = gf->gather(input, index, output, 1);
}
{ // reshape without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
vector<int> dims = {3, 2, 4, 3};
auto reshape = gf->reshape(input, dims);
EXPECT_EQ(reshape->getOutput()->getDims(), (Shape{3, 2, 4, 3}));
}
{ // reshape with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
vector<int> dims = {3, 2, 4, 3};
auto output =
make_ref<TensorObj>(Shape{3, 2, 4, 3}, DataType::Float32, runtime);
auto reshape = gf->reshape(input, output, dims);
}
{ // flatten without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto flatten = gf->flatten(input);
EXPECT_EQ(flatten->getOutput()->getDims(), (Shape{72}));
}
{ // flatten with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto output =
make_ref<TensorObj>(Shape{72}, DataType::Float32, runtime);
auto flatten = gf->flatten(input, output);
}
{ // identity without output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto identity = gf->identity(input);
EXPECT_EQ(identity->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
}
{ // identity with output
GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
auto input =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto output =
make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
auto identity = gf->identity(input, output);
}
}
} // namespace infini

32
test/pfusion/test_bert.cc Normal file
View File

@ -0,0 +1,32 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, bert_layernorm) {
MemoryCodegen codegen;
codegen.exportBert_LN("bert_layernorm.cu");
}
TEST(Graph, bert_softmax) {
MemoryCodegen codegen;
codegen.exportBert_SM("bert_softmax.cu");
}
TEST(Graph, bert_gelu) {
MemoryCodegen codegen;
codegen.exportBert_GELU("bert_gelu.cu");
}
} // namespace infini

26
test/pfusion/test_bias.cc Normal file
View File

@ -0,0 +1,26 @@
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, bias_0) {
MemoryCodegen codegen;
codegen.exportBias("bias_0.cu", std::vector<size_t>({28 * 28, 24}));
}
TEST(Graph, bias_1) {
MemoryCodegen codegen;
codegen.exportBias("bias_1.cu", std::vector<size_t>({28 * 28, 58}));
}
TEST(Graph, bias_2) {
MemoryCodegen codegen;
codegen.exportBias("bias_2.cu", std::vector<size_t>({14 * 14, 116}));
}
TEST(Graph, bias_3) {
MemoryCodegen codegen;
codegen.exportBias("bias_3.cu", std::vector<size_t>({7 * 7, 232}));
}
} // namespace infini

View File

@ -0,0 +1,38 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, build_and_run) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1024, 32, 32}, DataType::Float32);
Tensor t1 = g->addTensor({1024, 32, 32}, DataType::Float32);
Tensor t2 = g->addTensor({1024, 32, 32}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<AbsObj>(t0, t1);
g->addOpWithOutputs<ReluObj>(t1, t2);
MemoryCodegen codegen;
codegen.exportGraph(g, "test.cu");
}
TEST(Graph, transpose) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({32, 31, 33, 32}, DataType::Float32);
Tensor t1 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
Tensor t2 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<TransposeObj>(t0, t1, Shape{2, 0, 3, 1});
g->addOpWithOutputs<ReluObj>(t1, t2);
MemoryCodegen codegen;
codegen.exportGraph(g, "transpose.cu");
}
} // namespace infini

View File

@ -0,0 +1,29 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, reduce_mean) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 128, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 128, 1}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReduceMeanObj>(t0, t1, Shape({2}));
MemoryCodegen codegen;
codegen.exportGraph(g, "reduce_mean.cu");
}
} // namespace infini

View File

@ -0,0 +1,41 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, SAR_DRN_0) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t2 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
Tensor t3 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReluObj>(t0, t1);
g->addOpWithOutputs<AddObj>(t1, t2, t3);
MemoryCodegen codegen;
codegen.exportGraph(g, "sar_drn_0.cu");
}
TEST(Graph, SAR_DRN_1) {
Runtime runtime = CpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
Tensor t0 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t1 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t2 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
Tensor t3 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
g->dataMalloc();
g->addOpWithOutputs<ReluObj>(t0, t1);
g->addOpWithOutputs<SubObj>(t1, t2, t3);
MemoryCodegen codegen;
codegen.exportGraph(g, "sar_drn_1.cu");
}
} // namespace infini

View File

@ -0,0 +1,21 @@
#include "pfusion/memory_codegen.h"
#include "test.h"
namespace infini {
TEST(Graph, transpose_0) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_0.cu", {28 * 28, 58, 2}, {0, 2, 1});
}
TEST(Graph, transpose_1) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_1.cu", {14 * 14, 116, 2}, {0, 2, 1});
}
TEST(Graph, transpose_2) {
MemoryCodegen codegen;
codegen.exportTranspose("transpose_2.cu", {7 * 7, 232, 2}, {0, 2, 1});
}
} // namespace infini

View File

@ -0,0 +1,4 @@
. /home/spack/spack/share/spack/setup-env.sh
spack load /yb2wg5g # cuda@10.2.89
spack load /3bfwma4 # cudnn@7.6.5.32-10.2
export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-haswell/gcc-7.5.0/gcc-7.5.0-sti65cu3zunc4p4kfylgweim6mqan3mk/bin/gcc