forked from jiuyuan/InfiniTensor

Compare commits: master...power-fusi (20 commits)

Commits:
89398a1c57, 6e3f0bbf9a, 2aadcb6e9c, 3046dd5901, 5ed540be6e, ec58c85505,
18b79903ee, 2c8bd3729b, 254d23b3c0, 05d39439db, 48e986d377, 09365c81f4,
94faefb0ef, 970c77d0f4, 7cf2d8f78f, ff90c4c7d5, 9e45e51279, 1b1fc2585b,
eb993f7829, 63e5df4227
@@ -12,10 +12,11 @@ option(BUILD_TEST "Build tests" ON)
 cmake_dependent_option(BUILD_TEST_CORE "Build tests for core components" ON BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_PET "Build tests for PET" OFF BUILD_TEST OFF)
 cmake_dependent_option(BUILD_TEST_EINNET "Build tests for EINNET" OFF BUILD_TEST OFF)
+cmake_dependent_option(BUILD_TEST_PFUSION "Build tests for PFUSION" ON BUILD_TEST OFF)

 set(DEFAULT_BUILD_TYPE "RelWithDebInfo")

-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_EXTENSIONS OFF) # -std=gnu++11 when on, -std=c++11 when off

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Werror -Wno-error=deprecated-declarations")

@@ -74,7 +75,7 @@ if(BUILD_TEST)
 endif()

 # Source files
-file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/operators/*.cc src/utils/*.cc)
+file(GLOB_RECURSE SRC src/ffi/*.cc src/core/*.cc src/kernels/cpu/*.cc src/nnet/*.cc src/pfusion/*.cc src/operators/*.cc src/utils/*.cc)

 if(USE_CUDA)
   file(GLOB_RECURSE SRC_CUDA src/cuda/*.cc src/cuda/*.cu src/kernels/cuda/*.cc src/kernels/cuda/*.cu)

@@ -114,7 +115,7 @@ if(USE_CUDA)
       ${CMAKE_CXX_COMPILER}
       CACHE STRING "Set cuda host compiler path")
   # CMP0104 requires CUDA_ARCHITECTURES
-  set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70;80")
+  set_target_properties(InfiniTensor PROPERTIES CUDA_ARCHITECTURES "70")
   enable_language(CUDA)
   find_package(CUDAToolkit) # For nvrtc and cuda driver
   target_link_libraries(InfiniTensor cudnn CUDA::curand CUDA::cublas CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)

@@ -196,4 +197,7 @@ if(BUILD_TEST)
   if(BUILD_TEST_EINNET)
     build_test(test/nnet/test_*.cc)
   endif()
+  if(BUILD_TEST_PFUSION)
+    build_test(test/pfusion/test_*.cc)
+  endif()
 endif()
@@ -0,0 +1,89 @@
#pragma once

#include <iostream>

#include <cublas.h>
#include <curand.h>

#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)

inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
    if (err != cudaSuccess) {
        std::cout << "[ERROR] " << file << "::" << line
                  << ": cudaSafeCall() failed. " << cudaGetErrorString(err)
                  << std::endl;
        exit(-1);
    }
    return;
}

inline void __cudaCheckError(const char *file, const int line) {
    auto err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "[ERROR] " << file << "::" << line
                  << ": cudaCheckError() failed. " << cudaGetErrorString(err)
                  << std::endl;
        exit(-1);
    }

#ifdef DEBUG
    // This checking will affect performance.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cout << "[ERROR] " << file << "::" << line
                  << ": cudaCheckError() with sync failed. "
                  << cudaGetErrorString(err) << std::endl;
        exit(-1);
    }
#endif

    return;
}

inline const char *cublasGetErrorString(cublasStatus_t err) {
    switch (err) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:
        return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}

inline void __cublasSafeCall(cublasStatus_t err, const char *file,
                             const int line) {
    if (err != CUBLAS_STATUS_SUCCESS) {
        std::cout << "[ERROR]" << file << "::" << line
                  << ": cublasSafeCall() failed. " << cublasGetErrorString(err)
                  << std::endl;
        exit(-1);
    }
}

inline void __curandSafeCall(curandStatus_t err, const char *file,
                             const int line) {
    if (err != CURAND_STATUS_SUCCESS) {
        std::cout << "[ERROR]" << file << "::" << line
                  << ": curandSafeCall() failed. " << err << std::endl;
        exit(-1);
    }
}
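A minimal usage sketch of these error-checking helpers (editor's illustration, not part of the patch; the kernel and sizes below are assumptions):

#include "cuda_utils.h"

__global__ void scale(float *x, float a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= a;
}

int main() {
    const int n = 1 << 20;
    float *d = nullptr;
    // cudaSafeCall aborts with file/line context if the runtime call fails.
    cudaSafeCall(cudaMalloc((void **)&d, n * sizeof(float)));
    scale<<<(n + 255) / 256, 256>>>(d, 2.0f, n);
    // cudaCheckError catches launch errors; it synchronizes only when DEBUG is defined.
    cudaCheckError();
    cudaSafeCall(cudaFree(d));
    return 0;
}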
@@ -0,0 +1,21 @@
import os


def eval(filename, kernel, shape):
    with open("../eval_pfusion/eval_kernel.tmp", "r") as f:
        code = f.read()
    code = code.replace("%%invoke_func%%", kernel)
    code = code.replace("%%shape%%", shape)
    with open("../generated_code/tmp.cu", "w") as f:
        f.write(code)
    # os.system("make -j && ./test_bias")
    os.system(
        "nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
    os.system("./tmp")


if __name__ == "__main__":
    eval("bias_0.cu", "invoke_func_0", "{28 * 28, 24}")
    eval("bias_1.cu", "invoke_func_1", "{28 * 28, 58}")
    eval("bias_2.cu", "invoke_func_2", "{14 * 14, 116}")
    eval("bias_3.cu", "invoke_func_3", "{7 * 7, 232}")
@@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void %%invoke_func%%(float *tensor_ptr_0, float *tensor_ptr_1,
                     float *tensor_ptr_2);

int main() {
    std::vector<int> shape = %%shape%%;
    float *t0, *t1, *t2;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }

    cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        %%invoke_func%%(t0, t1, t2);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        %%invoke_func%%(t0, t1, t2);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    // double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    // std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
@@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_3,
                   float *tensor_ptr_4);

int main() {
    std::vector<int> shape = {1, 64, 512, 512};
    float *t0, *t1, *t2;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }

    cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(t0, t1, t2);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(t0, t1, t2);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
@@ -0,0 +1,41 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void invoke_func_5(float *tensor_ptr_2, float *tensor_ptr_3,
                   float *tensor_ptr_4);

int main() {
    std::vector<int> shape = {1, 1, 512, 512};
    float *t0, *t1, *t2;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }

    cudaSafeCall(cudaMalloc((void **)&t0, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t1, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&t2, size * sizeof(float)));

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        invoke_func_5(t0, t1, t2);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        invoke_func_5(t0, t1, t2);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
@@ -0,0 +1,8 @@
# rm ./eval_sar_drn_0 ./eval_sar_drn_1 ./eval_sar_drn_2
make -j && ./test_sar_drn
nvcc ../eval_pfusion/eval_sar_drn_0.cu ../generated_code/sar_drn_0.cu -I ../eval_pfusion -o eval_sar_drn_0
nvcc ../eval_pfusion/eval_sar_drn_1.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_1
# nvcc ../eval_pfusion/eval_sar_drn_2.cu ../generated_code/sar_drn_1.cu -I ../eval_pfusion -o eval_sar_drn_2
./eval_sar_drn_0
./eval_sar_drn_1
# ./eval_sar_drn_2
@@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void invoke_func_0(float *src, float *dst);

int main() {
    std::vector<int> shape = {31, 32, 32, 33};
    std::vector<int> perm = {2, 0, 3, 1};
    float *src, *dst;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }
    std::vector<int> stride_src(4), stride_dst(4);
    stride_dst[0] = 1;
    for (int i = 1; i < 4; i++) {
        stride_dst[i] = stride_dst[i - 1] * shape[i - 1];
    }
    size_t this_stride = 1;
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 4; j++) {
            if (perm[j] == i) {
                stride_src[i] = this_stride;
                this_stride *= shape[j];
            }
        }
    }

    cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));

    float *src_host, *dst_host;
    src_host = (float *)malloc(size * sizeof(float));
    dst_host = (float *)malloc(size * sizeof(float));
    for (size_t i = 0; i < size; i++) {
        src_host[i] = i;
    }
    cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
    invoke_func_0(src, dst);
    cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
    bool flag = 0;
    for (size_t i = 0; i < size; i++) {
        size_t base = i;
        size_t offset_src = 0;
        for (int j = 0; j < 4; j++) {
            offset_src += base % shape[j] * stride_src[perm[j]];
            base /= shape[j];
        }
        if (dst_host[i] != src_host[offset_src]) {
            flag = 1;
            std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
            break;
        }
    }

    if (!flag) {
        std::cout << "[INFO] transpose correct." << std::endl;
    } else {
        std::cout << "[ERROR] transpose incorrect." << std::endl;
    }

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        invoke_func_0(src, dst);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        invoke_func_0(src, dst);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
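For concreteness, an editor's trace of the stride computation above for this harness's shape {31, 32, 32, 33} and perm {2, 0, 3, 1} (illustrative only, not part of the patch):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    std::vector<int> shape = {31, 32, 32, 33};
    std::vector<int> perm = {2, 0, 3, 1};
    std::vector<size_t> stride_src(4);
    size_t this_stride = 1;
    // Same loop as in the harness: destination axis i reads the source with
    // stride_src[i]; perm maps destination axes back to source axes.
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            if (perm[j] == i) {
                stride_src[i] = this_stride;
                this_stride *= shape[j];
            }
    // i = 0 matches perm[1] -> stride 1,    then this_stride = 32
    // i = 1 matches perm[3] -> stride 32,   then this_stride = 32 * 33 = 1056
    // i = 2 matches perm[0] -> stride 1056, then this_stride = 1056 * 31 = 32736
    // i = 3 matches perm[2] -> stride 32736 (the same 32736 factor appears in the
    //       generated shared-memory transpose kernel later in this diff)
    assert((stride_src == std::vector<size_t>{1, 32, 1056, 32736}));
    return 0;
}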
@@ -0,0 +1,21 @@
import os


def eval(filename, kernel, shape, perm):
    with open("../eval_pfusion/eval_transpose.tmp", "r") as f:
        code = f.read()
    code = code.replace("%%invoke_func%%", kernel)
    code = code.replace("%%shape%%", shape)
    code = code.replace("%%perm%%", perm)
    with open("../generated_code/tmp.cu", "w") as f:
        f.write(code)
    # os.system("make -j && ./test_bias")
    os.system(
        "nvcc ../generated_code/tmp.cu ../generated_code/" + filename + " -I ../eval_pfusion -o ./tmp")
    os.system("./tmp")


if __name__ == "__main__":
    eval("transpose_0.cu", "invoke_func_0", "{28 * 28, 58, 2}", "{0, 2, 1}")
    eval("transpose_1.cu", "invoke_func_1", "{14 * 14, 116, 2}", "{0, 2, 1}")
    eval("transpose_2.cu", "invoke_func_2", "{7 * 7, 232, 2}", "{0, 2, 1}")
@@ -0,0 +1,4 @@
rm ./eval_transpose
make -j && ./test_transpose
nvcc ../eval_pfusion/eval_transpose.cu ../generated_code/transpose.cu -I ../eval_pfusion -o eval_transpose
./eval_transpose
@@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void %%invoke_func%%(float *src, float *dst);

int main() {
    std::vector<int> shape = %%shape%%;
    std::vector<int> perm = %%perm%%;
    float *src, *dst;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }
    std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
    stride_dst[0] = 1;
    for (int i = 1; i < shape.size(); i++) {
        stride_dst[i] = stride_dst[i - 1] * shape[i - 1];
    }
    size_t this_stride = 1;
    for (int i = 0; i < shape.size(); i++) {
        for (int j = 0; j < shape.size(); j++) {
            if (perm[j] == i) {
                stride_src[i] = this_stride;
                this_stride *= shape[j];
            }
        }
    }

    cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));

    float *src_host, *dst_host;
    src_host = (float *)malloc(size * sizeof(float));
    dst_host = (float *)malloc(size * sizeof(float));
    for (size_t i = 0; i < size; i++) {
        src_host[i] = i;
    }
    cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
    %%invoke_func%%(src, dst);
    cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
    bool flag = 0;
    for (size_t i = 0; i < size; i++) {
        size_t base = i;
        size_t offset_src = 0;
        for (int j = 0; j < shape.size(); j++) {
            offset_src += base % shape[j] * stride_src[perm[j]];
            base /= shape[j];
        }
        if (dst_host[i] != src_host[offset_src]) {
            flag = 1;
            std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
            break;
        }
    }

    if (!flag) {
        std::cout << "[INFO] transpose correct." << std::endl;
    } else {
        std::cout << "[ERROR] transpose incorrect." << std::endl;
    }

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        %%invoke_func%%(src, dst);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        %%invoke_func%%(src, dst);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
@@ -0,0 +1,148 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
                              float *tensor_ptr_4) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 2 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 15627264; loop_idx += 216) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 15627264 * 32;
        offset_src_buf /= 15627264;
    }
}
// Kernel
__global__ void kernel_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                              float *tensor_ptr_6, float *tensor_ptr_7,
                              float *tensor_ptr_8) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[48];
    for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 256 * 256;
        offset_src_buf /= 256;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_6[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_7[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_8[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
// Kernel
__global__ void kernel_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
                              float *tensor_ptr_10) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 2 + warp_id;
    float buf[48];
    for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 128 * 512;
        offset_src_buf /= 128;
    }
}
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                              float *tensor_ptr_12) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 256; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 256 * 256;
        offset_src_buf /= 256;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_10[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
void invoke_func_0(float *tensor_ptr_3, float *tensor_ptr_2,
                   float *tensor_ptr_4) {
    dim3 gridDim(108, 1);
    dim3 blockDim(64, 1);
    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_3, tensor_ptr_2,
                                         tensor_ptr_4);
    cudaCheckError();
}
void invoke_func_6(float *tensor_ptr_4, float *tensor_ptr_5,
                   float *tensor_ptr_6, float *tensor_ptr_7,
                   float *tensor_ptr_8) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_6<<<gridDim, blockDim>>>(
        tensor_ptr_4, tensor_ptr_5, tensor_ptr_6, tensor_ptr_7, tensor_ptr_8);
    cudaCheckError();
}
void invoke_func_7(float *tensor_ptr_8, float *tensor_ptr_9,
                   float *tensor_ptr_10) {
    dim3 gridDim(108, 1);
    dim3 blockDim(64, 1);
    kernel_func_7<<<gridDim, blockDim>>>(tensor_ptr_8, tensor_ptr_9,
                                         tensor_ptr_10);
    cudaCheckError();
}
void invoke_func_5(float *tensor_ptr_10, float *tensor_ptr_11,
                   float *tensor_ptr_12) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_10, tensor_ptr_11,
                                         tensor_ptr_12);
    cudaCheckError();
}
@@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *bias, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[5];
    for (int loop_idx = parallel_idx; loop_idx < 144; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 7 * 128;
        offset_input_buf /= 7;
        offset_input += offset_input_buf % 24 * 784;
        offset_input_buf /= 24;
        int offset_bias = 0;
        int offset_bias_buf = loop_idx;
        offset_bias += offset_bias_buf % 24 * 24;
        offset_bias_buf /= 24;
        // Load 4 x 32 contiguous inputs per warp iteration.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
        }
        // Load the per-channel bias value.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
            buf[4] = bias[0 + offset_bias];
        }
        // Add the bias to each loaded element.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = buf[inst_idx] + buf[4];
        }
        // Write the biased elements back.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
        }
    }
}
void invoke_func_0(float *input, float *bias, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_0<<<gridDim, blockDim>>>(input, bias, output);
    cudaCheckError();
}
@@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *bias, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[5];
    for (int loop_idx = parallel_idx; loop_idx < 348; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 7 * 128;
        offset_input_buf /= 7;
        offset_input += offset_input_buf % 58 * 784;
        offset_input_buf /= 58;
        int offset_bias = 0;
        int offset_bias_buf = loop_idx;
        offset_bias += offset_bias_buf % 58 * 58;
        offset_bias_buf /= 58;
        // Load 4 x 32 contiguous inputs per warp iteration.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
        }
        // Load the per-channel bias value.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
            buf[4] = bias[0 + offset_bias];
        }
        // Add the bias to each loaded element.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = buf[inst_idx] + buf[4];
        }
        // Write the biased elements back.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
        }
    }
}
void invoke_func_1(float *input, float *bias, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_1<<<gridDim, blockDim>>>(input, bias, output);
    cudaCheckError();
}
@@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *bias, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[5];
    for (int loop_idx = parallel_idx; loop_idx < 116; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 2 * 128;
        offset_input_buf /= 2;
        offset_input += offset_input_buf % 116 * 196;
        offset_input_buf /= 116;
        int offset_bias = 0;
        int offset_bias_buf = loop_idx;
        offset_bias += offset_bias_buf % 116 * 116;
        offset_bias_buf /= 116;
        // Load 4 x 32 contiguous inputs per warp iteration.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
        }
        // Load the per-channel bias value.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
            buf[4] = bias[0 + offset_bias];
        }
        // Add the bias to each loaded element.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = buf[inst_idx] + buf[4];
        }
        // Write the biased elements back.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
        }
    }
}
void invoke_func_2(float *input, float *bias, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_2<<<gridDim, blockDim>>>(input, bias, output);
    cudaCheckError();
}
@@ -0,0 +1,42 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_3(float *input, float *bias, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[5];
    for (int loop_idx = parallel_idx; loop_idx < 0; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 1 * 128;
        offset_input_buf /= 1;
        offset_input += offset_input_buf % 232 * 49;
        offset_input_buf /= 232;
        int offset_bias = 0;
        int offset_bias_buf = loop_idx;
        offset_bias += offset_bias_buf % 232 * 232;
        offset_bias_buf /= 232;
        // Load 4 x 32 contiguous inputs per warp iteration.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = input[0 + offset_input + inst_idx * 32 + lane_id];
        }
        // Load the per-channel bias value.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
            buf[4] = bias[0 + offset_bias];
        }
        // Add the bias to each loaded element.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            buf[inst_idx] = buf[inst_idx] + buf[4];
        }
        // Write the biased elements back.
#pragma unroll
        for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
            output[0 + offset_input + inst_idx * 32 + lane_id] = buf[inst_idx];
        }
    }
}
void invoke_func_3(float *input, float *bias, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_3<<<gridDim, blockDim>>>(input, bias, output);
    cudaCheckError();
}
@@ -0,0 +1,20 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 2 + warp_id;
    float buf[24];
    for (int loop_idx = parallel_idx; loop_idx < 128; loop_idx += 216) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 128 * 512;
        offset_src_buf /= 128;
    }
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
    dim3 gridDim(108, 1);
    dim3 blockDim(64, 1);
    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
    cudaCheckError();
}
@@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
                              float *tensor_ptr_5) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[32];
    for (int loop_idx = parallel_idx; loop_idx < 65536; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 65536 * 256;
        offset_src_buf /= 65536;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_2[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_4[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] + buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_5[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
void invoke_func_2(float *tensor_ptr_2, float *tensor_ptr_4,
                   float *tensor_ptr_5) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_2<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_4,
                                         tensor_ptr_5);
    cudaCheckError();
}
@@ -0,0 +1,46 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
                              float *tensor_ptr_12) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[32];
    for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 1024 * 256;
        offset_src_buf /= 1024;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_9[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx] = (buf[inst_idx] > 0) ? buf[inst_idx] : 0;
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 8] =
                tensor_ptr_11[0 + offset_src + inst_idx * 32 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            buf[inst_idx + 16] = buf[inst_idx] - buf[inst_idx + 8];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
            tensor_ptr_12[0 + offset_src + inst_idx * 32 + lane_id] =
                buf[inst_idx + 16];
        }
    }
}
void invoke_func_5(float *tensor_ptr_9, float *tensor_ptr_11,
                   float *tensor_ptr_12) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_5<<<gridDim, blockDim>>>(tensor_ptr_9, tensor_ptr_11,
                                         tensor_ptr_12);
    cudaCheckError();
}
@@ -0,0 +1,24 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func(float *src, float *dst) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 0 + warp_id;
    float buf[0];
    __shared__ float smem[0];
    for (int loop_idx = parallel_idx; loop_idx < 1024; loop_idx += 0) {
        if (lane_id < 0) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
                smem[warp_id * 32 * 32 * 2 + inst_idx * 32 + lane_id] =
                    inst_idx;
            }
        }
    }
}
void invoke_func(float *src, float *dst) {
    dim3 gridDim(0, 1);
    dim3 blockDim(0, 1);
    kernel_func<<<gridDim, blockDim>>>(src, dst);
    cudaCheckError();
}
@@ -0,0 +1,84 @@
#include "cuda.h"
#include "cuda_utils.h"

#include <vector>

void invoke_func_2(float *src, float *dst);

int main() {
    std::vector<int> shape = {7 * 7, 232, 2};
    std::vector<int> perm = {0, 2, 1};
    float *src, *dst;
    size_t size = 1;
    for (auto x : shape) {
        size *= x;
    }
    std::vector<int> stride_src(shape.size()), stride_dst(shape.size());
    stride_dst[0] = 1;
    for (int i = 1; i < shape.size(); i++) {
        stride_dst[i] = stride_dst[i - 1] * shape[i - 1];
    }
    size_t this_stride = 1;
    for (int i = 0; i < shape.size(); i++) {
        for (int j = 0; j < shape.size(); j++) {
            if (perm[j] == i) {
                stride_src[i] = this_stride;
                this_stride *= shape[j];
            }
        }
    }

    cudaSafeCall(cudaMalloc((void **)&src, size * sizeof(float)));
    cudaSafeCall(cudaMalloc((void **)&dst, size * sizeof(float)));

    float *src_host, *dst_host;
    src_host = (float *)malloc(size * sizeof(float));
    dst_host = (float *)malloc(size * sizeof(float));
    for (size_t i = 0; i < size; i++) {
        src_host[i] = i;
    }
    cudaSafeCall(cudaMemcpy(src, src_host, size * sizeof(float), cudaMemcpyHostToDevice));
    invoke_func_2(src, dst);
    cudaSafeCall(cudaMemcpy(dst_host, dst, size * sizeof(float), cudaMemcpyDeviceToHost));
    bool flag = 0;
    for (size_t i = 0; i < size; i++) {
        size_t base = i;
        size_t offset_src = 0;
        for (int j = 0; j < shape.size(); j++) {
            offset_src += base % shape[j] * stride_src[perm[j]];
            base /= shape[j];
        }
        if (dst_host[i] != src_host[offset_src]) {
            flag = 1;
            std::cout << "[ERROR] at " << i << "," << offset_src << ":" << dst_host[i] << "," << src_host[offset_src] << std::endl;
            break;
        }
    }

    if (!flag) {
        std::cout << "[INFO] transpose correct." << std::endl;
    } else {
        std::cout << "[ERROR] transpose incorrect." << std::endl;
    }

    float duration = 0;
    cudaEvent_t st, ed;
    cudaEventCreate(&st);
    cudaEventCreate(&ed);
    int cnt = 128;
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaEventRecord(st, 0);
    for (int t = 0; t < cnt; t++) {
        invoke_func_2(src, dst);
    }
    cudaEventRecord(ed, 0);
    cudaEventSynchronize(st);
    cudaEventSynchronize(ed);
    cudaEventElapsedTime(&duration, st, ed);
    std::cout << "[INFO] time: " << duration / cnt << std::endl;
    double perf = double(size) * 8.0f * cnt / (duration * 1e-3) / 1024.0f / 1024.0f / 1024.0f;
    std::cout << "[INFO] Perf: " << perf << "GB/s" << std::endl;
    std::cout << "[Exit] successful." << std::endl;
}
@@ -0,0 +1,52 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 8 + warp_id;
    float buf[32];
    // One 32 x 33 tile per warp (8 warps); the extra column avoids shared-memory
    // bank conflicts on the transposed reads.
    __shared__ float smem[8448];
    for (int loop_idx = parallel_idx; loop_idx < 1056; loop_idx += 864) {
        int offset_src = 0;
        int offset_src_buf = loop_idx;
        offset_src += offset_src_buf % 32 * 32736;
        offset_src_buf /= 32;
        offset_src += offset_src_buf % 33 * 33;
        offset_src_buf /= 33;
        int offset_dst = 0;
        int offset_dst_buf = loop_idx;
        offset_dst += offset_dst_buf % 32 * 1024;
        offset_dst_buf /= 32;
        offset_dst += offset_dst_buf % 33 * 33792;
        offset_dst_buf /= 33;
#pragma unroll
        for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
            buf[inst_idx] =
                tensor_ptr_2[0 + offset_src + 0 + inst_idx * 1056 + lane_id];
        }
#pragma unroll
        for (int inst_idx = 0; inst_idx < 31; inst_idx++) {
            smem[warp_id * 32 * 33 + inst_idx * 33 + lane_id] = buf[inst_idx];
        }
        if (lane_id < 31) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
                buf[inst_idx] =
                    smem[warp_id * 32 * 33 + lane_id * 33 + inst_idx];
            }
        }
        if (lane_id < 31) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 32; inst_idx++) {
                tensor_ptr_3[0 + offset_dst + 0 + inst_idx * 31 + lane_id] =
                    buf[inst_idx];
            }
        }
    }
}
void invoke_func_0(float *tensor_ptr_2, float *tensor_ptr_3) {
    dim3 gridDim(108, 1);
    dim3 blockDim(256, 1);
    kernel_func_0<<<gridDim, blockDim>>>(tensor_ptr_2, tensor_ptr_3);
    cudaCheckError();
}
@@ -0,0 +1,56 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_0(float *input, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[4];
    for (int loop_idx = parallel_idx; loop_idx < 812; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 7 * 128;
        offset_input_buf /= 7;
        offset_input += offset_input_buf % 58 * 1568;
        offset_input_buf /= 58;
        offset_input += offset_input_buf % 2 * 784;
        offset_input_buf /= 2;
        int offset_output = 0;
        int offset_output_buf = loop_idx;
        offset_output += offset_output_buf % 7 * 128;
        offset_output_buf /= 7;
        offset_output += offset_output_buf % 58 * 784;
        offset_output_buf /= 58;
        offset_output += offset_output_buf % 2 * 45472;
        offset_output_buf /= 2;
        if (loop_idx % 7 == 6) {
            if (lane_id < 16) {
                buf[0] = input[0 + offset_input + 0 * 32 + lane_id];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
                buf[inst_idx] =
                    input[0 + offset_input + inst_idx * 32 + lane_id];
            }
        }
        // test
        if (loop_idx % 7 == 6) {
            if (lane_id < 16) {
                output[0 + offset_output + 0 * 32 + lane_id] = buf[0];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
                output[0 + offset_output + inst_idx * 32 + lane_id] =
                    buf[inst_idx];
            }
        }
        // test
    }
}
void invoke_func_0(float *input, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_0<<<gridDim, blockDim>>>(input, output);
    cudaCheckError();
}
@@ -0,0 +1,66 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_1(float *input, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[4];
    for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 2 * 128;
        offset_input_buf /= 2;
        offset_input += offset_input_buf % 116 * 392;
        offset_input_buf /= 116;
        offset_input += offset_input_buf % 2 * 196;
        offset_input_buf /= 2;
        int offset_output = 0;
        int offset_output_buf = loop_idx;
        offset_output += offset_output_buf % 2 * 128;
        offset_output_buf /= 2;
        offset_output += offset_output_buf % 116 * 196;
        offset_output_buf /= 116;
        offset_output += offset_output_buf % 2 * 22736;
        offset_output_buf /= 2;
        if (loop_idx % 2 == 1) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
                buf[inst_idx] =
                    input[0 + offset_input + inst_idx * 32 + lane_id];
            }
            if (lane_id < 4) {
                buf[2] = input[0 + offset_input + 2 * 32 + lane_id];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
                buf[inst_idx] =
                    input[0 + offset_input + inst_idx * 32 + lane_id];
            }
        }
        // test
        if (loop_idx % 2 == 1) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
                output[0 + offset_output + inst_idx * 32 + lane_id] =
                    buf[inst_idx];
            }
            if (lane_id < 4) {
                output[0 + offset_output + 2 * 32 + lane_id] = buf[2];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
                output[0 + offset_output + inst_idx * 32 + lane_id] =
                    buf[inst_idx];
            }
        }
        // test
    }
}
void invoke_func_1(float *input, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_1<<<gridDim, blockDim>>>(input, output);
    cudaCheckError();
}
@@ -0,0 +1,62 @@
#include "cuda_utils.h"
// Kernel
__global__ void kernel_func_2(float *input, float *output) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * 4 + warp_id;
    float buf[4];
    for (int loop_idx = parallel_idx; loop_idx < 464; loop_idx += 320) {
        int offset_input = 0;
        int offset_input_buf = loop_idx;
        offset_input += offset_input_buf % 232 * 98;
        offset_input_buf /= 232;
        offset_input += offset_input_buf % 2 * 49;
        offset_input_buf /= 2;
        int offset_output = 0;
        int offset_output_buf = loop_idx;
        offset_output += offset_output_buf % 232 * 49;
        offset_output_buf /= 232;
        offset_output += offset_output_buf % 2 * 11368;
        offset_output_buf /= 2;
        if (loop_idx % 1 == 0) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
                buf[inst_idx] =
                    input[0 + offset_input + inst_idx * 32 + lane_id];
            }
            if (lane_id < 17) {
                buf[1] = input[0 + offset_input + 1 * 32 + lane_id];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
                buf[inst_idx] =
                    input[0 + offset_input + inst_idx * 32 + lane_id];
            }
        }
        // test
        if (loop_idx % 1 == 0) {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 1; inst_idx++) {
                output[0 + offset_output + inst_idx * 32 + lane_id] =
                    buf[inst_idx];
            }
            if (lane_id < 17) {
                output[0 + offset_output + 1 * 32 + lane_id] = buf[1];
            }
        } else {
#pragma unroll
            for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
                output[0 + offset_output + inst_idx * 32 + lane_id] =
                    buf[inst_idx];
            }
        }
        // test
    }
}
void invoke_func_2(float *input, float *output) {
    dim3 gridDim(80, 1);
    dim3 blockDim(128, 1);
    kernel_func_2<<<gridDim, blockDim>>>(input, output);
    cudaCheckError();
}
@@ -0,0 +1,169 @@
#pragma once
#include "core/common.h"
#include "core/graph.h"
#include "core/operator.h"
#include "core/tensor.h"
#include "operators/G2BMM.h"
#include "operators/GBMM.h"
#include "operators/concat.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/membound.h"
#include "operators/pad.h"
#include "operators/pooling.h"
#include "operators/reduce_mean.h"
#include "operators/reshape.h"
#include "operators/slice.h"
#include "operators/split.h"
#include "operators/transpose.h"
#include "operators/unary.h"

namespace infini {

class GraphBuilderObj {
  private:
    Graph g;

  public:
    GraphBuilderObj(Runtime runtime) : g(make_ref<GraphObj>(runtime)) {}

    // tensors
    Tensor tensor(Shape dim, const std::string &dtype);

    // operators
    // conv op
    Operator conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                  int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                  Tensor bias = nullptr);
    Operator conv(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
                  int sw = 1, int dh = 1, int dw = 1, Tensor bias = nullptr);
    Operator conv(Tensor input, Tensor weight, Tensor output,
                  ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
                  int dh = 1, int dw = 1, Tensor bias = nullptr);
    Operator conv(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
                  int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                  Tensor bias = nullptr);
    // matmul op
    Operator matmul(Tensor A, Tensor B, Tensor C, bool transA = false,
                    bool transB = false);
    Operator matmul(Tensor A, Tensor B, bool transA = false,
                    bool transB = false);
    // conv trans op
    Operator convTrans(Tensor input, Tensor weight, Tensor output, int ph,
                       int pw, int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                       int oph = 0, int opw = 0, int group = 1,
                       Tensor bias = nullptr, ActType act = ActType::None);
    Operator convTrans(Tensor input, Tensor weight, int ph, int pw, int sh = 1,
                       int sw = 1, int dh = 1, int dw = 1, int oph = 0,
                       int opw = 0, int group = 1, Tensor bias = nullptr,
                       ActType act = ActType::None);
    Operator convTrans(Tensor input, Tensor weight, Tensor output,
                       ConvBaseObj::PaddingMode pm, int sh = 1, int sw = 1,
                       int dh = 1, int dw = 1, int oph = 0, int opw = 0,
                       int group = 1, Tensor bias = nullptr,
                       ActType act = ActType::None);
    Operator convTrans(Tensor input, Tensor weight, ConvBaseObj::PaddingMode pm,
                       int sh = 1, int sw = 1, int dh = 1, int dw = 1,
                       int oph = 0, int opw = 0, int group = 1,
                       Tensor bias = nullptr, ActType act = ActType::None);
    // g2bmm op
    Operator g2bmm(Tensor A, Tensor B, Tensor C, const int width,
                   const int dilation, Tensor bias = nullptr,
                   ActType act = ActType::None);
    Operator g2bmm(Tensor A, Tensor B, const int width, const int dilation,
                   Tensor bias = nullptr, ActType act = ActType::None);
    // gbmm-like op
    Operator gbmml(Tensor A, Tensor B, Tensor C, const int dilation,
                   Tensor bias = nullptr, ActType act = ActType::None);
    Operator gbmml(Tensor A, Tensor B, const int dilation,
                   Tensor bias = nullptr, ActType act = ActType::None);
    // pad op
    Operator pad(Tensor input, Tensor output, const vector<int> &pads,
                 const optional<const vector<int>> &axis);
    Operator pad(Tensor input, const vector<int> &pads,
                 const optional<const vector<int>> &axis);
    // slice op
    Operator slice(Tensor input, Tensor output, const vector<int> &starts,
                   const vector<int> &ends,
                   const optional<const vector<int>> &axis,
                   const optional<const vector<int>> &steps);
    Operator slice(Tensor input, const vector<int> &starts,
                   const vector<int> &ends,
                   const optional<const vector<int>> &axis,
                   const optional<const vector<int>> &steps);
    // concat op
    Operator concat(TensorVec inputs, Tensor output, int dim);
    Operator concat(TensorVec inputs, int dim);
    // split op
    Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
                   int num);
    Operator split(Tensor input, int dim, int num);
    Operator split(Tensor input, std::optional<TensorVec> outputs, int dim,
                   const vector<int> &ratio);
    Operator split(Tensor input, int dim, const vector<int> &ratio);
    // transpose op
    Operator transpose(Tensor input, Tensor output, const Shape &perm);
    // TODO
    // extend op
    Operator extend(Tensor input, Tensor output, int dim, int num);
    Operator extend(Tensor input, int dim, int num);
    // max pool op
    Operator maxpool(Tensor input, Tensor output, int kh, int kw, int dh,
                     int dw, int ph, int pw, int sh, int sw);
    Operator maxpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
                     int pw, int sh, int sw);
    // average pool op
    Operator avgpool(Tensor input, Tensor output, int kh, int kw, int dh,
                     int dw, int ph, int pw, int sh, int sw);
    Operator avgpool(Tensor input, int kh, int kw, int dh, int dw, int ph,
                     int pw, int sh, int sw);
    // element wise op
    Operator add(Tensor input0, Tensor input1, Tensor output);
    Operator add(Tensor input0, Tensor input1);
    Operator sub(Tensor input0, Tensor input1, Tensor output);
    Operator sub(Tensor input0, Tensor input1);
    Operator mul(Tensor input0, Tensor input1, Tensor output);
    Operator mul(Tensor input0, Tensor input1);
    Operator div(Tensor input0, Tensor input1, Tensor output);
    Operator div(Tensor input0, Tensor input1);
    Operator pow(Tensor input0, Tensor input1, Tensor output);
    Operator pow(Tensor input0, Tensor input1);
    // gather op
    Operator gather(Tensor input, Tensor index, Tensor output, int axis);
    Operator gather(Tensor input, Tensor index, int axis);
    // reduce mean op
    // TODO
    // reshape op
    Operator reshape(Tensor input, Tensor output, const Shape &dims);
    Operator reshape(Tensor input, const Shape &dims);
    Operator flatten(Tensor input, Tensor output);
    Operator flatten(Tensor input);
    Operator identity(Tensor input, Tensor output);
    Operator identity(Tensor input);
    // unary op
    // TODO: batch norm
    Operator softmax(Tensor input, Tensor output);
    Operator softmax(Tensor input);
    // TODO: activation
    Operator relu(Tensor input, Tensor output);
    Operator relu(Tensor input);
    Operator sigmoid(Tensor input, Tensor output);
    Operator sigmoid(Tensor input);
    Operator tanh(Tensor input, Tensor output);
    Operator tanh(Tensor input);
    Operator abs(Tensor input, Tensor output);
    Operator abs(Tensor input);
    Operator reduceMean(Tensor input, Tensor Output, int axis);
    Operator erf(Tensor input, Tensor output);
    // resize op
    // TODO
    // membound op
    Operator memBound(const TensorVec &inputs, const TensorVec &outputs,
                      const std::vector<nnet::Tensor> &nnetInputs,
                      nnet::Expr expr, double exec_time, std::string hint = {});
};

} // namespace infini
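A minimal sketch of how this builder might be used, based only on the declarations above (editor's illustration; the header path, shapes, and the "float32" dtype string are assumptions, and a valid Runtime is taken from the caller):

#include "core/graph_builder.h" // header path is an assumption

// Builds relu(a) into c, then c + b, on a caller-provided runtime.
void buildReluAddExample(infini::Runtime runtime) {
    infini::GraphBuilderObj builder(runtime);
    // Shape is a vector of ints; the dtype string is assumed to be accepted.
    infini::Tensor a = builder.tensor({1, 64, 512, 512}, "float32");
    infini::Tensor b = builder.tensor({1, 64, 512, 512}, "float32");
    infini::Tensor c = builder.tensor({1, 64, 512, 512}, "float32");
    builder.relu(a, c); // unary overload with an explicit output tensor
    builder.add(c, b);  // element-wise overload that lets the builder create the output
}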
@@ -38,6 +38,7 @@ enum class OpType {
     Tanh,
     Abs,
     Resize,
+    Erf,
     //
     MemBound = 300,
 };
@@ -10,6 +10,7 @@ class TensorBaseObj;
 class TensorObj;
 class OperatorObj;
 class GraphObj;
+class GraphBuilderObj;
 class RuntimeObj;
 class BlobObj;

@@ -17,6 +18,7 @@ using TensorBase = Ref<TensorBaseObj>;
 using Tensor = Ref<TensorObj>;
 using Operator = Ref<OperatorObj>;
 using Graph = Ref<GraphObj>;
+using GraphBuilder = Ref<GraphBuilderObj>;
 using Runtime = Ref<RuntimeObj>;
 using Blob = Ref<BlobObj>;
 enum class OpType;
@@ -1,5 +1,5 @@
 #pragma once
-#include "core/common.h"
+// #include "core/common.h"
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_profiler_api.h>
@@ -1,10 +1,16 @@
 #pragma once
 #include "core/operator.h"
 namespace infini {
 namespace opTimer {

 double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
                         int padh, int padw, int strideh, int stridew,
-                        int dilationh, int dilationw, int group,
-                        const char *name);
+                        int dilationh, int dilationw, int group);
+
+double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
+                               int padh, int padw, int strideh, int stridew,
+                               int dilationh, int dilationw, int group,
+                               bool bias, string act);

 double getPerfConvTransposed2dCudnn(int n, int c, int h, int w, int f, int r,
                                     int s, int padh, int padw, int strideh,
@@ -25,19 +25,22 @@ class ConvBaseObj : public OperatorObj {
     int h, w;  // input shape (same for conv2d and convTranposed2d)
     int f;     // output/input channel for conv2d/convTransposed2d
     int r, s;  // weight shape
+    ActType act;

   public:
     // Constructors for explicitly setting padding size
     ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output, int ph, int pw,
                 int sh, int sw, int dh, int dw, const Tensor &inputInConvFWD,
-                const Tensor &weightInConvFWD);
+                const Tensor &weightInConvFWD, const ActType act);
     ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
                 PaddingMode mode, int sh, int sw, int dh, int dw,
-                const Tensor &inputInConvFWD, const Tensor &weightInConvFWD);
+                const Tensor &inputInConvFWD, const Tensor &weightInConvFWD,
+                const ActType act);

     std::string toString() const override;
-    int numInputs() const override { return 2; }
+    int numInputs() const override { return inputs.size(); }
     int numOutputs() const override { return 1; }
     bool hasBias() const { return inputs.size() == 3; }

     Tensor getBias() const { return inputs[2]; }
     PaddingMode getPaddingMode() const { return padding; }

@@ -53,6 +56,7 @@ class ConvBaseObj : public OperatorObj {
     auto getPadStrideDilation() const { return tuple(ph, pw, sh, sw, dh, dw); }
     int getChannelPerGroup() const { return inputs[1]->getDims()[1]; }
     virtual int getNumGroups() const = 0;
+    ActType getAct() const { return act; }

   private:
     vector<int> getWorkloadVector() const override;

@@ -65,8 +69,6 @@ class ConvBaseObj : public OperatorObj {
 };

 class ConvObj : public ConvBaseObj {
-  private:
-    ActType act;

   public:
     ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output, int ph,

@@ -79,7 +81,6 @@ class ConvObj : public ConvBaseObj {
             ActType act = ActType::None);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
-    ActType getAct() const { return act; }
     int getNumGroups() const override { return c / getChannelPerGroup(); }

   private:

@@ -90,7 +91,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
   private:
     int oph, opw;
     int group;
-    ActType act;

   public:
     ConvTransposed2dObj(GraphObj *graph, Tensor input, Tensor weight,

@@ -106,7 +106,6 @@ class ConvTransposed2dObj : public ConvBaseObj {
             Tensor bias = nullptr, ActType act = ActType::None);

     optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
-    ActType getAct() const { return act; }
     int getNumGroups() const override { return group; }

   private:
@ -0,0 +1,25 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/operator.h"
|
||||
|
||||
namespace infini {
|
||||
class TransposeObj : public OperatorObj {
|
||||
Shape perm;
|
||||
|
||||
public:
|
||||
TransposeObj(GraphObj *graph, Tensor input, Tensor output,
|
||||
const Shape &dims);
|
||||
|
||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||
|
||||
std::string toString() const override;
|
||||
int numInputs() const override { return 1; }
|
||||
int numOutputs() const override { return 1; }
|
||||
Shape getPerm() const { return perm; }
|
||||
|
||||
private:
|
||||
vector<int> getWorkloadVector() const override;
|
||||
vector<int> getOpAttrVector() const override;
|
||||
};
|
||||
|
||||
} // namespace infini
|
|
@ -28,4 +28,5 @@ DEFINE_UNARY_OBJ(Sigmoid, OpType::Sigmoid)
|
|||
DEFINE_UNARY_OBJ(Tanh, OpType::Tanh)
|
||||
DEFINE_UNARY_OBJ(Softmax, OpType::Softmax)
|
||||
DEFINE_UNARY_OBJ(Abs, OpType::Abs)
|
||||
DEFINE_UNARY_OBJ(Erf, OpType::Erf)
|
||||
}; // namespace infini
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/common.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace memb {
|
||||
|
||||
enum OpType {
|
||||
NONE = 0,
|
||||
EMPTY = 1,
|
||||
READ,
|
||||
WRITE,
|
||||
RELU,
|
||||
ADD,
|
||||
SUB,
|
||||
REDUCEMEAN,
|
||||
REDUCESUM,
|
||||
GATHER,
|
||||
TRANSPOSE,
|
||||
BROADCAST,
|
||||
};
|
||||
|
||||
enum MemType {
|
||||
DRAM = 1,
|
||||
SRAM,
|
||||
REG,
|
||||
};
|
||||
|
||||
inline std::string getName(OpType opType) {
|
||||
switch (opType) {
|
||||
case (OpType::EMPTY):
|
||||
return "EMPTY";
|
||||
case (OpType::READ):
|
||||
return "READ";
|
||||
case (OpType::WRITE):
|
||||
return "WRITE";
|
||||
case (OpType::RELU):
|
||||
return "RELU";
|
||||
case (OpType::ADD):
|
||||
return "ADD";
|
||||
case (OpType::SUB):
|
||||
return "SUB";
|
||||
default:
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
inline std::string getName(MemType memType) {
|
||||
switch (memType) {
|
||||
case (MemType::DRAM):
|
||||
return "DRAM";
|
||||
case (MemType::SRAM):
|
||||
return "SRAM";
|
||||
case (MemType::REG):
|
||||
return "REG";
|
||||
default:
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
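// Combine two hash values: mix with two large prime multipliers and reduce
// modulo 2^31 - 1, so hashes of op/pointer attributes can be chained.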
inline size_t hashAppend(size_t a, size_t b) {
|
||||
return (a * 10000019 + b * 10000079) % 2147483647;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,88 @@
|
|||
#pragma once
|
||||
|
||||
#include <cublas.h>
|
||||
#include <curand.h>
|
||||
#include <iostream>
|
||||
|
||||
#define cudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
|
||||
#define cudaCheckError() __cudaCheckError(__FILE__, __LINE__)
|
||||
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
|
||||
#define curandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
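// Typical usage: wrap every CUDA/cuBLAS/cuRAND call, e.g.
//   cudaSafeCall(cudaMalloc(&ptr, bytes));
// and call cudaCheckError() right after a kernel launch.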
|
||||
|
||||
inline void __cudaSafeCall(cudaError err, const char *file, const int line) {
|
||||
if (err != cudaSuccess) {
|
||||
std::cout << "[ERROR] " << file << "::" << line
|
||||
<< ": cudaSafeCall() failed. " << cudaGetErrorString(err)
|
||||
<< std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
inline void __cudaCheckError(const char *file, const int line) {
|
||||
auto err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
std::cout << "[ERROR] " << file << "::" << line
|
||||
<< ": cudaCheckError() failed. " << cudaGetErrorString(err)
|
||||
<< std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
// This checking will affect performance.
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
std::cout << "[ERROR] " << file << "::" << line
|
||||
<< ": cudaCheckError() with sync failed. "
|
||||
<< cudaGetErrorString(err) << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
inline const char *cublasGetErrorString(cublasStatus_t err) {
|
||||
switch (err) {
|
||||
case CUBLAS_STATUS_SUCCESS:
|
||||
return "CUBLAS_STATUS_SUCCESS";
|
||||
case CUBLAS_STATUS_NOT_INITIALIZED:
|
||||
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
||||
case CUBLAS_STATUS_ALLOC_FAILED:
|
||||
return "CUBLAS_STATUS_ALLOC_FAILED";
|
||||
case CUBLAS_STATUS_INVALID_VALUE:
|
||||
return "CUBLAS_STATUS_INVALID_VALUE";
|
||||
case CUBLAS_STATUS_ARCH_MISMATCH:
|
||||
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
||||
case CUBLAS_STATUS_MAPPING_ERROR:
|
||||
return "CUBLAS_STATUS_MAPPING_ERROR";
|
||||
case CUBLAS_STATUS_EXECUTION_FAILED:
|
||||
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
||||
case CUBLAS_STATUS_INTERNAL_ERROR:
|
||||
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||
case CUBLAS_STATUS_NOT_SUPPORTED:
|
||||
return "CUBLAS_STATUS_NOT_SUPPORTED";
|
||||
case CUBLAS_STATUS_LICENSE_ERROR:
|
||||
return "CUBLAS_STATUS_LICENSE_ERROR";
|
||||
}
|
||||
return "<unknown>";
|
||||
}
|
||||
|
||||
inline void __cublasSafeCall(cublasStatus_t err, const char *file,
|
||||
const int line) {
|
||||
if (err != CUBLAS_STATUS_SUCCESS) {
|
||||
std::cout << "[ERROR]" << file << "::" << line
|
||||
<< ": cublasSafeCall() failed. " << cublasGetErrorString(err)
|
||||
<< std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
inline void __curandSafeCall(curandStatus_t err, const char *file,
|
||||
const int line) {
|
||||
if (err != CURAND_STATUS_SUCCESS) {
|
||||
std::cout << "[ERROR]" << file << "::" << line
|
||||
<< ": curandSafeCall() failed. " << err << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/common.h"
|
||||
#include "pfusion/meta_op.h"
|
||||
|
||||
namespace memb {
|
||||
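// Each instantiate* helper lowers one graph-level operator (unary, binary,
// transpose, gather, reduce, broadcast) into a list of MetaOps for later
// fusion and code generation.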
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateUnary(const OpType opType,
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape);
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateBinary(const OpType opType,
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape);
|
||||
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
|
||||
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape, const std::vector<size_t> &perm);
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateGather(const OpType opType,
|
||||
const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape,
|
||||
const std::vector<size_t> &indexShape,
|
||||
const std::vector<size_t> &outputShape, const size_t axis);
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateReduce(const OpType opType,
|
||||
const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape, const size_t axis);
|
||||
std::vector<std::shared_ptr<MetaOp>> instantiateBroadcast(
|
||||
const OpType opType, const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape, const size_t axis, const size_t num);
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
|
||||
#include "core/runtime.h"
|
||||
|
||||
namespace infini {
|
||||
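// Generates fused-kernel source for memory-bound workloads: either a whole
// InfiniTensor Graph or one of the preset patterns (BERT/ViT LayerNorm,
// Softmax, GELU, bias, transpose), written to the given file.
// Usage sketch (assuming a built Graph g):
//   MemoryCodegen codegen;
//   codegen.exportGraph(g, "fused_kernels.cu");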
class MemoryCodegen {
|
||||
private:
|
||||
std::string generateGraph(Graph graph);
|
||||
std::string generateBias(const std::vector<size_t> &shape);
|
||||
std::string generateTranspose(const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm);
|
||||
|
||||
public:
|
||||
MemoryCodegen() {}
|
||||
~MemoryCodegen() {}
|
||||
void exportGraph(Graph graph, std::string filename);
|
||||
void exportBert_LN(const std::string &filename);
|
||||
void exportBert_SM(const std::string &filename);
|
||||
void exportBert_GELU(const std::string &filename);
|
||||
void exportViT_LN(const std::string &filename);
|
||||
void exportViT_SM(const std::string &filename);
|
||||
void exportViT_GELU(const std::string &filename);
|
||||
void exportBias(const std::string &filename,
|
||||
const std::vector<size_t> &shape);
|
||||
void exportTranspose(const std::string &filename,
|
||||
const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm);
|
||||
};
|
||||
} // namespace infini
|
|
@ -0,0 +1,33 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/meta_op.h"
|
||||
|
||||
namespace memb {
|
||||
class MetaGraph {
|
||||
private:
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
std::vector<std::pair<size_t, size_t>> edges;
|
||||
std::unordered_map<size_t, size_t> metaOpMap;
|
||||
|
||||
public:
|
||||
MetaGraph() {}
|
||||
~MetaGraph() {}
|
||||
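// addOp registers a MetaOp under its unique id; addEdge records a dependency
// between two already-registered MetaOps by their positions in metaOps.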
inline void addOp(std::shared_ptr<MetaOp> op) {
|
||||
IT_ASSERT(metaOpMap.find(op->id) == metaOpMap.end());
|
||||
metaOpMap[op->id] = metaOps.size();
|
||||
metaOps.emplace_back(op);
|
||||
}
|
||||
inline void addEdge(std::shared_ptr<MetaOp> op1,
|
||||
std::shared_ptr<MetaOp> op2) {
|
||||
IT_ASSERT(metaOpMap.find(op1->id) != metaOpMap.end());
|
||||
IT_ASSERT(metaOpMap.find(op2->id) != metaOpMap.end());
|
||||
edges.emplace_back(metaOpMap[op1->id], metaOpMap[op2->id]);
|
||||
}
|
||||
void print();
|
||||
void optimize();
|
||||
std::string genHeader();
|
||||
std::string genKernelFuncs();
|
||||
std::string genInvokeFuncs();
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,111 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/common.h"
|
||||
#include "pfusion/micro_op.h"
|
||||
#include "pfusion/pointer.h"
|
||||
|
||||
namespace memb {
|
||||
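// Describes how loop indices map onto a tensor's memory layout: shape and
// stride give the per-dimension extent and stride, and map presumably selects
// which loop dimensions feed the offset expression emitted by genOffset().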
class TensorMapping {
|
||||
private:
|
||||
std::vector<size_t> shape, stride, map;
|
||||
std::string name;
|
||||
|
||||
public:
|
||||
TensorMapping(const std::string &_name, const std::vector<size_t> &_shape,
|
||||
const std::vector<size_t> &_stride,
|
||||
const std::vector<size_t> &_map) {
|
||||
name = "offset_" + _name;
|
||||
IT_ASSERT(_shape.size() > 0 && _shape.size() < 10);
|
||||
for (auto x : _shape) {
|
||||
shape.emplace_back(x);
|
||||
}
|
||||
IT_ASSERT(_stride.size() > 0 && _stride.size() < 10);
|
||||
for (auto x : _stride) {
|
||||
stride.emplace_back(x);
|
||||
}
|
||||
IT_ASSERT(_map.size() > 0 && _map.size() < 10);
|
||||
for (auto x : _map) {
|
||||
map.emplace_back(x);
|
||||
}
|
||||
}
|
||||
~TensorMapping() {}
|
||||
|
||||
static inline std::shared_ptr<TensorMapping>
|
||||
buildWithMap(const std::string &name, const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &map) {
|
||||
std::vector<size_t> stride(shape.size());
|
||||
stride[0] = 1;
|
||||
for (size_t i = 1; i < stride.size(); i++) {
|
||||
stride[i] = shape[i] * stride[i - 1];
|
||||
}
|
||||
return std::make_shared<TensorMapping>(name, shape, stride, map);
|
||||
}
|
||||
|
||||
static inline std::shared_ptr<TensorMapping>
|
||||
build(const std::string &name, const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &stride, const std::vector<size_t> &map) {
|
||||
return std::make_shared<TensorMapping>(name, shape, stride, map);
|
||||
}
|
||||
|
||||
inline std::string offset() { return name; }
|
||||
inline size_t getHash() {
|
||||
std::hash<size_t> hasher;
|
||||
std::hash<std::string> stringHasher;
|
||||
size_t ret = stringHasher(name);
|
||||
ret = hashAppend(ret, hasher(shape.size()));
|
||||
for (auto x : shape) {
|
||||
ret = hashAppend(ret, hasher(x));
|
||||
}
|
||||
ret = hashAppend(ret, hasher(map.size()));
|
||||
for (auto x : map) {
|
||||
ret = hashAppend(ret, hasher(x));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
std::string genOffset();
|
||||
};
|
||||
|
||||
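// A MetaOp is one fused-kernel candidate: a block-level loop nest
// (main_loop_st..main_loop_ed over numBlocks blocks) that owns its micro-ops,
// pointers and tensor mappings, and can be merged with another MetaOp.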
class MetaOp {
|
||||
public:
|
||||
int id;
|
||||
int main_loop_st, main_loop_ed, numBlocks, numGroups, numReg, numSmem,
|
||||
numLanes;
|
||||
std::vector<std::shared_ptr<MicroOp>> microOps;
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs;
|
||||
std::vector<std::shared_ptr<TensorMapping>> mappings;
|
||||
MetaOp() {
|
||||
static int metaOpId = 0;
|
||||
id = metaOpId++;
|
||||
}
|
||||
~MetaOp() {}
|
||||
|
||||
inline void setLoopSt(int _main_loop_st) { main_loop_st = _main_loop_st; }
|
||||
inline void setLoopEd(int _main_loop_ed) { main_loop_ed = _main_loop_ed; }
|
||||
inline int getLoopSt() { return main_loop_st; }
|
||||
inline int getLoopEd() { return main_loop_ed; }
|
||||
|
||||
void optimize();
|
||||
std::string genKernelFunc();
|
||||
std::string genInvokeFunc();
|
||||
|
||||
static std::shared_ptr<MetaOp> merge(std::shared_ptr<MetaOp> metaOp0,
|
||||
std::shared_ptr<MetaOp> metaOp1);
|
||||
|
||||
inline void print() {
|
||||
std::cout << "MetaOp: " << id << std::endl;
|
||||
for (auto microOp : microOps) {
|
||||
microOp->print();
|
||||
}
|
||||
}
|
||||
bool checkValid() {
|
||||
// TODO: check valid
|
||||
return true;
|
||||
};
|
||||
static std::shared_ptr<MetaOp>
|
||||
buildBiasOp(const std::vector<size_t> &shape);
|
||||
static std::shared_ptr<MetaOp>
|
||||
buildTransposeOp(const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm);
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
class BinaryOp : public MicroOp {
|
||||
private:
|
||||
size_t num, width;
|
||||
|
||||
public:
|
||||
BinaryOp(OpType _opType, std::shared_ptr<Pointer> _pSrc0,
|
||||
std::shared_ptr<Pointer> _pSrc1, std::shared_ptr<Pointer> _pDst,
|
||||
size_t _num, size_t _width)
|
||||
: num(_num), width(_width) {
|
||||
opType = _opType;
|
||||
ptrs = {_pSrc0, _pSrc1, _pDst};
|
||||
}
|
||||
~BinaryOp() {}
|
||||
std::shared_ptr<Pointer> getSrc0() { return ptrs[0]; }
|
||||
std::shared_ptr<Pointer> getSrc1() { return ptrs[1]; }
|
||||
std::shared_ptr<Pointer> getDst() { return ptrs[2]; }
|
||||
// bool checkValid() override;
|
||||
std::string generate() override;
|
||||
inline void print() override {
|
||||
std::cout << id << " " << getName(opType) << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
class EmptyOp : public MicroOp {
|
||||
public:
|
||||
EmptyOp() { opType = EMPTY; }
|
||||
~EmptyOp() {}
|
||||
// bool checkValid() override;
|
||||
std::string generate() override { return ""; };
|
||||
inline void print() override {
|
||||
std::cout << id << " " << getName(opType) << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,55 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
class MemoryOp : public MicroOp {
|
||||
private:
|
||||
size_t num, width;
|
||||
|
||||
public:
|
||||
MemoryOp(const OpType _opType, const std::shared_ptr<Pointer> _src,
|
||||
const std::shared_ptr<Pointer> _dst, const size_t _num,
|
||||
const size_t _width, const std::vector<size_t> &_cond)
|
||||
: num(_num), width(_width) {
|
||||
opType = _opType;
|
||||
ptrs = {_src, _dst};
|
||||
cond = _cond;
|
||||
}
|
||||
// bool checkValid() override;
|
||||
~MemoryOp() {}
|
||||
|
||||
static inline std::shared_ptr<MicroOp>
|
||||
build(const OpType opType, const std::shared_ptr<Pointer> src,
|
||||
const std::shared_ptr<Pointer> dst, const size_t num,
|
||||
const size_t width) {
|
||||
return std::make_shared<MemoryOp>(opType, src, dst, num, width,
|
||||
std::vector<size_t>({}));
|
||||
}
|
||||
|
||||
static inline std::shared_ptr<MicroOp>
|
||||
build(const OpType opType, const std::shared_ptr<Pointer> src,
|
||||
const std::shared_ptr<Pointer> dst, const size_t num,
|
||||
const size_t width, const std::vector<size_t> &cond) {
|
||||
return std::make_shared<MemoryOp>(opType, src, dst, num, width, cond);
|
||||
}
|
||||
|
||||
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
|
||||
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
|
||||
std::string generate() override;
|
||||
std::string generateWithCond();
|
||||
inline void print() override {
|
||||
if (opType == READ) {
|
||||
std::cout << id << " " << getName(opType) << " "
|
||||
<< getName(getSrc()->getType()) << " "
|
||||
<< getSrc()->getHash() << std::endl;
|
||||
} else if (opType == WRITE) {
|
||||
std::cout << id << " " << getName(opType) << " "
|
||||
<< getName(getDst()->getType()) << " "
|
||||
<< getDst()->getHash() << std::endl;
|
||||
} else {
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace memb
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
class ReduceOp : public MicroOp {
|
||||
private:
|
||||
size_t num, width;
|
||||
|
||||
public:
|
||||
ReduceOp(OpType _opType, std::shared_ptr<Pointer> _pSrc,
|
||||
std::shared_ptr<Pointer> _pDst, std::shared_ptr<Pointer> _pBuf,
|
||||
size_t _num, size_t _width)
|
||||
: num(_num), width(_width) {
|
||||
opType = _opType;
|
||||
ptrs = {_pSrc, _pDst, _pBuf};
|
||||
}
|
||||
~ReduceOp() {}
|
||||
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
|
||||
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
|
||||
std::shared_ptr<Pointer> getBuf() { return ptrs[2]; }
|
||||
// bool checkValid() override;
|
||||
std::string generate() override;
|
||||
inline void print() override {
|
||||
std::cout << id << " " << getName(opType) << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,29 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
class UnaryOp : public MicroOp {
|
||||
private:
|
||||
const int num, width;
|
||||
|
||||
public:
|
||||
UnaryOp(OpType _opType, std::shared_ptr<Pointer> _src,
|
||||
std::shared_ptr<Pointer> _dst, int _num, int _width)
|
||||
: num(_num), width(_width) {
|
||||
opType = _opType;
|
||||
ptrs = {_src, _dst};
|
||||
}
|
||||
~UnaryOp() {}
|
||||
|
||||
std::shared_ptr<Pointer> getSrc() { return ptrs[0]; }
|
||||
std::shared_ptr<Pointer> getDst() { return ptrs[1]; }
|
||||
|
||||
// bool checkValid() override;
|
||||
std::string generate() override;
|
||||
inline void print() override {
|
||||
std::cout << id << " " << getName(opType) << std::endl;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,43 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/common.h"
|
||||
#include "pfusion/pointer.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
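// Base class of the smallest codegen units (memory moves, unary/binary
// compute, reductions); each concrete MicroOp emits its own code snippet via
// generate().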
class MicroOp {
|
||||
protected:
|
||||
size_t id;
|
||||
OpType opType;
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs;
|
||||
std::vector<size_t> cond;
|
||||
|
||||
public:
|
||||
MicroOp() : opType(NONE), cond(0) {
|
||||
static int microOpId = 0;
|
||||
id = microOpId++;
|
||||
}
|
||||
virtual ~MicroOp() {}
|
||||
|
||||
inline OpType getType() { return opType; }
|
||||
inline bool isMemoryOp() { return opType == READ || opType == WRITE; }
|
||||
inline std::vector<std::shared_ptr<Pointer>> getPtrs() { return ptrs; }
|
||||
|
||||
// virtual bool checkValid() = 0;
|
||||
virtual std::string generate() = 0;
|
||||
virtual void print() = 0;
|
||||
static std::shared_ptr<MicroOp> merge(std::shared_ptr<MicroOp> op0,
|
||||
std::shared_ptr<MicroOp> op1);
|
||||
};
|
||||
|
||||
class MicroGraph {
|
||||
public:
|
||||
MicroGraph() {}
|
||||
~MicroGraph() {}
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<MicroOp>> microOps;
|
||||
std::vector<std::pair<int, int>> edges;
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/common.h"
|
||||
|
||||
namespace memb {
|
||||
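// A named pointer plus a textual offset expression used during code
// generation; generate() renders "name[offset]", e.g.
// buildPtr(MemType::SRAM, "smem", "group_id * 32") -> "smem[group_id * 32]".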
class Pointer {
|
||||
private:
|
||||
const MemType memType;
|
||||
const std::string name, offset;
|
||||
|
||||
public:
|
||||
Pointer(MemType _memType, std::string _name, std::string _offset)
|
||||
: memType(_memType), name(_name), offset(_offset) {}
|
||||
~Pointer() {}
|
||||
|
||||
static inline std::shared_ptr<Pointer> buildPtr(MemType memType,
|
||||
std::string name) {
|
||||
return std::make_shared<Pointer>(memType, name, "0");
|
||||
}
|
||||
static inline std::shared_ptr<Pointer>
|
||||
buildPtr(MemType memType, std::string name, std::string offset) {
|
||||
return std::make_shared<Pointer>(memType, name, offset);
|
||||
}
|
||||
static inline std::shared_ptr<Pointer>
|
||||
buildPtr(std::shared_ptr<Pointer> ptr) {
|
||||
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
|
||||
ptr->getOffset());
|
||||
}
|
||||
static inline std::shared_ptr<Pointer>
|
||||
buildPtr(std::shared_ptr<Pointer> ptr, std::string offset) {
|
||||
return std::make_shared<Pointer>(ptr->getType(), ptr->getName(),
|
||||
ptr->getOffset() + " + " + offset);
|
||||
}
|
||||
static inline std::shared_ptr<Pointer> buildPtrByTensorGuid(size_t guid) {
|
||||
return std::make_shared<Pointer>(
|
||||
MemType::DRAM, "tensor_ptr_" + std::to_string(guid), "0");
|
||||
}
|
||||
|
||||
inline const MemType getType() { return memType; }
|
||||
inline const std::string getName() { return name; }
|
||||
inline const std::string getOffset() { return offset; }
|
||||
inline const std::string generate() { return name + "[" + offset + "]"; }
|
||||
inline const std::string generateWithInstIdx(std::string idx) {
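// Replace every occurrence of the placeholder "inst_idx" (lengthA = 8 chars)
// in the rendered expression with the concrete index string idx.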
|
||||
std::string code = generate();
|
||||
size_t pos = 0, lengthA = 8, lengthB = idx.size();
|
||||
while ((pos = code.find("inst_idx", pos)) != std::string::npos) {
|
||||
code.replace(pos, lengthA, idx);
|
||||
pos += lengthB;
|
||||
}
|
||||
std::cout << "[INFO] " << idx << " " << lengthB << " " << code
|
||||
<< std::endl;
|
||||
return code;
|
||||
}
|
||||
inline bool equal(std::shared_ptr<Pointer> ptr) {
|
||||
if (name == ptr->getName() && offset == ptr->getOffset()) {
|
||||
IT_ASSERT(memType == ptr->getType());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
inline const size_t getHash() {
|
||||
std::hash<MemType> memTypeHash;
|
||||
std::hash<std::string> stringHash;
|
||||
size_t ret = memTypeHash(memType);
|
||||
ret = hashAppend(ret, stringHash(name));
|
||||
ret = hashAppend(ret, stringHash(offset));
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include "pfusion/meta_graph.h"
|
||||
#include "pfusion/meta_op.h"
|
||||
|
||||
namespace memb {
|
||||
class SearchGraph {
|
||||
private:
|
||||
class Node {
|
||||
public:
|
||||
int id;
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
std::vector<int> pred;
|
||||
std::vector<int> succ;
|
||||
};
|
||||
// each node is a vector of metaOps.
|
||||
std::vector<Node> nodes;
|
||||
std::vector<std::pair<int, int>> edges;
|
||||
|
||||
public:
|
||||
SearchGraph() {}
|
||||
~SearchGraph() {}
|
||||
inline void addNode(std::vector<std::shared_ptr<MetaOp>> metaOps) {
|
||||
Node node;
|
||||
node.id = nodes.size();
|
||||
for (auto metaOp : metaOps) {
|
||||
node.metaOps.emplace_back(metaOp);
|
||||
}
|
||||
nodes.emplace_back(node);
|
||||
}
|
||||
inline void addEdge(int i, int j) {
|
||||
edges.emplace_back(i, j);
|
||||
nodes[i].succ.emplace_back(j);
|
||||
nodes[j].pred.emplace_back(i);
|
||||
}
|
||||
std::shared_ptr<MetaGraph> exportFirstMetaGraph();
|
||||
};
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1 @@
|
|||
from .import_onnx import import_onnx
|
|
@ -0,0 +1,483 @@
|
|||
import functools
|
||||
|
||||
import numpy as np
|
||||
|
||||
import onnx
|
||||
import onnx.checker
|
||||
import onnx.numpy_helper
|
||||
import onnx.shape_inference
|
||||
from rules import conv_transposed2d_rules, conv_rules, print_result
|
||||
|
||||
|
||||
def _add_value_info_for_constants(model: onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def _parse_attribute(attributes, defaults=dict()):
|
||||
atts = defaults
|
||||
for att in attributes:
|
||||
if att.type == onnx.AttributeProto.INT:
|
||||
atts[att.name] = att.i
|
||||
elif att.type == onnx.AttributeProto.INTS:
|
||||
atts[att.name] = att.ints
|
||||
elif att.type == onnx.AttributeProto.FLOAT:
|
||||
atts[att.name] = att.f
|
||||
elif att.type == onnx.AttributeProto.STRING:
|
||||
atts[att.name] = att.s
|
||||
elif att.type == onnx.AttributeProto.TENSOR:
|
||||
atts[att.name] = att.t
|
||||
else:
|
||||
assert False, "Unsupported Attribute Type: {}".format(att.type)
|
||||
return atts
|
||||
|
||||
|
||||
def _onnx_datatype_tostring(dtype):
|
||||
if dtype == 0:
|
||||
return 'UNDEFINED'
|
||||
elif dtype == 1:
|
||||
return 'FLOAT'
|
||||
elif dtype == 2:
|
||||
return 'UINT8'
|
||||
elif dtype == 3:
|
||||
return 'INT8'
|
||||
elif dtype == 4:
|
||||
return 'UINT16'
|
||||
elif dtype == 5:
|
||||
return 'INT16'
|
||||
elif dtype == 6:
|
||||
return 'INT32'
|
||||
elif dtype == 7:
|
||||
return 'INT64'
|
||||
elif dtype == 8:
|
||||
return 'STRING'
|
||||
elif dtype == 9:
|
||||
return 'BOOL'
|
||||
elif dtype == 10:
|
||||
return 'FLOAT16'
|
||||
elif dtype == 11:
|
||||
return 'DOUBLE'
|
||||
elif dtype == 12:
|
||||
return 'UINT32'
|
||||
elif dtype == 13:
|
||||
return 'UINT64'
|
||||
elif dtype == 14:
|
||||
return 'COMPLEX64'
|
||||
elif dtype == 15:
|
||||
return 'COMPLEX128'
|
||||
elif dtype == 16:
|
||||
return 'BFLOAT16'
|
||||
else:
|
||||
assert False, 'Unknown onnx datatype'
|
||||
|
||||
|
||||
def import_onnx(model_path: str, bs: int):
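# Walk the ONNX graph to record every tensor's shape in `ds`, then feed each
# Conv / ConvTranspose node (with batch size scaled by `bs`) through the
# registered conv_rules / conv_transposed2d_rules.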
|
||||
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
|
||||
model = onnx.load(model_path)
|
||||
|
||||
# Tensor_input
|
||||
for input in model.graph.input:
|
||||
if input.name not in ts:
|
||||
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
|
||||
# ts[input.name] = g.tensor(dims, _onnx_datatype_tostring(input.type.tensor_type.elem_type))
|
||||
ds[input.name] = dims
|
||||
|
||||
# Tensor_weight
|
||||
for weight in model.graph.initializer:
|
||||
if weight.name not in ts:
|
||||
# ts[weight.name] = g.tensor(weight.dims, _onnx_datatype_tostring(weight.data_type))
|
||||
ds[weight.name] = weight.dims
|
||||
|
||||
# Tensor_inference
|
||||
_add_value_info_for_constants(model)
|
||||
infered_model = onnx.shape_inference.infer_shapes(model)
|
||||
for v in infered_model.graph.value_info:
|
||||
if v.name not in ts:
|
||||
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
|
||||
# ts[v.name] = g.tensor(dims, _onnx_datatype_tostring(v.type.tensor_type.elem_type))
|
||||
ds[v.name] = dims
|
||||
|
||||
# Tensor_output
|
||||
for output in model.graph.output:
|
||||
if output.name not in ts:
|
||||
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
|
||||
# ts[output.name] = g.tensor(dims, _onnx_datatype_tostring(output.type.tensor_type.elem_type))
|
||||
ds[output.name] = dims
|
||||
|
||||
# Op
|
||||
for node in model.graph.node:
|
||||
# if node.op_type == 'Add':
|
||||
# assert len(node.output) == 1
|
||||
# g.add([ts[item] for item in node.input], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Cast':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# # Ignore for now (TODO)
|
||||
# g.identity(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
if node.op_type == 'Conv':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1]})
|
||||
assert len(node.input) == 2 or len(node.input) == 3
|
||||
assert len(node.output) == 1
|
||||
assert attrs["auto_pad"] == "NOTSET"
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert len(attrs["dilations"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
assert ds[node.input[0]][1] % ds[node.input[1]][1] == 0
|
||||
n, c, h, w = ds[node.input[0]][0], ds[node.input[0]][1], ds[node.input[0]][2], ds[node.input[0]][3]
f, r, s = ds[node.input[1]][0], ds[node.input[1]][2], ds[node.input[1]][3]
ph, pw = attrs["pads"][0], attrs["pads"][1]
sh, sw = attrs["strides"][0], attrs["strides"][1]
dh, dw = attrs["dilations"][0], attrs["dilations"][1]
|
||||
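# groups = input channels / weight channels-per-group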
group = ds[node.input[0]][1] // ds[node.input[1]][1]
|
||||
# t = getPerfConv(n, c, h, w, f, r, s, ph, pw,
|
||||
# sh, sw, dh, dw, group, "")
|
||||
# print(node.name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group, f'{t:.3f}')
|
||||
n = n*bs
|
||||
for rule in conv_rules:
|
||||
rule(node.name, n, c, h, w, f, r, s,
|
||||
ph, pw, sh, sw, dh, dw, group)
|
||||
|
||||
elif node.op_type == 'ConvTranspose':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1],
|
||||
"group": 1})
|
||||
assert len(node.input) == 2 or len(node.input) == 3
|
||||
assert len(node.output) == 1
|
||||
assert attrs["auto_pad"] == "NOTSET"
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert len(attrs["dilations"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
n, f, h, w = ds[node.input[0]]
|
||||
_, c, r, s = ds[node.input[1]]
|
||||
ph, pw = attrs["pads"][0], attrs["pads"][1]
sh, sw = attrs["strides"][0], attrs["strides"][1]
dh, dw = attrs["dilations"][0], attrs["dilations"][1]
|
||||
oph, opw = 0, 0
|
||||
if "output_padding" in attrs:
|
||||
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
|
||||
assert attrs["output_padding"][0] == attrs["output_padding"][1]
|
||||
group = attrs["group"]
|
||||
n = n*bs
|
||||
for rule in conv_transposed2d_rules:
|
||||
rule(node.name, n, c, h, w, f, r, s, ph,
|
||||
pw, sh, sw, dh, dw, oph, opw, group)
|
||||
|
||||
elif node.op_type == 'MatMul':
|
||||
print(f'{node.name} skipped')
|
||||
continue
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
dimA = list(ds[node.input[0]])
|
||||
dimB = list(ds[node.input[1]])
|
||||
dimO = list(ds[node.output[0]])
|
||||
|
||||
# if len(dimA) == 2 and len(dimB) == 2:
|
||||
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
|
||||
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
|
||||
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
|
||||
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
|
||||
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
|
||||
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
|
||||
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
|
||||
|
||||
# else:
|
||||
# assert len(dimO) >= 3
|
||||
# batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
|
||||
|
||||
# if len(dimA) == 3:
|
||||
# tmpI0 = ts[node.input[0]]
|
||||
# else:
|
||||
# tmpI0 = g.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
|
||||
# g.reshape(ts[node.input[0]], tmpI0)
|
||||
|
||||
# if len(dimB) == 3:
|
||||
# tmpI1 = ts[node.input[1]]
|
||||
# else:
|
||||
# tmpI1 = g.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
|
||||
# g.reshape(ts[node.input[1]], tmpI1)
|
||||
|
||||
# if len(dimO) == 3:
|
||||
# tmpO = ts[node.output[0]]
|
||||
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
|
||||
# else:
|
||||
# tmpO = g.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
|
||||
# g.matmul(tmpI0, tmpI1, tmpO, False, False, None)
|
||||
# g.reshape(tmpO, ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Concat':
|
||||
# assert len(node.output) == 1
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# g.concat([ts[item] for item in node.input], ts[node.output[0]], attrs["axis"])
|
||||
|
||||
# elif node.op_type == 'Constant':
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.output) == 1
|
||||
# c = onnx.numpy_helper.to_array(attrs["value"])
|
||||
# if c.ndim == 0:
|
||||
# c = c[()]
|
||||
# consts[node.output[0]] = c
|
||||
|
||||
# elif node.op_type == 'Flatten':
|
||||
# attrs = _parse_attribute(node.attribute, {"axis": 1})
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# g.flatten(ts[node.input[0]], ts[node.output[0]], attrs["axis"])
|
||||
|
||||
# elif node.op_type == 'Gather':
|
||||
# attrs = _parse_attribute(node.attribute, {"axis": 0})
|
||||
# assert len(node.input) == 2
|
||||
# assert len(node.output) == 1
|
||||
# g.gather(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]], attrs["axis"])
|
||||
|
||||
# elif node.op_type == 'Gemm':
|
||||
# attrs = _parse_attribute(node.attribute, {
|
||||
# "alpha": 1.0,
|
||||
# "beta": 1.0,
|
||||
# "transA": 0,
|
||||
# "transB": 0})
|
||||
# assert len(node.input) == 2 or len(node.input) == 3
|
||||
# assert len(node.output) == 1
|
||||
# assert attrs["alpha"] == 1.0
|
||||
# assert attrs["beta"] == 1.0 or len(node.input) == 2
|
||||
# tmpI0 = g.tensor([1] + list(ds[node.input[0]]), "FLOAT")
|
||||
# tmpI1 = g.tensor([1] + list(ds[node.input[1]]), "FLOAT")
|
||||
# tmpO = g.tensor([1] + list(ds[node.output[0]]), "FLOAT")
|
||||
# g.transpose(ts[node.input[0]], tmpI0, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
|
||||
# g.transpose(ts[node.input[1]], tmpI1, 0, Perm([PermItem(-1), PermItem(0), PermItem(1)]), 1)
|
||||
# g.matmul(tmpI0, tmpI1, tmpO,
|
||||
# attrs["transA"], attrs["transB"],
|
||||
# None if len(node.input) == 2 else ts[node.input[2]])
|
||||
# g.transpose(tmpO, ts[node.output[0]], -1, Perm([PermItem([0, 1]), PermItem(2)]), 0)
|
||||
|
||||
# elif node.op_type == 'Mul':
|
||||
# assert len(node.output) == 1
|
||||
# g.mul([ts[x] for x in node.input], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'GlobalAveragePool':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# dims = ds[node.input[0]]
|
||||
# if len(dims) > 0:
|
||||
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
|
||||
# else:
|
||||
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'MaxPool':
|
||||
# attrs = _parse_attribute(node.attribute, {
|
||||
# "auto_pad": "NOTSET",
|
||||
# "dilations": [1, 1],
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1]})
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# assert len(attrs["kernel_shape"]) == 2
|
||||
# assert len(attrs["pads"]) == 4
|
||||
# assert len(attrs["strides"]) == 2
|
||||
# assert len(attrs["dilations"]) == 2
|
||||
# assert attrs["pads"][0] == attrs["pads"][2]
|
||||
# assert attrs["pads"][1] == attrs["pads"][3]
|
||||
# g.maxpool(ts[node.input[0]], ts[node.output[0]],
|
||||
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
|
||||
# attrs["dilations"][0], attrs["dilations"][1],
|
||||
# attrs["pads"][0], attrs["pads"][1],
|
||||
# attrs["strides"][0], attrs["strides"][1])
|
||||
|
||||
# elif node.op_type == 'AveragePool':
|
||||
# attrs = _parse_attribute(node.attribute, {
|
||||
# "auto_pad": "NOTSET",
|
||||
# "count_include_pad": 0,
|
||||
# "pads": [0, 0, 0, 0],
|
||||
# "strides": [1, 1]})
|
||||
# # No dilation in ONNX
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# assert attrs["count_include_pad"] == 0 # To be consistent with operator.cc
|
||||
# assert len(attrs["kernel_shape"]) == 2
|
||||
# assert len(attrs["pads"]) == 4
|
||||
# assert len(attrs["strides"]) == 2
|
||||
# assert attrs["pads"][0] == attrs["pads"][2]
|
||||
# assert attrs["pads"][1] == attrs["pads"][3]
|
||||
# g.avgpool(ts[node.input[0]], ts[node.output[0]],
|
||||
# attrs["kernel_shape"][0], attrs["kernel_shape"][1],
|
||||
# attrs["pads"][0], attrs["pads"][1],
|
||||
# attrs["strides"][0], attrs["strides"][1])
|
||||
|
||||
# elif node.op_type == 'Pad':
|
||||
# attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
|
||||
# assert attrs["mode"].decode("ascii") == "constant"
|
||||
# assert len(attrs["pads"]) % 2 == 0
|
||||
# assert attrs["value"] == 0
|
||||
# nDim = len(attrs["pads"]) // 2
|
||||
# begin = attrs["pads"][:nDim]
|
||||
# end = attrs["pads"][nDim:]
|
||||
# g.pad(ts[node.input[0]], ts[node.output[0]], begin, end)
|
||||
|
||||
# elif node.op_type == 'ReduceMean':
|
||||
# attrs = _parse_attribute(node.attribute, {'keepdims': 1})
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# assert len(attrs["axes"]) == 1
|
||||
# axis = attrs["axes"][0]
|
||||
# if axis < 0:
|
||||
# axis = len(ds[node.input[0]]) - axis
|
||||
# g.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
|
||||
|
||||
# elif node.op_type == 'Softmax':
|
||||
# attrs = _parse_attribute(node.attribute)
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# axis = attrs["axis"]
|
||||
# if axis < 0:
|
||||
# axis = len(ds[node.input[0]]) - axis
|
||||
# g.softmax(ts[node.input[0]], ts[node.output[0]], axis)
|
||||
|
||||
# elif node.op_type == 'Reshape':
|
||||
# assert len(node.input) == 2
|
||||
# assert len(node.output) == 1
|
||||
# g.reshape(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Relu':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# g.relu(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Tanh':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# g.tanh(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Sigmoid':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# g.sigmoid(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Shape':
|
||||
# # Ignore for now, and no need to output anything (TODO)
|
||||
# pass
|
||||
|
||||
# elif node.op_type == 'Sub':
|
||||
# assert len(node.input) == 2
|
||||
# assert len(node.output) == 1
|
||||
# g.sub(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == 'Transpose':
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# assert "perm" in attrs
|
||||
# g.transpose(ts[node.input[0]], ts[node.output[0]], -1,
|
||||
# Perm([PermItem(x) for x in attrs["perm"]]), 0)
|
||||
|
||||
# elif node.op_type == 'Unsqueeze':
|
||||
# assert len(node.input) == 2
|
||||
# assert len(node.output) == 1
|
||||
# g.reshape(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# elif node.op_type == "BatchNormalization":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 5
|
||||
# assert len(node.output) == 1
|
||||
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
|
||||
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
|
||||
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
|
||||
# ts[node.input[2]], ts[node.input[3]],
|
||||
# ts[node.input[4]], ts[node.output[0]],
|
||||
# epsilon, momentum)
|
||||
|
||||
# elif node.op_type == "Split":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) > 1
|
||||
# axis = attrs['axis']
|
||||
# split = attrs['split']
|
||||
# g.split(ts[node.input[0]], [ts[t] for t in node.output], axis, split)
|
||||
|
||||
# elif node.op_type == "Slice":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 4
|
||||
# assert len(node.output) == 1
|
||||
# g.slice(ts[node.input[0]], ts[node.output[0]],
|
||||
# ts[node.input[1]], ts[node.input[2]])
|
||||
|
||||
# elif node.op_type == "Resize":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 4
|
||||
# assert len(node.output) == 1
|
||||
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
|
||||
# [], 'FLOAT')
|
||||
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
|
||||
|
||||
# else:
|
||||
# assert False, "Unsupported op: " + node.op_type
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("model", help="ONNX model file")
|
||||
parser.add_argument("bs", help="batch size", type=int, default=1)
|
||||
# parser.add_argument("--output", help="Output file")
|
||||
args = parser.parse_args()
|
||||
import_onnx(args.model, args.bs)
|
||||
print_result(args.model)
|
|
@ -0,0 +1,588 @@
|
|||
from pyinfinitensor import *
|
||||
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
|
||||
import onnx
|
||||
import onnx.checker
|
||||
import onnx.numpy_helper
|
||||
import onnx.shape_inference
|
||||
|
||||
|
||||
def _add_value_info_for_constants(model: onnx.ModelProto):
|
||||
"""
|
||||
Currently onnx.shape_inference doesn't use the shape of initializers, so add
|
||||
that info explicitly as ValueInfoProtos.
|
||||
Mutates the model.
|
||||
Args:
|
||||
model: The ModelProto to update.
|
||||
"""
|
||||
# All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
|
||||
if model.ir_version < 4:
|
||||
return
|
||||
|
||||
def add_const_value_infos_to_graph(graph: onnx.GraphProto):
|
||||
inputs = {i.name for i in graph.input}
|
||||
existing_info = {vi.name: vi for vi in graph.value_info}
|
||||
for init in graph.initializer:
|
||||
# Check it really is a constant, not an input
|
||||
if init.name in inputs:
|
||||
continue
|
||||
|
||||
# The details we want to add
|
||||
elem_type = init.data_type
|
||||
shape = init.dims
|
||||
|
||||
# Get existing or create new value info for this constant
|
||||
vi = existing_info.get(init.name)
|
||||
if vi is None:
|
||||
vi = graph.value_info.add()
|
||||
vi.name = init.name
|
||||
|
||||
# Even though it would be weird, we will not overwrite info even if it doesn't match
|
||||
tt = vi.type.tensor_type
|
||||
if tt.elem_type == onnx.TensorProto.UNDEFINED:
|
||||
tt.elem_type = elem_type
|
||||
if not tt.HasField("shape"):
|
||||
# Ensure we set an empty list if the const is scalar (zero dims)
|
||||
tt.shape.dim.extend([])
|
||||
for dim in shape:
|
||||
tt.shape.dim.add().dim_value = dim
|
||||
|
||||
# Handle subgraphs
|
||||
for node in graph.node:
|
||||
for attr in node.attribute:
|
||||
# Ref attrs refer to other attrs, so we don't need to do anything
|
||||
if attr.ref_attr_name != "":
|
||||
continue
|
||||
|
||||
if attr.type == onnx.AttributeProto.GRAPH:
|
||||
add_const_value_infos_to_graph(attr.g)
|
||||
if attr.type == onnx.AttributeProto.GRAPHS:
|
||||
for g in attr.graphs:
|
||||
add_const_value_infos_to_graph(g)
|
||||
|
||||
return add_const_value_infos_to_graph(model.graph)
|
||||
|
||||
|
||||
def _parse_attribute(attributes, defaults=dict()):
|
||||
atts = defaults
|
||||
for att in attributes:
|
||||
if att.type == onnx.AttributeProto.INT:
|
||||
atts[att.name] = att.i
|
||||
elif att.type == onnx.AttributeProto.INTS:
|
||||
atts[att.name] = att.ints
|
||||
elif att.type == onnx.AttributeProto.FLOAT:
|
||||
atts[att.name] = att.f
|
||||
elif att.type == onnx.AttributeProto.STRING:
|
||||
atts[att.name] = att.s
|
||||
elif att.type == onnx.AttributeProto.TENSOR:
|
||||
atts[att.name] = att.t
|
||||
else:
|
||||
assert False, "Unsupported Attribute Type: {}".format(att.type)
|
||||
return atts
|
||||
|
||||
|
||||
def _onnx_datatype_tostring(dtype):
|
||||
if dtype == 0:
|
||||
return 'UNDEFINED'
|
||||
elif dtype == 1:
|
||||
return 'FLOAT'
|
||||
elif dtype == 2:
|
||||
return 'UINT8'
|
||||
elif dtype == 3:
|
||||
return 'INT8'
|
||||
elif dtype == 4:
|
||||
return 'UINT16'
|
||||
elif dtype == 5:
|
||||
return 'INT16'
|
||||
elif dtype == 6:
|
||||
return 'INT32'
|
||||
elif dtype == 7:
|
||||
return 'INT64'
|
||||
elif dtype == 8:
|
||||
return 'STRING'
|
||||
elif dtype == 9:
|
||||
return 'BOOL'
|
||||
elif dtype == 10:
|
||||
return 'FLOAT16'
|
||||
elif dtype == 11:
|
||||
return 'DOUBLE'
|
||||
elif dtype == 12:
|
||||
return 'UINT32'
|
||||
elif dtype == 13:
|
||||
return 'UINT64'
|
||||
elif dtype == 14:
|
||||
return 'COMPLEX64'
|
||||
elif dtype == 15:
|
||||
return 'COMPLEX128'
|
||||
elif dtype == 16:
|
||||
return 'BFLOAT16'
|
||||
else:
|
||||
assert False, 'Unknown onnx datatype'
|
||||
|
||||
|
||||
def import_onnx(gf: GraphBuilder, net: str):
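# Rebuild the ONNX model inside InfiniTensor: create a tensor per ONNX value
# via the GraphBuilder, then translate each supported node into the
# corresponding gf.* operator call.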
|
||||
ts, ds, ops, consts = dict(), dict(), dict(), dict() # (key, value) = (name, class)
|
||||
model = onnx.load(net)
|
||||
|
||||
# Tensor_input
|
||||
for input in model.graph.input:
|
||||
if input.name not in ts:
|
||||
dims = [d.dim_value for d in input.type.tensor_type.shape.dim]
|
||||
ts[input.name] = gf.tensor(dims, _onnx_datatype_tostring(
|
||||
input.type.tensor_type.elem_type))
|
||||
ds[input.name] = dims
|
||||
|
||||
# Tensor_weight
|
||||
for weight in model.graph.initializer:
|
||||
if weight.name not in ts:
|
||||
ts[weight.name] = gf.tensor(
|
||||
weight.dims, _onnx_datatype_tostring(weight.data_type))
|
||||
ds[weight.name] = weight.dims
|
||||
|
||||
# Tensor_inference
|
||||
_add_value_info_for_constants(model)
|
||||
infered_model = onnx.shape_inference.infer_shapes(model)
|
||||
for v in infered_model.graph.value_info:
|
||||
if v.name not in ts:
|
||||
dims = [d.dim_value for d in v.type.tensor_type.shape.dim]
|
||||
ts[v.name] = gf.tensor(dims, _onnx_datatype_tostring(
|
||||
v.type.tensor_type.elem_type))
|
||||
ds[v.name] = dims
|
||||
|
||||
# Tensor_output
|
||||
for output in model.graph.output:
|
||||
if output.name not in ts:
|
||||
dims = [d.dim_value for d in output.type.tensor_type.shape.dim]
|
||||
ts[output.name] = gf.tensor(dims, _onnx_datatype_tostring(
|
||||
output.type.tensor_type.elem_type))
|
||||
ds[output.name] = dims
|
||||
|
||||
# Op
|
||||
for node in model.graph.node:
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Conv
|
||||
if node.op_type == 'Conv':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1]})
|
||||
# assert len(node.input) == 2 # bias is not implemented yet
|
||||
assert len(node.output) == 1
|
||||
assert attrs["auto_pad"] == "NOTSET"
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert len(attrs["dilations"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
gf.conv(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
|
||||
attrs["pads"][0], attrs["pads"][1],
|
||||
attrs["strides"][0], attrs["strides"][1],
|
||||
attrs["dilations"][0], attrs["dilations"][1],
|
||||
None)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MatMul
|
||||
elif node.op_type == 'MatMul':
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
dimA = list(ds[node.input[0]])
|
||||
dimB = list(ds[node.input[1]])
|
||||
dimO = list(ds[node.output[0]])
|
||||
|
||||
if len(dimA) == 2 and len(dimB) == 2:
|
||||
tmpI0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
|
||||
tmpI1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
|
||||
tmpO = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
|
||||
gf.matmul(tmpI0, tmpI1, tmpO, False, False, None)
|
||||
|
||||
else:
|
||||
assert len(dimO) >= 3
|
||||
batch = functools.reduce(lambda x, y: x * y, dimO[:-2])
|
||||
|
||||
if len(dimA) == 3:
|
||||
tmpI0 = ts[node.input[0]]
|
||||
else:
|
||||
tmpI0 = gf.tensor([batch, dimA[-2], dimA[-1]], "FLOAT")
|
||||
gf.reshape(ts[node.input[0]], tmpI0, [
|
||||
batch, dimA[-2], dimA[-1]])
|
||||
|
||||
if len(dimB) == 3:
|
||||
tmpI1 = ts[node.input[1]]
|
||||
else:
|
||||
tmpI1 = gf.tensor([batch, dimB[-2], dimB[-1]], "FLOAT")
|
||||
gf.reshape(ts[node.input[1]], tmpI1, [
|
||||
batch, dimB[-2], dimB[-1]])
|
||||
|
||||
if len(dimO) == 3:
|
||||
tmpO = ts[node.output[0]]
|
||||
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
|
||||
else:
|
||||
tmpO = gf.tensor([batch, dimO[-2], dimO[-1]], "FLOAT")
|
||||
gf.matmul(tmpI0, tmpI1, tmpO, False, False)
|
||||
gf.reshape(tmpO, ts[node.output[0]], ds[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ConvTranspose
|
||||
elif node.op_type == 'ConvTranspose':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1],
|
||||
"group": 1})
|
||||
assert len(node.input) == 2 or len(node.input) == 3
|
||||
assert len(node.output) == 1
|
||||
assert attrs["auto_pad"] == "NOTSET"
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert len(attrs["dilations"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
oph, opw = 0, 0
|
||||
if "output_padding" in attrs:
|
||||
oph, opw = attrs["output_padding"][0], attrs["output_padding"][1]
|
||||
assert attrs["output_padding"][0] == attrs["output_padding"][1]
|
||||
gf.convTrans(ts[node.input[0]], ts[node.input[1]], ts[node.output[0]],
|
||||
attrs["pads"][0], attrs["pads"][1],
|
||||
attrs["strides"][0], attrs["strides"][1],
|
||||
attrs["dilations"][0], attrs["dilations"][1],
|
||||
oph, opw, attrs["group"],
|
||||
None if len(node.input) == 2 else ts[node.input[2]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pad
|
||||
elif node.op_type == 'Pad':
|
||||
attrs = _parse_attribute(node.attribute, {'mode': b'constant'})
|
||||
assert attrs["mode"].decode("ascii") == "constant"
|
||||
assert len(attrs["pads"]) % 2 == 0
|
||||
if "constant_value" in attrs:
|
||||
assert attrs["constant_value"] == 0
|
||||
gf.pad(ts[node.input[0]], ts[node.output[0]], attrs["pads"],
|
||||
attrs["axes"] if axes in attrs else None)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Slice
|
||||
elif node.op_type == "Slice":
|
||||
assert 3 <= len(node.input) <= 5
|
||||
assert len(node.output) == 1
|
||||
gf.slice(ts[node.input[0]], ts[node.output[0]],
|
||||
ts[node.input[1]], ts[node.input[2]],
|
||||
ts[node.input[3]] if len(node.input) == 4 else None,
|
||||
ts[node.input[4]] if len(node.input) == 5 else None)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Concat
|
||||
elif node.op_type == 'Concat':
|
||||
attrs = _parse_attribute(node.attribute, {})
|
||||
assert len(node.output) == 1
|
||||
gf.concat([ts[item] for item in node.input],
|
||||
ts[node.output[0]], attrs["axis"])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Split
|
||||
elif node.op_type == "Split":
|
||||
attrs = _parse_attribute(node.attribute, {'axis': 0})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) > 1
|
||||
dim = attrs['axis']
|
||||
num = attrs['num_outputs']
|
||||
gf.split(ts[node.input[0]], [ts[t] for t in node.output], dim, num)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool
|
||||
elif node.op_type == 'MaxPool':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"dilations": [1, 1],
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1]})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
assert len(attrs["kernel_shape"]) == 2
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert len(attrs["dilations"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
gf.maxpool(ts[node.input[0]], ts[node.output[0]],
|
||||
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
|
||||
attrs["dilations"][0], attrs["dilations"][1],
|
||||
attrs["pads"][0], attrs["pads"][1],
|
||||
attrs["strides"][0], attrs["strides"][1])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool
|
||||
# No dilation in ONNX
|
||||
elif node.op_type == 'AveragePool':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"auto_pad": "NOTSET",
|
||||
"count_include_pad": 0,
|
||||
"pads": [0, 0, 0, 0],
|
||||
"strides": [1, 1]})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
# To be consistent with operator.cc
|
||||
assert attrs["count_include_pad"] == 0
|
||||
assert len(attrs["kernel_shape"]) == 2
|
||||
assert len(attrs["pads"]) == 4
|
||||
assert len(attrs["strides"]) == 2
|
||||
assert attrs["pads"][0] == attrs["pads"][2]
|
||||
assert attrs["pads"][1] == attrs["pads"][3]
|
||||
dh, dw = 1, 1
|
||||
gf.avgpool(ts[node.input[0]], ts[node.output[0]],
|
||||
attrs["kernel_shape"][0], attrs["kernel_shape"][1],
|
||||
dh, dw,
|
||||
attrs["pads"][0], attrs["pads"][1],
|
||||
attrs["strides"][0], attrs["strides"][1])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add
|
||||
elif node.op_type == 'Add':
|
||||
print(ds[node.input[0]], ds[node.input[1]], ds[node.output[0]])
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
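# When operand shapes differ, substitute a fresh tensor of the output shape
# for the mismatched operand (a stand-in for broadcasting).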
if ds[node.input[0]] == ds[node.input[1]]:
|
||||
gf.add(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]])
|
||||
elif ds[node.input[0]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.add(ts[node.input[0]], tmp, ts[node.output[0]])
|
||||
elif ds[node.input[1]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.add(tmp, ts[node.input[1]], ts[node.output[0]])
|
||||
else:
|
||||
assert False
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub
|
||||
elif node.op_type == 'Sub':
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
if ds[node.input[0]] == ds[node.input[1]]:
|
||||
gf.sub(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]])
|
||||
elif ds[node.input[0]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.sub(ts[node.input[0]], tmp, ts[node.output[0]])
|
||||
elif ds[node.input[1]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.sub(tmp, ts[node.input[1]], ts[node.output[0]])
|
||||
else:
|
||||
assert False
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Mul
|
||||
elif node.op_type == 'Mul':
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
if ds[node.input[0]] == ds[node.input[1]]:
|
||||
gf.mul(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]])
|
||||
elif ds[node.input[0]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.mul(ts[node.input[0]], tmp, ts[node.output[0]])
|
||||
elif ds[node.input[1]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.mul(tmp, ts[node.input[1]], ts[node.output[0]])
|
||||
else:
|
||||
assert False
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Div
|
||||
elif node.op_type == 'Div':
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
if ds[node.input[0]] == ds[node.input[1]]:
|
||||
gf.div(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]])
|
||||
elif ds[node.input[0]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.div(ts[node.input[0]], tmp, ts[node.output[0]])
|
||||
elif ds[node.input[1]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.div(tmp, ts[node.input[1]], ts[node.output[0]])
|
||||
else:
|
||||
assert False
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Pow
|
||||
elif node.op_type == 'Pow':
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
if ds[node.input[0]] == ds[node.input[1]]:
|
||||
gf.pow(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]])
|
||||
elif ds[node.input[0]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.pow(ts[node.input[0]], tmp, ts[node.output[0]])
|
||||
elif ds[node.input[1]] == ds[node.output[0]]:
|
||||
tmp = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.pow(tmp, ts[node.input[1]], ts[node.output[0]])
|
||||
else:
|
||||
assert False
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gather
|
||||
elif node.op_type == 'Gather':
|
||||
attrs = _parse_attribute(node.attribute, {"axis": 0})
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
gf.gather(ts[node.input[0]], ts[node.input[1]],
|
||||
ts[node.output[0]], attrs["axis"])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Reshape
|
||||
elif node.op_type == 'Reshape':
|
||||
attrs = _parse_attribute(node.attribute, {"allowzero": 0})
|
||||
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
gf.reshape(ts[node.input[0]],
|
||||
ts[node.output[0]], ds[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Flatten
|
||||
# Output is 2D in ONNX
|
||||
elif node.op_type == 'Flatten':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.flatten(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity
|
||||
elif node.op_type == 'Identity':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.identity(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Softmax
|
||||
elif node.op_type == 'Softmax':
|
||||
attrs = _parse_attribute(node.attribute, {"axis": -1})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
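# Note: the parsed "axis" attribute is not forwarded; the builder's
# softmax only takes the input and output tensors.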
gf.softmax(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu
|
||||
elif node.op_type == 'Relu':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.relu(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# TODO: Sqrt is not implemented in the builder yet; Relu is used here as a placeholder
|
||||
elif node.op_type == 'Sqrt':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.relu(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid
|
||||
elif node.op_type == 'Sigmoid':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.sigmoid(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tanh
|
||||
elif node.op_type == 'Tanh':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.tanh(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Abs
|
||||
elif node.op_type == 'Abs':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.abs(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Cast
|
||||
# Ignore for now (TODO)
|
||||
elif node.op_type == 'Cast':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.identity(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Constant
|
||||
elif node.op_type == 'Constant':
|
||||
attrs = _parse_attribute(node.attribute, {})
|
||||
assert len(node.output) == 1
|
||||
c = onnx.numpy_helper.to_array(attrs["value"])
|
||||
if c.ndim == 0:
|
||||
c = c[()]
|
||||
consts[node.output[0]] = c
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm
|
||||
elif node.op_type == 'Gemm':
|
||||
attrs = _parse_attribute(node.attribute, {
|
||||
"alpha": 1.0,
|
||||
"beta": 1.0,
|
||||
"transA": 0,
|
||||
"transB": 0})
|
||||
assert len(node.input) == 2 or len(node.input) == 3
|
||||
assert len(node.output) == 1
|
||||
assert attrs["alpha"] == 1.0
|
||||
assert attrs["beta"] == 1.0 or len(node.input) == 2
|
||||
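# Gemm is lowered to a batched matmul with a leading batch dim of 1:
# reshape both inputs to rank 3, run matmul, reshape back to 2-D, then
# add the bias via a placeholder tensor of the output shape.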
i0 = gf.tensor([1] + list(ds[node.input[0]]), "FLOAT")
|
||||
i1 = gf.tensor([1] + list(ds[node.input[1]]), "FLOAT")
|
||||
o0 = gf.tensor([1] + list(ds[node.output[0]]), "FLOAT")
|
||||
gf.reshape(ts[node.input[0]], i0, [1] + list(ds[node.input[0]]))
|
||||
gf.reshape(ts[node.input[1]], i1, [1] + list(ds[node.input[1]]))
|
||||
gf.matmul(i0, i1, o0, attrs["transA"], attrs["transB"])
|
||||
o1 = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
a0 = gf.tensor(ds[node.output[0]], "FLOAT")
|
||||
gf.reshape(o0, o1, ds[node.output[0]])
|
||||
gf.add(o1, a0, ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#GlobalAveragePool
|
||||
# elif node.op_type == 'GlobalAveragePool':
|
||||
# assert len(node.input) == 1
|
||||
# assert len(node.output) == 1
|
||||
# dims = ds[node.input[0]]
|
||||
# if len(dims) > 0:
|
||||
# g.avgpool(ts[node.input[0]], ts[node.output[0]], dims[2], dims[3], 0, 0, 1, 1)
|
||||
# else:
|
||||
# g.avgpool(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#ReduceMean
|
||||
elif node.op_type == 'ReduceMean':
|
||||
attrs = _parse_attribute(node.attribute, {'keepdims': 1})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
assert len(attrs["axes"]) == 1
|
||||
axis = attrs["axes"][0]
|
||||
print(axis, len(ds[node.input[0]]))
|
||||
if axis < 0:
|
||||
axis = len(ds[node.input[0]]) + axis
|
||||
gf.reduceMean(ts[node.input[0]], ts[node.output[0]], axis)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape
|
||||
# Ignore for now, and no need to output anything (TODO)
|
||||
# elif node.op_type == 'Shape':
|
||||
# pass
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose
|
||||
elif node.op_type == 'Transpose':
|
||||
attrs = _parse_attribute(node.attribute, {})
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
assert "perm" in attrs
|
||||
gf.transpose(ts[node.input[0]], ts[node.output[0]], attrs["perm"])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Unsqueeze
|
||||
elif node.op_type == 'Unsqueeze':
|
||||
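# Unsqueeze is lowered to a reshape to the inferred output shape; the
# axes input (input[1]) is not inspected here.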
assert len(node.input) == 2
|
||||
assert len(node.output) == 1
|
||||
gf.reshape(ts[node.input[0]],
|
||||
ts[node.output[0]], ds[node.output[0]])
|
||||
|
||||
# TODO
|
||||
elif node.op_type == 'Erf':
|
||||
assert len(node.input) == 1
|
||||
assert len(node.output) == 1
|
||||
gf.erf(ts[node.input[0]], ts[node.output[0]])
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#BatchNormalization
|
||||
# elif node.op_type == "BatchNormalization":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 5
|
||||
# assert len(node.output) == 1
|
||||
# epsilon = attrs['epsilon'] if 'epsilon' in attrs else 1e-5
|
||||
# momentum = attrs['momentum'] if 'momentum' in attrs else 0.9
|
||||
# g.batchnorm(ts[node.input[0]], ts[node.input[1]],
|
||||
# ts[node.input[2]], ts[node.input[3]],
|
||||
# ts[node.input[4]], ts[node.output[0]],
|
||||
# epsilon, momentum)
|
||||
|
||||
# https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
|
||||
# elif node.op_type == "Resize":
|
||||
# attrs = _parse_attribute(node.attribute, {})
|
||||
# assert len(node.input) == 4
|
||||
# assert len(node.output) == 1
|
||||
# roi = ts[node.input[1]] if node.input[1] != '' else g.tensor(
|
||||
# [], 'FLOAT')
|
||||
# g.resize(ts[node.input[0]], roi, ts[node.output[0]])
|
||||
|
||||
else:
|
||||
assert False, "Unsupported op: " + node.op_type
|
|
@ -1,10 +1,11 @@
|
|||
import argparse
|
||||
import pyinfinitensor # import getPerfConv, getPerfMatmul
|
||||
|
||||
|
||||
def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, name=""):
|
||||
def getPerfConv(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group):
|
||||
return pyinfinitensor.getPerfConvCudnn(n, c, h, w, f, r, s, padh, padw,
|
||||
strideh, stridew, dilationh, dilationw, group, name)
|
||||
strideh, stridew, dilationh, dilationw, group)
|
||||
|
||||
|
||||
def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, oph, opw, group):
|
||||
|
@ -13,3 +14,43 @@ def getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, padh, padw, strideh, strid
|
|||
|
||||
def getPerfMatmul(b, m, n, k, name=""):
|
||||
return pyinfinitensor.getPerfMatmulCublas(b, m, n, k, name)
|
||||
|
||||
|
||||
def getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias: bool, act="None"):
|
||||
return pyinfinitensor.getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw,
|
||||
strideh, stridew, dilationh, dilationw, group, bias, act)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Process some integers.')
|
||||
parser.add_argument('op', metavar='operator', type=str)
|
||||
parser.add_argument('shape', nargs='+')
|
||||
parser.add_argument('--pad', type=int, default=0)
|
||||
parser.add_argument('--stride', type=int, default=1)
|
||||
parser.add_argument('--dilation', type=int, default=1)
|
||||
parser.add_argument('--group', type=int, default=1)
|
||||
parser.add_argument('--bias', action='store_true')
|
||||
parser.add_argument('--act', type=str, default="None")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
if args.op == 'gemm':
|
||||
t = getPerfMatmul(int(args.shape[0]), int(
|
||||
args.shape[1]), int(args.shape[2]), int(args.shape[3]))
|
||||
print(
|
||||
f'time {t:.3f} ms, {2*int(args.shape[0])*int(args.shape[1])*int(args.shape[2])*int(args.shape[3])/t/1e9:.3f} TFLOPS')
|
||||
elif args.op == 'conv':
|
||||
assert len(args.shape) == 7
|
||||
n, c, h, w, f, r, s = [int(v) for v in args.shape]
|
||||
padh = padw = int(args.pad)
|
||||
strideh = stridew = int(args.stride)
|
||||
dilationh = dilationw = int(args.dilation)
|
||||
group = int(args.group)
|
||||
bias = int(args.bias)
|
||||
act = args.act
|
||||
assert group==1, "Unsupported"
|
||||
t = pyinfinitensor.getPerfConvBiasActCudnn(
|
||||
n, c, h, w, f, r, s, padh, padw, strideh, stridew, dilationh, dilationw, group, bias, act)
|
||||
print(
|
||||
f'time {t:.3f} ms, {n*c*h*w*f*r*s/strideh/stridew*2/10**9:.3f} TFlops')
|
||||
else:
|
||||
assert False, "Not supported"
|
||||
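# Example invocations (assuming the pyinfinitensor extension is built and
# importable; the shapes below are illustrative only):
#   python operator_timer.py gemm 1 4096 4096 4096
#   python operator_timer.py conv 16 256 14 14 256 3 3 --pad 1 --act Relu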
|
|
|
@ -0,0 +1,84 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
from operator_timer import *
|
||||
from datetime import datetime
|
||||
|
||||
pd.options.display.float_format = '{:,.3f}'.format
|
||||
|
||||
df= pd.DataFrame(columns=['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group'])
|
||||
def conv_original(name, n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group):
|
||||
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, group
|
||||
df.loc[name, 't_original'] = getPerfConv(n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group)
|
||||
df.loc[name, 't_bias'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group, bias=True)
|
||||
df.loc[name, 't_bias_relu'] = getPerfConvBiasActCudnn(n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group, bias=True, act="Relu")
|
||||
|
||||
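# The rules below give rough cost estimates for kernel-decomposition
# rewrites that replace a 5x5 (or 9x9) conv with a bank of 3x3 convs; the
# output-channel count is scaled so that the timed 3x3 conv approximates
# the cost of the rewritten graph.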
def conv_rule_5x5_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group):
|
||||
col = 't_5x5_to_3x3'
|
||||
if r == 5 and s == 5:
|
||||
df.loc[name, col] = getPerfConv(n, c, h, w, f*4, 3, 3, ph, pw,
|
||||
sh, sw, dh, dw, group)
|
||||
else:
|
||||
df.loc[name, col] = np.inf
|
||||
|
||||
def conv_rule_9x9_to_3x3(name, n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group):
|
||||
col = 't_9x9_to_3x3'
|
||||
if r == 9 and s == 9:
|
||||
df.loc[name, col] = getPerfConv(n, c, h, w, f*9, r//3, s//3, ph, pw,
|
||||
sh, sw, dh, dw, group)
|
||||
else:
|
||||
df.loc[name, col] = np.inf
|
||||
|
||||
bandwidth = 200 * 10**6  # bytes per ms (i.e. 200 GB/s)
|
||||
|
||||
def conv_rule_conv2gemm(name, n, c, h, w, f, r, s, ph, pw,
|
||||
sh, sw, dh, dw, group):
|
||||
col = 't_conv2gemm'
|
||||
if [sh, sw, dh, dw, group] == [1] * 5:
|
||||
# b = group
|
||||
# m = batch_size * input_height * input_width
|
||||
# n = output_channel * kernel_height * kernel_width
|
||||
# k = input_channel // group
|
||||
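# t_reduce estimates the extra memory-bound pass that reduces the r*s
# partial outputs back into the output layout, using the bandwidth figure
# above; it is skipped for 1x1 kernels.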
t_reduce= group*n*h*w*f*r*s*4/bandwidth if r>1 or s>1 else 0
|
||||
df.loc[name, '_'+col+'_mem'] = t_reduce
|
||||
df.loc[name, col] = getPerfMatmul(group, n*h*w, f*r*s, c//group) + t_reduce
|
||||
else:
|
||||
df.loc[name, col] = np.inf
|
||||
|
||||
# conv_rules=[conv_original, conv_rule_9x9_to_3x3, conv_rule_5x5_to_3x3, conv_rule_conv2gemm]
|
||||
conv_rules=[conv_original]
|
||||
|
||||
def conv_transposed2d_original(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
|
||||
df.loc[name, ['n', 'c', 'h', 'w', 'f', 'r', 's', 'ph', 'pw', 'sh', 'sw', 'dh', 'dw', 'oph', 'opw', 'group']] = n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group
|
||||
df.loc[name, 't_original'] = getPerfConvTransposed2dCudnn(n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group)
|
||||
|
||||
def conv_transposed2d_togemm(name, n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw, oph, opw, group):
|
||||
col = 't_conv2gemm'
|
||||
if [dh, dw, group] == [1] * 3:
|
||||
# ConvTranspose2gemm
|
||||
# b = 1
|
||||
# m = batch_size * input_height*input_width
|
||||
# n = output_channel*kernel_height*kernel_width
|
||||
# k = input_channel
|
||||
t_reduce= n*h*w*c*r*s*4/bandwidth if r>1 or s>1 else 0
|
||||
df.loc[name, '_'+col+'_mem'] = t_reduce
|
||||
print('t_conv2gemm', group, n*h*w, c*r*s, f)
|
||||
df.loc[name, col] = getPerfMatmul(group, n*h*w, c*r*s, f) + t_reduce
|
||||
else:
|
||||
df.loc[name, col] = np.inf
|
||||
|
||||
conv_transposed2d_rules = [conv_transposed2d_original, conv_transposed2d_togemm]
|
||||
|
||||
def print_result(model_fn):
|
||||
pd.set_option('display.max_rows', 500)
|
||||
df['t_min'] = df.filter(regex=("^t_.*")).min(axis=1)
|
||||
print(df)
|
||||
print(f'Origin: {df["t_original"].sum():.3f} ms')
|
||||
print(f'Min: {df["t_min"].sum():.3f} ms')
|
||||
print(f'Speedup: {df["t_original"].sum()/df["t_min"].sum():.3f} x')
|
||||
df.to_pickle(f'optime_{model_fn.split("/")[-1]}_{datetime.now().strftime("%m_%d_%H_%M_%S")}.pkl')
|
|
@ -0,0 +1,5 @@
|
|||
import pytest
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
retcode = pytest.main()
|
|
@ -0,0 +1,16 @@
|
|||
from pyinfinitensor import *
|
||||
from infinitensor import import_onnx
|
||||
|
||||
|
||||
class Test_ImportOnnx:
|
||||
def test_Netname(self):
|
||||
runtime = CpuRuntimeObj.getInstance()
|
||||
graphBuilder = GraphBuilderObj(runtime)
|
||||
import_onnx(graphBuilder, '/home/mazx/git/pf-models/bert.bs1.onnx')
|
||||
|
||||
|
||||
class Test_SARDRN:
|
||||
def test_Netname(self):
|
||||
runtime = CpuRuntimeObj.getInstance()
|
||||
graphBuilder = GraphBuilderObj(runtime)
|
||||
import_onnx(graphBuilder, '/home/mazx/git/pf-models/sardrn.bs1.onnx')
|
|
@ -0,0 +1,545 @@
|
|||
#include "core/graph_builder.h"
|
||||
|
||||
namespace infini {
|
||||
|
||||
Tensor GraphBuilderObj::tensor(Shape dim, const std::string &dtype) {
|
||||
if (dtype == "FLOAT") {
|
||||
return g->addTensor(dim, DataType::Float32);
|
||||
}
|
||||
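// Note: both INT32 and INT64 are currently lowered to UInt32 tensors;
// only FLOAT maps to a distinct element type here.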
if (dtype == "INT32") {
|
||||
return g->addTensor(dim, DataType::UInt32);
|
||||
}
|
||||
if (dtype == "INT64") {
|
||||
return g->addTensor(dim, DataType::UInt32);
|
||||
}
|
||||
IT_TODO_HALT_MSG("Unsupported data type");
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh, int sw, int dh, int dw,
|
||||
Tensor bias) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<ConvObj>(i0, w0, o0, ph, ph, sh, sw, dh, dw, bias);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, int ph, int pw,
|
||||
int sh, int sw, int dh, int dw, Tensor bias) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
auto op = g->addOp<ConvObj>(i0, w0, nullptr, ph, ph, sh, sw, dh, dw, bias);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::conv(Tensor input, Tensor weight, Tensor output,
|
||||
ConvBaseObj::PaddingMode pm, int sh, int sw,
|
||||
int dh, int dw, Tensor bias) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<ConvObj>(i0, w0, o0, pm, sh, sw, dh, dw, bias);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::conv(Tensor input, Tensor weight,
|
||||
ConvBaseObj::PaddingMode pm, int sh, int sw,
|
||||
int dh, int dw, Tensor bias) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
auto op = g->addOp<ConvObj>(i0, w0, nullptr, pm, sh, sw, dh, dw, bias);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, Tensor C, bool transA,
|
||||
bool transB) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
|
||||
auto op = g->addOpWithOutputs<MatmulObj>(i0, i1, o0, transA, transB);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::matmul(Tensor A, Tensor B, bool transA, bool transB) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
auto op = g->addOp<MatmulObj>(i0, i1, nullptr, transA, transB);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh, int sw, int dh,
|
||||
int dw, int oph, int opw, int group,
|
||||
Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
|
||||
i0, w0, o0, ph, pw, sh, sw, dh, dw, oph, opw, group, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, int ph, int pw,
|
||||
int sh, int sw, int dh, int dw, int oph,
|
||||
int opw, int group, Tensor bias,
|
||||
ActType act) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, ph, pw, sh, sw, dh,
|
||||
dw, oph, opw, group, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight, Tensor output,
|
||||
ConvBaseObj::PaddingMode pm, int sh, int sw,
|
||||
int dh, int dw, int oph, int opw, int group,
|
||||
Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ConvTransposed2dObj>(
|
||||
i0, w0, o0, pm, sh, sw, dh, dw, oph, opw, group, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::convTrans(Tensor input, Tensor weight,
|
||||
ConvBaseObj::PaddingMode pm, int sh, int sw,
|
||||
int dh, int dw, int oph, int opw, int group,
|
||||
Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor w0 = g->addTensor(weight->getDims(), weight->getDType());
|
||||
auto op = g->addOp<ConvTransposed2dObj>(i0, w0, nullptr, pm, sh, sw, dh, dw,
|
||||
oph, opw, group, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, Tensor C, const int width,
|
||||
const int dilation, Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<G2BMMObj>(i0, i1, o0, width, dilation, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::g2bmm(Tensor A, Tensor B, const int width,
|
||||
const int dilation, Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
auto op = g->addOp<G2BMMObj>(i0, i1, nullptr, width, dilation, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, Tensor C,
|
||||
const int dilation, Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
Tensor o0 = g->addTensor(C->getDims(), C->getDType());
|
||||
auto op = g->addOpWithOutputs<GBMMObj>(i0, i1, o0, dilation, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::gbmml(Tensor A, Tensor B, const int dilation,
|
||||
Tensor bias, ActType act) {
|
||||
Tensor i0 = g->addTensor(A->getDims(), A->getDType());
|
||||
Tensor i1 = g->addTensor(B->getDims(), B->getDType());
|
||||
auto op = g->addOp<GBMMObj>(i0, i1, nullptr, dilation, bias, act);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::pad(Tensor input, Tensor output,
|
||||
const vector<int> &pads,
|
||||
const optional<const vector<int>> &axis) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<PadObj>(i0, o0, pads, axis);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::pad(Tensor input, const vector<int> &pads,
|
||||
const optional<const vector<int>> &axis) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<PadObj>(i0, nullptr, pads, axis);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::slice(Tensor input, Tensor output,
|
||||
const vector<int> &starts,
|
||||
const vector<int> &ends,
|
||||
const optional<const vector<int>> &axis,
|
||||
const optional<const vector<int>> &steps) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<SliceObj>(i0, o0, starts, ends, axis, steps);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::slice(Tensor input, const vector<int> &starts,
|
||||
const vector<int> &ends,
|
||||
const optional<const vector<int>> &axis,
|
||||
const optional<const vector<int>> &steps) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<SliceObj>(i0, nullptr, starts, ends, axis, steps);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::concat(TensorVec inputs, Tensor output, int dim) {
|
||||
TensorVec is;
|
||||
for (auto input : inputs) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
is.push_back(i);
|
||||
}
|
||||
Tensor o = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ConcatObj>(is, o, dim);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::concat(TensorVec inputs, int dim) {
|
||||
TensorVec is;
|
||||
for (auto input : inputs) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
is.push_back(i);
|
||||
}
|
||||
auto op = g->addOp<ConcatObj>(is, nullptr, dim);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||
int dim, int num) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
if (outputs.has_value()) {
|
||||
TensorVec os;
|
||||
for (auto output : outputs.value()) {
|
||||
Tensor o = g->addTensor(output->getDims(), output->getDType());
|
||||
os.push_back(o);
|
||||
}
|
||||
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, num);
|
||||
return op;
|
||||
} else {
|
||||
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
|
||||
return op;
|
||||
}
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::split(Tensor input, int dim, int num) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, num);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::split(Tensor input, std::optional<TensorVec> outputs,
|
||||
int dim, const vector<int> &ratio) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
if (outputs.has_value()) {
|
||||
TensorVec os;
|
||||
for (auto output : outputs.value()) {
|
||||
Tensor o = g->addTensor(output->getDims(), output->getDType());
|
||||
os.push_back(o);
|
||||
}
|
||||
auto op = g->addOpWithOutputs<SplitObj>(i, os, dim, ratio);
|
||||
return op;
|
||||
} else {
|
||||
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
|
||||
return op;
|
||||
}
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::split(Tensor input, int dim,
|
||||
const vector<int> &ratio) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<SplitObj>(i, std::nullopt, dim, ratio);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::transpose(Tensor input, Tensor output,
|
||||
const Shape &perm) {
|
||||
Tensor i = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<TransposeObj>(i, o, perm);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::extend(Tensor input, Tensor output, int dim,
|
||||
int num) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ExtendObj>(i0, o0, dim, num);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::extend(Tensor input, int dim, int num) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<ExtendObj>(i0, nullptr, dim, num);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::maxpool(Tensor input, Tensor output, int kh, int kw,
|
||||
int dh, int dw, int ph, int pw, int sh,
|
||||
int sw) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<MaxPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::maxpool(Tensor input, int kh, int kw, int dh, int dw,
|
||||
int ph, int pw, int sh, int sw) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<MaxPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::avgpool(Tensor input, Tensor output, int kh, int kw,
|
||||
int dh, int dw, int ph, int pw, int sh,
|
||||
int sw) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<AvgPoolObj>(i0, o0, kh, kw, dh, dw, ph, pw, sh, sw);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::avgpool(Tensor input, int kh, int kw, int dh, int dw,
|
||||
int ph, int pw, int sh, int sw) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<AvgPoolObj>(i0, nullptr, kh, kw, dh, dw, ph, pw, sh, sw);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::add(Tensor input0, Tensor input1, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<AddObj>(i0, i1, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::add(Tensor input0, Tensor input1) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
auto op = g->addOp<AddObj>(i0, i1, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<SubObj>(i0, i1, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::sub(Tensor input0, Tensor input1) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
auto op = g->addOp<SubObj>(i0, i1, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<MulObj>(i0, i1, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::mul(Tensor input0, Tensor input1) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
auto op = g->addOp<MulObj>(i0, i1, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::div(Tensor input0, Tensor input1, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<DivObj>(i0, i1, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::div(Tensor input0, Tensor input1) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
auto op = g->addOp<DivObj>(i0, i1, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<PowObj>(i0, i1, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::pow(Tensor input0, Tensor input1) {
|
||||
Tensor i0 = g->addTensor(input0->getDims(), input0->getDType());
|
||||
Tensor i1 = g->addTensor(input1->getDims(), input1->getDType());
|
||||
auto op = g->addOp<PowObj>(i0, i1, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::gather(Tensor input, Tensor index, Tensor output,
|
||||
int axis) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<GatherObj>(i0, index, o0, axis);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::gather(Tensor input, Tensor index, int axis) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<GatherObj>(i0, index, nullptr, axis);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::reshape(Tensor input, Tensor output,
|
||||
const Shape &dims) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ReshapeObj>(i0, o0, dims);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::reshape(Tensor input, const Shape &dims) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<ReshapeObj>(i0, nullptr, dims);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::flatten(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<FlattenObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::flatten(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<FlattenObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::identity(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<IdentityObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::identity(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<IdentityObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::softmax(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<SoftmaxObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::softmax(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<SoftmaxObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::relu(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ReluObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::relu(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<ReluObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::sigmoid(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<SigmoidObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::sigmoid(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<SigmoidObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::tanh(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<TanhObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::tanh(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<TanhObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::abs(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<AbsObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::abs(Tensor input) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
auto op = g->addOp<AbsObj>(i0, nullptr);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::reduceMean(Tensor input, Tensor output, int axis) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op =
|
||||
g->addOpWithOutputs<ReduceMeanObj>(i0, o0, std::vector<int>({axis}));
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::erf(Tensor input, Tensor output) {
|
||||
Tensor i0 = g->addTensor(input->getDims(), input->getDType());
|
||||
Tensor o0 = g->addTensor(output->getDims(), output->getDType());
|
||||
auto op = g->addOpWithOutputs<ErfObj>(i0, o0);
|
||||
return op;
|
||||
}
|
||||
|
||||
Operator GraphBuilderObj::memBound(const TensorVec &inputs,
|
||||
const TensorVec &outputs,
|
||||
const std::vector<nnet::Tensor> &nnetInputs,
|
||||
nnet::Expr expr, double exec_time,
|
||||
std::string hint) {
|
||||
TensorVec is;
|
||||
for (auto input : inputs) {
|
||||
auto i = g->addTensor(input->getDims(), input->getDType());
|
||||
is.push_back(i);
|
||||
}
|
||||
TensorVec os;
|
||||
for (auto output : outputs) {
|
||||
auto o = g->addTensor(output->getDims(), output->getDType());
|
||||
os.push_back(o);
|
||||
}
|
||||
auto op = g->addOpWithOutputs<MemBoundObj>(is, os, nnetInputs, expr,
|
||||
exec_time, hint);
|
||||
return op;
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -1,3 +1,4 @@
|
|||
#include "cuda/operator_timer.h"
|
||||
#include "core/graph.h"
|
||||
#include "core/kernel.h"
|
||||
#include "core/runtime.h"
|
||||
|
@ -12,8 +13,23 @@ namespace opTimer {
|
|||
|
||||
double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
|
||||
int padh, int padw, int strideh, int stridew,
|
||||
int dilationh, int dilationw, int group,
|
||||
const char *name) {
|
||||
int dilationh, int dilationw, int group) {
|
||||
return getPerfConvBiasActCudnn(n, c, h, w, f, r, s, padh, padw, strideh,
|
||||
stridew, dilationh, dilationw, group, false,
|
||||
"None");
|
||||
}
|
||||
|
||||
double getPerfConvBiasActCudnn(int n, int c, int h, int w, int f, int r, int s,
|
||||
int padh, int padw, int strideh, int stridew,
|
||||
int dilationh, int dilationw, int group,
|
||||
bool bias, string actName) {
|
||||
ActType act = ActType::None;
|
||||
if (actName == "None")
|
||||
act = ActType::None;
|
||||
else if (actName == "Relu")
|
||||
act = ActType::Relu;
|
||||
else
|
||||
IT_ASSERT(false, "Unsupported activation");
|
||||
// const auto &[n, c, h, w, f, r, s, padh, padw, strideh, stridew,
|
||||
// dilationh, dilationw, group] =
|
||||
// tuple{1, 512, 14, 14, 512, 3, 3, 2, 2, 1, 1, 2, 2, 1};
|
||||
|
@ -25,17 +41,27 @@ double getPerfConvCudnn(int n, int c, int h, int w, int f, int r, int s,
|
|||
IT_ASSERT(c % group == 0);
|
||||
Tensor i0Cpu = gCpu->addTensor({n, c, h, w}, DataType::Float32);
|
||||
Tensor w0Cpu = gCpu->addTensor({f, c / group, r, s}, DataType::Float32);
|
||||
Tensor b0Cpu = gCpu->addTensor({f}, DataType::Float32);
|
||||
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||
gCpu->dataMalloc();
|
||||
i0Cpu->setData(IncrementalGenerator());
|
||||
w0Cpu->setData(IncrementalGenerator());
|
||||
b0Cpu->setData(IncrementalGenerator());
|
||||
|
||||
// Copy input tensors from CPU to CUDA
|
||||
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
|
||||
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
|
||||
Tensor b0Cuda = gCuda->cloneTensor(b0Cpu);
|
||||
// Build CUDA graph
|
||||
auto conv = gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw,
|
||||
strideh, stridew, dilationh, dilationw);
|
||||
if (!bias) {
|
||||
auto conv =
|
||||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
|
||||
stridew, dilationh, dilationw);
|
||||
} else {
|
||||
auto conv =
|
||||
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, padh, padw, strideh,
|
||||
stridew, dilationh, dilationw, b0Cuda, act);
|
||||
}
|
||||
// allocate CUDA memory
|
||||
gCuda->dataMalloc();
|
||||
// Execute on CUDA
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#ifdef USE_CUDA
|
||||
#include "cuda/operator_timer.h"
|
||||
#endif
|
||||
#include "core/graph_builder.h"
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace infini {
|
||||
|
@ -13,11 +14,193 @@ void register_operator_timer(py::module &m) {
|
|||
#ifdef USE_CUDA
|
||||
using namespace opTimer;
|
||||
m.def("getPerfConvCudnn", &getPerfConvCudnn);
|
||||
m.def("getPerfConvBiasActCudnn", &getPerfConvBiasActCudnn);
|
||||
m.def("getPerfConvTransposed2dCudnn", &getPerfConvTransposed2dCudnn);
|
||||
m.def("getPerfMatmulCublas", &getPerfMatmulCublas);
|
||||
#endif
|
||||
}
|
||||
|
||||
void init_graph_builder(py::module &m) {
|
||||
py::class_<RuntimeObj, std::shared_ptr<RuntimeObj>>(m, "RuntimeObj");
|
||||
py::class_<CpuRuntimeObj, std::shared_ptr<CpuRuntimeObj>, RuntimeObj>(
|
||||
m, "CpuRuntimeObj")
|
||||
.def(py::init<>())
|
||||
.def("getInstance", py::overload_cast<>(&CpuRuntimeObj::getInstance),
|
||||
policy::reference_internal);
|
||||
py::class_<Shape>(m, "Shape");
|
||||
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "TensorObj");
|
||||
py::class_<Tensor>(m, "Tensor");
|
||||
py::class_<TensorVec>(m, "TensorVec");
|
||||
py::class_<OperatorObj, std::shared_ptr<OperatorObj>>(m, "OperatorObj");
|
||||
py::class_<Operator>(m, "Operator");
|
||||
py::class_<ActType>(m, "ActType");
|
||||
py::class_<ConvObj, std::shared_ptr<ConvObj>, OperatorObj>(m, "ConvObj");
|
||||
py::class_<MatmulObj, std::shared_ptr<MatmulObj>, OperatorObj>(m,
|
||||
"MatmulObj");
|
||||
py::class_<ConvTransposed2dObj, std::shared_ptr<ConvTransposed2dObj>,
|
||||
OperatorObj>(m, "ConvTransposed2dObj");
|
||||
py::class_<G2BMMObj, std::shared_ptr<G2BMMObj>, OperatorObj>(m, "G2BMMObj");
|
||||
py::class_<GBMMObj, std::shared_ptr<GBMMObj>, OperatorObj>(m, "GBMMObj");
|
||||
py::class_<PadObj, std::shared_ptr<PadObj>, OperatorObj>(m, "PadObj");
|
||||
py::class_<SliceObj, std::shared_ptr<SliceObj>, OperatorObj>(m, "SliceObj");
|
||||
py::class_<ConcatObj, std::shared_ptr<ConcatObj>, OperatorObj>(m,
|
||||
"ConcatObj");
|
||||
py::class_<SplitObj, std::shared_ptr<SplitObj>, OperatorObj>(m, "SplitObj");
|
||||
py::class_<TransposeObj, std::shared_ptr<TransposeObj>, OperatorObj>(
|
||||
m, "TransposeObj");
|
||||
py::class_<ExtendObj, std::shared_ptr<ExtendObj>, OperatorObj>(m,
|
||||
"ExtendObj");
|
||||
py::class_<MaxPoolObj, std::shared_ptr<MaxPoolObj>, OperatorObj>(
|
||||
m, "MaxPoolObj");
|
||||
py::class_<AvgPoolObj, std::shared_ptr<AvgPoolObj>, OperatorObj>(
|
||||
m, "AvgPoolObj");
|
||||
py::class_<AddObj, std::shared_ptr<AddObj>, OperatorObj>(m, "AddObj");
|
||||
py::class_<SubObj, std::shared_ptr<SubObj>, OperatorObj>(m, "SubObj");
|
||||
py::class_<MulObj, std::shared_ptr<MulObj>, OperatorObj>(m, "MulObj");
|
||||
py::class_<DivObj, std::shared_ptr<DivObj>, OperatorObj>(m, "DivObj");
|
||||
py::class_<PowObj, std::shared_ptr<PowObj>, OperatorObj>(m, "PowObj");
|
||||
py::class_<GatherObj, std::shared_ptr<GatherObj>, OperatorObj>(m,
|
||||
"GatherObj");
|
||||
py::class_<ReshapeObj, std::shared_ptr<ReshapeObj>, OperatorObj>(
|
||||
m, "ReshapeObj");
|
||||
py::class_<FlattenObj, std::shared_ptr<FlattenObj>, OperatorObj>(
|
||||
m, "FlattenObj");
|
||||
py::class_<IdentityObj, std::shared_ptr<IdentityObj>, OperatorObj>(
|
||||
m, "IdentityObj");
|
||||
py::class_<SoftmaxObj, std::shared_ptr<SoftmaxObj>, OperatorObj>(
|
||||
m, "SoftmaxObj");
|
||||
py::class_<ReluObj, std::shared_ptr<ReluObj>, OperatorObj>(m, "ReluObj");
|
||||
py::class_<SigmoidObj, std::shared_ptr<SigmoidObj>, OperatorObj>(
|
||||
m, "SigmoidObj");
|
||||
py::class_<TanhObj, std::shared_ptr<TanhObj>, OperatorObj>(m, "TanhObj");
|
||||
py::class_<AbsObj, std::shared_ptr<AbsObj>, OperatorObj>(m, "AbsObj");
|
||||
py::class_<ReduceMeanObj, std::shared_ptr<ReduceMeanObj>, OperatorObj>(
|
||||
m, "ReduceMeanObj");
|
||||
py::class_<MemBoundObj, std::shared_ptr<MemBoundObj>, OperatorObj>(
|
||||
m, "MemBoundObj");
|
||||
py::class_<GraphBuilder>(m, "GraphBuilder");
|
||||
py::class_<GraphBuilderObj>(m, "GraphBuilderObj")
|
||||
.def(py::init<Runtime>())
|
||||
.def("tensor",
|
||||
py::overload_cast<Shape, const std::string &>(
|
||||
&GraphBuilderObj::tensor),
|
||||
policy::reference_internal)
|
||||
.def("conv",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
|
||||
int, Tensor>(&GraphBuilderObj::conv),
|
||||
policy::reference_internal)
|
||||
.def("matmul",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, bool, bool>(
|
||||
&GraphBuilderObj::matmul),
|
||||
policy::reference_internal)
|
||||
.def("convTrans",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, int, int, int, int, int,
|
||||
int, int, int, int, Tensor, ActType>(
|
||||
&GraphBuilderObj::convTrans),
|
||||
policy::reference_internal)
|
||||
.def("g2bmm",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, const int, const int,
|
||||
Tensor, ActType>(&GraphBuilderObj::g2bmm),
|
||||
policy::reference_internal)
|
||||
.def("gbmml",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, const int, Tensor,
|
||||
ActType>(&GraphBuilderObj::gbmml),
|
||||
policy::reference_internal)
|
||||
|
||||
.def("pad",
|
||||
py::overload_cast<Tensor, Tensor, const vector<int> &,
|
||||
const optional<const vector<int>> &>(
|
||||
&GraphBuilderObj::pad),
|
||||
policy::reference_internal)
|
||||
.def("slice",
|
||||
py::overload_cast<Tensor, Tensor, const vector<int> &,
|
||||
const vector<int> &,
|
||||
const optional<const vector<int>> &,
|
||||
const optional<const vector<int>> &>(
|
||||
&GraphBuilderObj::slice),
|
||||
policy::reference_internal)
|
||||
.def(
|
||||
"concat",
|
||||
py::overload_cast<TensorVec, Tensor, int>(&GraphBuilderObj::concat),
|
||||
policy::reference_internal)
|
||||
.def("split",
|
||||
py::overload_cast<Tensor, std::optional<TensorVec>, int, int>(
|
||||
&GraphBuilderObj::split),
|
||||
policy::reference_internal)
|
||||
.def("transpose",
|
||||
py::overload_cast<Tensor, Tensor, const vector<int> &>(
|
||||
&GraphBuilderObj::transpose),
|
||||
policy::reference_internal)
|
||||
.def("extend",
|
||||
py::overload_cast<Tensor, Tensor, int, int>(
|
||||
&GraphBuilderObj::extend),
|
||||
policy::reference_internal)
|
||||
.def("maxpool",
|
||||
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
|
||||
int, int>(&GraphBuilderObj::maxpool),
|
||||
policy::reference_internal)
|
||||
.def("avgpool",
|
||||
py::overload_cast<Tensor, Tensor, int, int, int, int, int, int,
|
||||
int, int>(&GraphBuilderObj::avgpool),
|
||||
policy::reference_internal)
|
||||
.def("add",
|
||||
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::add),
|
||||
policy::reference_internal)
|
||||
.def("sub",
|
||||
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::sub),
|
||||
policy::reference_internal)
|
||||
.def("mul",
|
||||
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::mul),
|
||||
policy::reference_internal)
|
||||
.def("div",
|
||||
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::div),
|
||||
policy::reference_internal)
|
||||
.def("pow",
|
||||
py::overload_cast<Tensor, Tensor, Tensor>(&GraphBuilderObj::pow),
|
||||
policy::reference_internal)
|
||||
.def("gather",
|
||||
py::overload_cast<Tensor, Tensor, Tensor, int>(
|
||||
&GraphBuilderObj::gather),
|
||||
policy::reference_internal)
|
||||
.def("reshape",
|
||||
py::overload_cast<Tensor, Tensor, const Shape &>(
|
||||
&GraphBuilderObj::reshape),
|
||||
policy::reference_internal)
|
||||
.def("flatten",
|
||||
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::flatten),
|
||||
policy::reference_internal)
|
||||
.def("identity",
|
||||
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::identity),
|
||||
policy::reference_internal)
|
||||
.def("softmax",
|
||||
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::softmax),
|
||||
policy::reference_internal)
|
||||
.def("relu", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::relu),
|
||||
policy::reference_internal)
|
||||
.def("sigmoid",
|
||||
py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::sigmoid),
|
||||
policy::reference_internal)
|
||||
.def("tanh", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::tanh),
|
||||
policy::reference_internal)
|
||||
.def("abs", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::abs),
|
||||
policy::reference_internal)
|
||||
.def("reduceMean",
|
||||
py::overload_cast<Tensor, Tensor, int>(
|
||||
&GraphBuilderObj::reduceMean),
|
||||
policy::reference_internal)
|
||||
.def("erf", py::overload_cast<Tensor, Tensor>(&GraphBuilderObj::erf),
|
||||
policy::reference_internal)
|
||||
.def("memBound",
|
||||
py::overload_cast<const TensorVec &, const TensorVec &,
|
||||
const std::vector<nnet::Tensor> &, nnet::Expr,
|
||||
double, std::string>(&GraphBuilderObj::memBound),
|
||||
policy::reference_internal);
|
||||
}
|
||||
|
||||
} // namespace infini
|
||||
|
||||
PYBIND11_MODULE(pyinfinitensor, m) { infini::register_operator_timer(m); }
|
||||
PYBIND11_MODULE(pyinfinitensor, m) {
|
||||
infini::register_operator_timer(m);
|
||||
infini::init_graph_builder(m);
|
||||
}
|
||||
|
|
|
@ -8,6 +8,8 @@
|
|||
namespace infini {
|
||||
|
||||
struct ConvCuDnnPerfRecordObj : public PerfRecordObj {
|
||||
int kernel =
|
||||
0; // 0 cudnnConvolutionForward, 1 cudnnConvolutionBiasActivationForward
|
||||
int algo = 0; // cudnnConvolutionFwdAlgo_t
|
||||
int mode = 1;
|
||||
size_t workspaceSize = 100000;
|
||||
|
@ -56,8 +58,6 @@ class convCudnn : public Kernel {
|
|||
const ConvCuDnnPerfRecord &record) const {
|
||||
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||
if (op->getInputs().size() > 2) // Bias is not supported yet
|
||||
IT_TODO_HALT();
|
||||
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||
|
||||
|
@ -209,6 +209,36 @@ class convCudnn : public Kernel {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool cuDNNfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
|
||||
const CudaRuntimeObj *context) const {
|
||||
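// Single fused conv+bias+activation launch using the algorithm and
// workspace size stored in the tuned perf record; returns false if cuDNN
// rejects the configuration.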
cudnnStatus_t stat;
|
||||
|
||||
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
|
||||
convDesc, actDesc, outDesc] =
|
||||
createCuDNNDescriptor(op, record);
|
||||
size_t wsSize = record->workspaceSize;
|
||||
CudaPtr wsData = context->getWorkspace(wsSize);
|
||||
void *const biasData = op->getBias()->getRawDataPtr<void *>();
float alpha = 1.f, beta = 0.f;
|
||||
|
||||
// w/ bias & act
|
||||
stat = cudnnConvolutionBiasActivationForward(
|
||||
context->cudnnHandle(), &alpha, inDesc, inData, knDesc, knData,
|
||||
convDesc, ALGOS[record->algo], wsData, wsSize, &beta, outDesc,
|
||||
outData, biasDesc, biasData, actDesc, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
return false;
|
||||
|
||||
// Destroying descriptors in CUDA does not require a sync, but cuDNN does
|
||||
// not state whether a sync is required before the destroy calls.
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
|
||||
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
return true;
|
||||
}
|
||||
|
||||
void compute(const Operator &op, const RuntimeObj *context) const override {
|
||||
auto record = make_ref<ConvCuDnnPerfRecordObj>(); // with parameters in
|
||||
// default ctor
|
||||
|
@ -217,10 +247,88 @@ class convCudnn : public Kernel {
|
|||
|
||||
PerfRecord tune(const Operator &_op,
|
||||
const RuntimeObj *_context) const override {
|
||||
ConvCuDnnPerfRecordObj ret;
|
||||
ret.time = std::numeric_limits<double>::max();
|
||||
auto context = dynamic_cast<const CudaRuntimeObj *>(_context);
|
||||
auto op = as<ConvObj>(_op);
|
||||
printf("%s\n", op->toString().c_str());
|
||||
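// Convolutions with a bias or a fused activation are tuned against the
// cudnnConvolutionBiasActivationForward path; plain convolutions keep the
// original cudnnConvolutionForward tuning loop.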
if (op->hasBias() || op->getAct() != ActType::None)
|
||||
return tuneFused(op, context);
|
||||
else
|
||||
return tuneUnfused(op, context);
|
||||
}
|
||||
|
||||
PerfRecord tuneFused(const Ref<ConvObj> &op,
|
||||
const CudaRuntimeObj *context) const {
|
||||
ConvCuDnnPerfRecordObj ret;
|
||||
ret.time = std::numeric_limits<double>::max();
|
||||
|
||||
// Both modes have the same performance. Only run cross-correlation.
|
||||
for (int mode = 1; mode < 2; mode++) {
|
||||
// Try every possible algorithm of convolution
|
||||
for (int algo = 0; algo < N_ALGO; algo++) {
|
||||
auto recordRef = make_ref<ConvCuDnnPerfRecordObj>();
|
||||
auto &record = *recordRef;
|
||||
record.mode = mode;
|
||||
record.algo = algo;
|
||||
cudnnStatus_t stat;
|
||||
const auto &[inData, knData, outData, inDesc, knDesc, biasDesc,
|
||||
convDesc, actDesc, outDesc] =
|
||||
createCuDNNDescriptor(op, recordRef);
|
||||
void *biasData = op->getBias()->getRawDataPtr<void *>();
|
||||
|
||||
// get workspace
|
||||
stat = cudnnGetConvolutionForwardWorkspaceSize(
|
||||
context->cudnnHandle(), inDesc, knDesc, convDesc, outDesc,
|
||||
ALGOS[record.algo], &record.workspaceSize);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
continue;
|
||||
if (record.workspaceSize > context->getWorkspaceSize())
|
||||
continue;
|
||||
CudaPtr wsData = context->getWorkspace(record.workspaceSize);
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
|
||||
stat = cudnnConvolutionBiasActivationForward(
|
||||
context->cudnnHandle(), &alpha, inDesc, inData, knDesc,
|
||||
knData, convDesc, ALGOS[record.algo], wsData,
|
||||
record.workspaceSize, &beta, outDesc, outData, biasDesc,
|
||||
biasData, actDesc, outDesc, outData);
|
||||
if (stat != CUDNN_STATUS_SUCCESS)
|
||||
continue;
|
||||
record.time = timeit(
|
||||
[&]() {
|
||||
stat = cudnnConvolutionBiasActivationForward(
|
||||
context->cudnnHandle(), &alpha, inDesc, inData,
|
||||
knDesc, knData, convDesc, ALGOS[record.algo],
|
||||
wsData, record.workspaceSize, &beta, outDesc,
|
||||
outData, biasDesc, biasData, actDesc, outDesc,
|
||||
outData);
|
||||
},
|
||||
[&]() { context->sync(); });
|
||||
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
|
||||
record.time);
|
||||
|
||||
// Update the tune result
|
||||
if (ret.time > record.time)
|
||||
ret = record;
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(outDesc));
|
||||
checkCudnnError(cudnnDestroyActivationDescriptor(actDesc));
|
||||
checkCudnnError(cudnnDestroyConvolutionDescriptor(convDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(biasDesc));
|
||||
checkCudnnError(cudnnDestroyFilterDescriptor(knDesc));
|
||||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
}
|
||||
}
|
||||
printf("tuneFused: the best algo is %d, the best conv mode is %d\n",
|
||||
ret.algo, ret.mode);
|
||||
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
|
||||
"algorithm "
|
||||
"found");
|
||||
return make_ref<ConvCuDnnPerfRecordObj>(ret);
|
||||
}
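Both tuners above share the same measurement pattern: launch once to absorb lazy initialization, then time a batch of launches bracketed by a device sync. A minimal standalone sketch of that pattern follows; the helper name measureMs and the round count are illustrative and are not the repository's timeit helper.

#include <chrono>
#include <functional>

// Hypothetical helper mirroring how tuneFused/tuneUnfused time each algorithm:
// run the candidate `rounds` times, synchronize, and report the mean in ms.
inline double measureMs(const std::function<void()> &run,
                        const std::function<void()> &sync, int rounds = 10) {
    run();  // warm-up launch, excluded from the measurement
    sync();
    auto st = std::chrono::steady_clock::now();
    for (int i = 0; i < rounds; ++i)
        run();
    sync();  // wait for all queued launches before stopping the clock
    auto ed = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(ed - st).count() / rounds;
}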
|
||||
|
||||
PerfRecord tuneUnfused(const Ref<ConvObj> &op,
|
||||
const CudaRuntimeObj *context) const {
|
||||
ConvCuDnnPerfRecordObj ret;
|
||||
ret.time = std::numeric_limits<double>::max();
|
||||
// Both modes have the same performance. Only run cross-correlation.
|
||||
for (int mode = 1; mode < 2; mode++) {
|
||||
// Try every possible algorithm of convolution
|
||||
|
@ -260,7 +368,8 @@ class convCudnn : public Kernel {
|
|||
&beta, outDesc, outData);
|
||||
},
|
||||
[&]() { context->sync(); });
|
||||
// printf("mode:%d algo:%d :%.8lf\n", mode, algo, record.time);
|
||||
printf("mode %d, algo %d, time %.3lf ms\n", mode, algo,
|
||||
record.time);
|
||||
|
||||
// Update the tune result
|
||||
if (ret.time > record.time)
|
||||
|
@ -273,8 +382,8 @@ class convCudnn : public Kernel {
|
|||
checkCudnnError(cudnnDestroyTensorDescriptor(inDesc));
|
||||
}
|
||||
}
|
||||
// printf("the best algo is %d, the best conv mode is %d\n", ret.algo,
|
||||
// ret.mode);
|
||||
printf("tuneUnfused: the best algo is %d, the best conv mode is %d\n",
|
||||
ret.algo, ret.mode);
|
||||
IT_ASSERT(ret.time < std::numeric_limits<double>::max(), "No valid "
|
||||
"algorithm "
|
||||
"found");
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
#include "core/common.h"
|
||||
#include "core/constants.h"
|
||||
#include "cuda/cuda_common.h"
|
||||
#include <math.h>
|
||||
|
|
|
@ -5,15 +5,15 @@ namespace infini {
|
|||
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
|
||||
int ph, int pw, int sh, int sw, int dh, int dw,
|
||||
const Tensor &inputInConvFWD,
|
||||
const Tensor &weightInConvFWD)
|
||||
const Tensor &weightInConvFWD, const ActType act)
|
||||
: OperatorObj(opType, inputs, {output}), ph(ph), pw(pw), sh(sh), sw(sw),
|
||||
dh(dh), dw(dw), padding(PaddingMode::Other) {}
|
||||
dh(dh), dw(dw), padding(PaddingMode::Other), act(act) {}
|
||||
ConvBaseObj::ConvBaseObj(OpType opType, TensorVec inputs, Tensor &output,
|
||||
PaddingMode mode, int sh, int sw, int dh, int dw,
|
||||
const Tensor &inputInConvFWD,
|
||||
const Tensor &weightInConvFWD)
|
||||
const Tensor &weightInConvFWD, const ActType act)
|
||||
: OperatorObj(opType, inputs, {output}), ph(-1), pw(-1), sh(sh), sw(sw),
|
||||
dh(dh), dw(dw), padding(mode) {
|
||||
dh(dh), dw(dw), padding(mode), act(act) {
|
||||
IT_ASSERT(mode != PaddingMode::Other);
|
||||
}
|
||||
|
||||
|
@ -21,28 +21,60 @@ string ConvBaseObj::toString() const {
|
|||
std::ostringstream os;
|
||||
os << OpRegistry::getOpName(getOpType()) << "[" << getGuid() << "]";
|
||||
os << "(";
|
||||
if (inputs.size() == 2) {
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << vecToString(inputs[1]->getDims()) << ",";
|
||||
if (inputs.size() > 2) {
|
||||
os << vecToString(inputs[2]->getDims()) << ",";
|
||||
}
|
||||
os << "p=[" << ph << "," << pw << "],";
|
||||
os << "s=[" << sh << "," << sw << "],";
|
||||
os << "d=[" << dh << "," << dw << "],";
|
||||
os << "act=" << enum_to_underlying(getAct()) << ",";
|
||||
// os << "act=" << enum_to_underlying(act) << ",";
|
||||
os << "input=" << inputs[0]->getGuid() << ",";
|
||||
os << "weight=" << inputs[1]->getGuid() << ",";
|
||||
os << "bias="
|
||||
<< ((inputs.size() == 2) ? "nullptr"
|
||||
: std::to_string(inputs[2]->getGuid()))
|
||||
<< ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
vector<int> ConvBaseObj::getWorkloadVector() const {
|
||||
return {
|
||||
enum_to_underlying(type), n, c, h, w, f, r, s, ph, pw, sh, sw, dh, dw};
|
||||
return {enum_to_underlying(type),
|
||||
n,
|
||||
c,
|
||||
h,
|
||||
w,
|
||||
f,
|
||||
r,
|
||||
s,
|
||||
ph,
|
||||
pw,
|
||||
sh,
|
||||
sw,
|
||||
dh,
|
||||
dw,
|
||||
hasBias(),
|
||||
enum_to_underlying(getAct())};
|
||||
}
|
||||
|
||||
vector<int> ConvBaseObj::getOpAttrVector() const {
|
||||
IT_TODO_HALT(); // should padding mode / ph+pw be in attrs?
|
||||
return {enum_to_underlying(type), c, f, r, s, ph, pw, sh, sw, dh, dw};
|
||||
return {enum_to_underlying(type),
|
||||
c,
|
||||
f,
|
||||
r,
|
||||
s,
|
||||
ph,
|
||||
pw,
|
||||
sh,
|
||||
sw,
|
||||
dh,
|
||||
dw,
|
||||
hasBias(),
|
||||
enum_to_underlying(getAct())};
|
||||
}
|
||||
|
||||
void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
|
||||
|
@ -64,11 +96,10 @@ void ConvObj::setAuxilaryAttributes(PaddingMode mode) {
|
|||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
|
||||
ActType act)
|
||||
: ConvBaseObj(OpType::Conv, {input, weight}, output, ph, pw, sh, sw, dh, dw,
|
||||
input, weight),
|
||||
act(act) {
|
||||
if (bias)
|
||||
IT_TODO_HALT();
|
||||
: ConvBaseObj(OpType::Conv,
|
||||
((bias) ? (TensorVec{input, weight, bias})
|
||||
: (TensorVec{input, weight})),
|
||||
output, ph, pw, sh, sw, dh, dw, input, weight, act) {
|
||||
setAuxilaryAttributes(PaddingMode::Other);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
@ -76,11 +107,10 @@ ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
|||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
|
||||
ActType act)
|
||||
: ConvBaseObj(OpType::Conv, {input, weight}, output, mode, sh, sw, dh, dw,
|
||||
input, weight),
|
||||
act(act) {
|
||||
if (bias)
|
||||
IT_TODO_HALT();
|
||||
: ConvBaseObj(OpType::Conv,
|
||||
((bias) ? (TensorVec{input, weight, bias})
|
||||
: (TensorVec{input, weight})),
|
||||
output, mode, sh, sw, dh, dw, input, weight, act) {
|
||||
setAuxilaryAttributes(mode);
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
@ -98,6 +128,11 @@ optional<vector<Shape>> ConvObj::inferShape(const TensorVec &inputs) const {
|
|||
// For NCHW+FCRS layout, C of input is divisible by C of weight
|
||||
if (input->getDims()[1] % weight->getDims()[1] != 0)
|
||||
return {};
|
||||
// check bias shape
|
||||
if (inputs.size() == 3) {
|
||||
if (inputs[2]->size() != (size_t)f)
|
||||
return {};
|
||||
}
|
||||
// Set padding size
|
||||
if (padding == PaddingMode::Other) {
|
||||
oh = (h - (r - sh) * dh + ph * 2) / sh;
|
||||
|
@ -122,8 +157,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
|
|||
int oph, int opw, int group,
|
||||
Tensor bias, ActType act)
|
||||
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, ph, pw, sh, sw,
|
||||
dh, dw, output, weight),
|
||||
oph(oph), opw(opw), group(group), act(act) {
|
||||
dh, dw, output, weight, act),
|
||||
oph(oph), opw(opw), group(group) {
|
||||
if (bias)
|
||||
IT_TODO_HALT();
|
||||
setAuxilaryAttributes(PaddingMode::Other);
|
||||
|
@ -136,8 +171,8 @@ ConvTransposed2dObj::ConvTransposed2dObj(GraphObj *graph, Tensor input,
|
|||
int dh, int dw, int oph, int opw,
|
||||
int group, Tensor bias, ActType act)
|
||||
: ConvBaseObj(OpType::ConvTrans, {input, weight}, output, mode, sh, sw, dh,
|
||||
dw, output, weight),
|
||||
oph(oph), opw(opw), group(group), act(act) {
|
||||
dw, output, weight, act),
|
||||
oph(oph), opw(opw), group(group) {
|
||||
if (bias)
|
||||
IT_TODO_HALT();
|
||||
setAuxilaryAttributes(mode);
|
||||
|
@ -156,6 +191,8 @@ ConvTransposed2dObj::inferShape(const TensorVec &inputs) const {
|
|||
auto s = weight->getDims()[3];
|
||||
if (f != weight->getDims()[0])
|
||||
return {};
|
||||
if (inputs.size() != 2)
|
||||
IT_TODO_HALT();
|
||||
|
||||
int on = n, oc = c * group;
|
||||
int oh = 0, ow = 0;
|
||||
|
|
|
@ -11,6 +11,7 @@ optional<vector<Shape>>
|
|||
ElementWiseObj::inferShape(const TensorVec &inputs) const {
|
||||
// For now, we only process the same dims here; broadcast will be considered
|
||||
// in the opt layer.
|
||||
std::cout << std::endl;
|
||||
const auto A = inputs[0], B = inputs[1];
|
||||
if (A->getDims().size() != B->getDims().size() ||
|
||||
A->getDims() != B->getDims())
|
||||
|
|
|
@ -12,6 +12,7 @@ ReduceMeanObj::ReduceMeanObj(GraphObj *graph, Tensor input, Tensor output,
|
|||
int idx = (*_axis)[j];
|
||||
if (idx < 0)
|
||||
IT_TODO_HALT();
|
||||
std::cout << idx << " " << input->getDims().size() << std::endl;
|
||||
IT_ASSERT((size_t)idx < input->getDims().size());
|
||||
axis.emplace(idx);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
#include "operators/transpose.h"
|
||||
|
||||
namespace infini {
|
||||
TransposeObj::TransposeObj(GraphObj *graph, Tensor input, Tensor output,
|
||||
const Shape &perm)
|
||||
: OperatorObj(OpType::Transpose, {input}, {output}), perm(perm) {
|
||||
IT_ASSERT(checkValid(graph));
|
||||
}
|
||||
|
||||
optional<vector<Shape>>
|
||||
TransposeObj::inferShape(const TensorVec &inputs) const {
|
||||
const Shape &dimsIn = inputs[0]->getDims();
|
||||
Shape dimsOut;
|
||||
std::unordered_set<size_t> dimSet;
|
||||
for (size_t i = 0; i < perm.size(); ++i) {
|
||||
if (size_t(perm[i]) >= dimsIn.size() ||
|
||||
dimSet.find(perm[i]) != dimSet.end()) {
|
||||
std::cout << i << " " << perm[i] << " "
|
||||
<< int(dimSet.find(perm[i]) != dimSet.end()) << std::endl;
|
||||
return {};
|
||||
}
|
||||
dimsOut.emplace_back(dimsIn[perm[i]]);
|
||||
dimSet.emplace(perm[i]);
|
||||
}
|
||||
std::cout << "transpose Ok" << std::endl;
|
||||
return {{dimsOut}};
|
||||
}
|
||||
|
||||
std::string TransposeObj::toString() const {
|
||||
std::ostringstream os;
|
||||
os << "Transpose[" << getGuid() << "]";
|
||||
os << "(";
|
||||
os << vecToString(inputs[0]->getDims()) << ",";
|
||||
os << "perm=" << vecToString(perm) << ",";
|
||||
os << "input=" << inputs[0]->getGuid() << ",";
|
||||
os << "output=" << outputs[0]->getGuid() << ")";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
vector<int> TransposeObj::getWorkloadVector() const {
|
||||
vector<int> ret = inputs[0]->getDims();
|
||||
ret.insert(ret.end(), perm.begin(), perm.end());
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
vector<int> TransposeObj::getOpAttrVector() const {
|
||||
vector<int> ret = perm;
|
||||
ret.emplace(ret.begin(), enum_to_underlying(type));
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace infini
|
|
@ -0,0 +1,6 @@
|
|||
project(PowerFusion)

aux_source_directory(. SRC_CXX)
aux_source_directory(micro_kernel/. SRC_CXX)
cuda_add_library(pf_base SHARED ${SRC_CXX})
set_target_properties(pf_base PROPERTIES CUDA_ARCHITECTURES "80")
|
|
@ -0,0 +1,27 @@
|
|||
#include <iostream>
|
||||
|
||||
#include "code.h"
|
||||
#include "memory_operator.h"
|
||||
|
||||
namespace memb {
|
||||
std::string MemoryOperator::generate() {
|
||||
CodeBuffer buf;
|
||||
std::string inst;
|
||||
buf.emit("#pragma unroll");
|
||||
buf.emit("for (int inst_idx = 0; inst_idx < " + num + "; inst_idx++) {");
|
||||
if (opType == READ) {
|
||||
inst = "reg[inst_idx] = " + ptr.base_ptr + "[" + ptr.offset + " + " +
|
||||
offset + "];";
|
||||
buf.emit(inst);
|
||||
} else if (opType == WRITE) {
|
||||
inst = ptr.base_ptr + "[" + ptr.offset + " + " + offset +
|
||||
"] = " + "reg[inst_idx];";
|
||||
buf.emit(inst);
|
||||
} else {
|
||||
std::cout << "[ERROR]" << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
buf.emit("}");
|
||||
return buf.toString();
|
||||
}
|
||||
} // namespace memb
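To make the string assembly above concrete, here is roughly what MemoryOperator::generate emits for a READ whose num is "8", whose ptr is Ptr("src", "parallel_idx * 1024") and whose offset is "inst_idx * 32 + lane_id"; all values are illustrative, not verbatim output.

#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++) {
    reg[inst_idx] = src[parallel_idx * 1024 + inst_idx * 32 + lane_id];
}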
|
|
@ -0,0 +1,90 @@
|
|||
#include "codegen.h"
|
||||
#include "memory_operator.h"
|
||||
#include "micro_kernel/element.h"
|
||||
#include "micro_kernel/transpose.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
std::string gen_relu(int dimn, int dimx, int dimy, int nblocks, int nthreads) {
|
||||
std::string code = "\n";
|
||||
|
||||
MicroKernelElement relu("relu", "return element > 0 ? element : 0");
|
||||
code += relu.gen_func();
|
||||
code += "\n";
|
||||
|
||||
code += "// Kernel\n";
|
||||
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
|
||||
code += "int lane_id = threadIdx.x % 32;\n";
|
||||
code += "int warp_id = threadIdx.x / 32;\n";
|
||||
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
|
||||
" + warp_id;\n";
|
||||
code += "float reg[32];\n";
|
||||
code += "__shared__ float smem[32 * 32 * 2 * " +
|
||||
std::to_string(nthreads / 32) + "];\n";
|
||||
|
||||
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
|
||||
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
|
||||
"; loop_idx++) {\n";
|
||||
|
||||
MemoryOperator dram_read;
|
||||
dram_read.memoryType = MemoryOperator::DRAM;
|
||||
dram_read.opType = MemoryOperator::READ;
|
||||
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
|
||||
dram_read.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
dram_read.offset =
|
||||
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
|
||||
std::to_string(dimy) + ")";
|
||||
dram_read.reg = "inst_idx";
|
||||
code += dram_read.generate();
|
||||
|
||||
MemoryOperator sram_write;
|
||||
sram_write.memoryType = MemoryOperator::SRAM;
|
||||
sram_write.opType = MemoryOperator::WRITE;
|
||||
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
|
||||
sram_write.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
sram_write.offset = "inst_idx * 32 + lane_id";
|
||||
sram_write.reg = "inst_idx";
|
||||
code += sram_write.generate();
|
||||
|
||||
code += relu.gen_kernel(Ptr("smem", "warp_id * 32 * 32 * 2"),
|
||||
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"), 32,
|
||||
32, "32");
|
||||
|
||||
MemoryOperator sram_read;
|
||||
sram_read.memoryType = MemoryOperator::SRAM;
|
||||
sram_read.opType = MemoryOperator::READ;
|
||||
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
|
||||
sram_read.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
sram_read.offset = "inst_idx * 32 + lane_id";
|
||||
sram_read.reg = "inst_idx";
|
||||
code += sram_read.generate();
|
||||
|
||||
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
|
||||
code += "}\n";
|
||||
code += "}\n\n";
|
||||
|
||||
code += "void transpose(float *src, float *dst) {\n";
|
||||
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
|
||||
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
|
||||
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
|
||||
code += "cudaCheckError();\n";
|
||||
code += "}\n";
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,93 @@
|
|||
#include "codegen.h"
|
||||
#include "memory_operator.h"
|
||||
#include "micro_kernel/transpose.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
std::string header =
|
||||
std::string("// Header code\n") + std::string("#include \"cuda.h\"\n") +
|
||||
std::string("#include \"cuda_utils.h\"\n\n") +
|
||||
std::string("#define ROUND_UP(n, b) (((n) - 1) / (b) + 1)\n") +
|
||||
std::string("#define INDEX(x, y, n) ((x) * (n) + (y))\n") +
|
||||
std::string("#define MIN(x, y) ((x) < (y) ? (x) : (y))\n\n");
|
||||
|
||||
std::string gen_lowest_basic(int dimn, int dimx, int dimy, int nblocks,
|
||||
int nthreads) {
|
||||
std::string code = header;
|
||||
code += "// Kernel\n";
|
||||
code += "__global__ void kernel_tmp(float *src, float *dst) {\n";
|
||||
code += "int lane_id = threadIdx.x % 32;\n";
|
||||
code += "int warp_id = threadIdx.x / 32;\n";
|
||||
code += "int parallel_idx = blockIdx.x * " + std::to_string(nthreads / 32) +
|
||||
" + warp_id;\n";
|
||||
code += "float reg[32];\n";
|
||||
code += "__shared__ float smem[32 * 32 * 2 * " +
|
||||
std::to_string(nthreads / 32) + "];\n";
|
||||
|
||||
int loop = ((dimx - 1) / 32 + 1) * ((dimy - 1) / 32 + 1);
|
||||
code += "for (int loop_idx = 0; loop_idx < " + std::to_string(loop) +
|
||||
"; loop_idx++) {\n";
|
||||
|
||||
MemoryOperator dram_read;
|
||||
dram_read.memoryType = MemoryOperator::DRAM;
|
||||
dram_read.opType = MemoryOperator::READ;
|
||||
dram_read.ptr = Ptr("src", "parallel_idx * " + std::to_string(dimx * dimy));
|
||||
dram_read.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
dram_read.offset =
|
||||
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
|
||||
std::to_string(dimy) + ")";
|
||||
dram_read.reg = "inst_idx";
|
||||
code += dram_read.generate();
|
||||
|
||||
MemoryOperator sram_write;
|
||||
sram_write.memoryType = MemoryOperator::SRAM;
|
||||
sram_write.opType = MemoryOperator::WRITE;
|
||||
sram_write.ptr = Ptr("smem", "warp_id * 32 * 32 * 2");
|
||||
sram_write.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
sram_write.offset = "inst_idx * 32 + lane_id";
|
||||
sram_write.reg = "inst_idx";
|
||||
code += sram_write.generate();
|
||||
|
||||
MicroKernelTranspose transpose;
|
||||
code += transpose.generate(Ptr("smem", "warp_id * 32 * 32 * 2"),
|
||||
Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32"),
|
||||
32, "32", 32, "32");
|
||||
|
||||
MemoryOperator sram_read;
|
||||
sram_read.memoryType = MemoryOperator::SRAM;
|
||||
sram_read.opType = MemoryOperator::READ;
|
||||
sram_read.ptr = Ptr("smem", "warp_id * 32 * 32 * 2 + 32 * 32");
|
||||
sram_read.num =
|
||||
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
|
||||
sram_read.offset = "inst_idx * 32 + lane_id";
|
||||
sram_read.reg = "inst_idx";
|
||||
code += sram_read.generate();
|
||||
|
||||
MemoryOperator dram_write;
dram_write.memoryType = MemoryOperator::DRAM;
dram_write.opType = MemoryOperator::WRITE;
dram_write.ptr = Ptr("dst", "parallel_idx * " + std::to_string(dimx) +
" * " + std::to_string(dimy));
dram_write.num =
"MIN(32, " + std::to_string(dimx) + " - loop_idx / 32 * 32)";
dram_write.offset =
"INDEX(loop_idx / 32 + inst_idx, loop_idx % 32 + lane_id, " +
std::to_string(dimy) + ")";
dram_write.reg = "inst_idx";
code += dram_write.generate();
|
||||
code += "}\n";
|
||||
code += "}\n\n";
|
||||
|
||||
code += "void transpose(float *src, float *dst) {\n";
|
||||
code += "dim3 gridDim(" + std::to_string(nblocks) + ", 1);";
|
||||
code += "dim3 blockDim(" + std::to_string(nthreads) + ", 1);";
|
||||
code += "kernel_tmp<<<gridDim, blockDim>>>(src, dst);\n";
|
||||
code += "cudaCheckError();\n";
|
||||
code += "}\n";
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,302 @@
|
|||
#include "pfusion/instantiate.h"
|
||||
#include "pfusion/meta_op.h"
|
||||
#include "pfusion/micro_kernel/binary.h"
|
||||
#include "pfusion/micro_kernel/memory.h"
|
||||
#include "pfusion/micro_kernel/unary.h"
|
||||
#include "pfusion/micro_op.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
size_t getSize(const std::vector<size_t> &shape) {
|
||||
size_t size = 1;
|
||||
for (auto x : shape) {
|
||||
size *= x;
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
size_t min(size_t a, size_t b) { return (a < b) ? a : b; }
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateUnary(const OpType opType,
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape) {
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
size_t size = getSize(shape);
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = size / 32 / 8;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 8;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 8;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ,
|
||||
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"), buf,
|
||||
8, 32));
|
||||
metaOp->microOps.emplace_back(
|
||||
std::make_shared<UnaryOp>(opType, buf, buf, 8, 32));
|
||||
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
WRITE, buf,
|
||||
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"), 8,
|
||||
32));
|
||||
|
||||
metaOps.emplace_back(metaOp);
|
||||
return metaOps;
|
||||
}
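A minimal sketch of what this unary meta-op lowers to inside one warp, assuming the 8-register / 32-lane tiling configured above; buf, offset_src and lane_id follow the names the code generator emits, the pointer names src/dst are illustrative, and the relu body is just one possible opType.

// Per loop_idx, each warp moves 8 * 32 contiguous floats through registers.
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++)
    buf[inst_idx] = src[offset_src + inst_idx * 32 + lane_id];   // READ micro-op
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++)
    buf[inst_idx] = buf[inst_idx] > 0 ? buf[inst_idx] : 0;       // UnaryOp (relu)
#pragma unroll
for (int inst_idx = 0; inst_idx < 8; inst_idx++)
    dst[offset_src + inst_idx * 32 + lane_id] = buf[inst_idx];   // WRITE micro-op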
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateBinary(const OpType opType,
|
||||
std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape) {
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
size_t size = getSize(shape);
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = size / 32 / 8;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 8;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 24;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("src"), std::vector<size_t>({32 * 8, size / 32 / 8}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
|
||||
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
|
||||
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ,
|
||||
Pointer::buildPtr(ptrs[0], "offset_src + inst_idx * 32 + lane_id"),
|
||||
buf0, 8, 32));
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ,
|
||||
Pointer::buildPtr(ptrs[1], "offset_src + inst_idx * 32 + lane_id"),
|
||||
buf1, 8, 32));
|
||||
metaOp->microOps.emplace_back(
|
||||
std::make_shared<BinaryOp>(opType, buf0, buf1, buf2, 8, 32));
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
WRITE, buf2,
|
||||
Pointer::buildPtr(ptrs[2], "offset_src + inst_idx * 32 + lane_id"), 8,
|
||||
32));
|
||||
|
||||
metaOps.emplace_back(metaOp);
|
||||
return metaOps;
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>> instantiateTranspose(
|
||||
const OpType opType, std::vector<std::shared_ptr<Pointer>> ptrs,
|
||||
const std::vector<size_t> &shape, const std::vector<size_t> &perm) {
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
|
||||
size_t size = 1;
|
||||
for (auto x : shape) {
|
||||
size *= x;
|
||||
}
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
|
||||
std::vector<size_t> srcShape(shape.size());
|
||||
std::vector<size_t> srcMap;
|
||||
size_t parallelSize = 1;
|
||||
for (size_t i = 0; i < shape.size(); i++) {
|
||||
srcShape[perm[i]] = shape[i];
|
||||
if (i != 0 && perm[i] != 0) {
|
||||
srcMap.emplace_back(perm[i]);
|
||||
parallelSize *= shape[i];
|
||||
}
|
||||
}
|
||||
metaOp->mappings.emplace_back(
|
||||
TensorMapping::buildWithMap("src", srcShape, srcMap));
|
||||
|
||||
std::vector<size_t> dstMap;
|
||||
for (size_t i = 0; i < shape.size(); i++) {
|
||||
if (i != 0 && perm[i] != 0) {
|
||||
dstMap.emplace_back(i);
|
||||
}
|
||||
}
|
||||
metaOp->mappings.emplace_back(
|
||||
TensorMapping::buildWithMap("dst", shape, dstMap));
|
||||
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = parallelSize;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 8;
|
||||
metaOp->numLanes = 32;
|
||||
|
||||
int numTileA = (shape[perm[0]] - 1) / 32 + 1;
|
||||
int numTileB = (shape[0] - 1) / 32 + 1;
|
||||
// std::cout << numTileA << " " << numTileB << std::endl;
|
||||
|
||||
metaOp->numReg = 32;
|
||||
metaOp->numSmem = 32 * 33;
|
||||
|
||||
size_t stride_src = 1, stride_dst = 1;
|
||||
for (size_t i = 0; i < shape.size(); i++) {
|
||||
if (perm[i] < perm[0]) {
|
||||
stride_src *= shape[i];
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; perm[i] != 0; i++) {
|
||||
stride_dst *= shape[i];
|
||||
}
|
||||
|
||||
// TODO: tiling is a metaOp or microOps?
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto smem =
|
||||
Pointer::buildPtr(SRAM, "smem",
|
||||
"group_id * " + std::to_string(metaOp->numLanes) +
|
||||
" * " + std::to_string(metaOp->numLanes + 1));
|
||||
auto buf = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
|
||||
for (int i = 0; i < numTileA; i++) {
|
||||
for (int j = 0; j < numTileB; j++) {
|
||||
auto src_ptr = Pointer::buildPtr(
|
||||
ptrs[0], "offset_src + " +
|
||||
std::to_string(j * 32 * stride_src + i * 32) +
|
||||
"+" + "inst_idx * " + std::to_string(stride_src) +
|
||||
" + lane_id");
|
||||
metaOp->microOps.emplace_back(
|
||||
MemoryOp::build(READ, src_ptr, buf, min(32u, shape[0]),
|
||||
min(32, shape[perm[0]])));
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
WRITE, buf, Pointer::buildPtr(smem, "inst_idx * 33 + lane_id"),
|
||||
min(32, shape[0]), min(32, shape[perm[0]])));
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ, Pointer::buildPtr(smem, "lane_id * 33 + inst_idx"), buf,
|
||||
min(32, shape[perm[0]]), min(32, shape[0])));
|
||||
auto dst_ptr = Pointer::buildPtr(
|
||||
ptrs[1], "offset_dst + " +
|
||||
std::to_string(i * 32 * stride_dst + j * 32) +
|
||||
"+" + "inst_idx * " + std::to_string(stride_dst) +
|
||||
" + lane_id");
|
||||
metaOp->microOps.emplace_back(
|
||||
MemoryOp::build(WRITE, buf, dst_ptr, min(32, shape[perm[0]]),
|
||||
min(32, shape[0])));
|
||||
}
|
||||
}
|
||||
metaOps.emplace_back(metaOp);
|
||||
|
||||
return metaOps;
|
||||
}
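The 32 * 33 shared-memory size chosen above is the standard padded-tile transpose: each warp stages a 32x32 tile, and the extra column keeps the transposed read free of shared-memory bank conflicts. A condensed sketch of the four micro-ops built per (i, j) tile, with identifiers following the generated names and edge handling omitted:

// Stage the tile row-major, read it back column-major through padded smem.
buf[inst_idx] = src[offset_src + j * 32 * stride_src + i * 32
                    + inst_idx * stride_src + lane_id];      // DRAM -> reg
smem[inst_idx * 33 + lane_id] = buf[inst_idx];               // reg  -> smem (row)
buf[inst_idx] = smem[lane_id * 33 + inst_idx];               // smem -> reg (column)
dst[offset_dst + i * 32 * stride_dst + j * 32
    + inst_idx * stride_dst + lane_id] = buf[inst_idx];      // reg  -> DRAM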
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateGather(const OpType opType,
|
||||
const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape,
|
||||
const std::vector<size_t> &indexShape,
|
||||
const std::vector<size_t> &outputShape, const size_t axis) {
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
size_t par_size = 1;
|
||||
for (size_t i = 0; i < outputShape.size() - 1; i++) {
|
||||
par_size *= inputShape[i];
|
||||
}
|
||||
size_t seq_size = inputShape[outputShape.size() - 1];
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = par_size;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 2;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 24;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("src"), std::vector<size_t>({seq_size, par_size}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
|
||||
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
|
||||
|
||||
metaOps.emplace_back(metaOp);
|
||||
return metaOps;
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateReduce(const OpType opType,
|
||||
const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape, const size_t axis) {
|
||||
IT_ASSERT(axis == 0);
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
size_t par_size = 1;
|
||||
for (size_t i = 0; i < inputShape.size(); i++) {
|
||||
if (i != axis) {
|
||||
par_size *= inputShape[i];
|
||||
}
|
||||
}
|
||||
size_t seq_size = inputShape[axis];
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = par_size;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 1;
|
||||
metaOp->numLanes = 128;
|
||||
metaOp->numReg = inputShape[0] / 128;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("src"), std::vector<size_t>({seq_size, par_size}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
|
||||
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
|
||||
|
||||
metaOps.emplace_back(metaOp);
|
||||
return metaOps;
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<MetaOp>>
|
||||
instantiateBroadcast(const OpType opType,
|
||||
const std::vector<std::shared_ptr<Pointer>> &ptrs,
|
||||
const std::vector<size_t> &inputShape, const size_t axis,
|
||||
const size_t num) {
|
||||
std::vector<std::shared_ptr<MetaOp>> metaOps;
|
||||
size_t par_size = getSize(inputShape);
|
||||
size_t seq_size = num;
|
||||
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = par_size;
|
||||
metaOp->numBlocks = 108;
|
||||
metaOp->numGroups = 2;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 24;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("src"), std::vector<size_t>({seq_size, par_size}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = ptrs;
|
||||
auto buf0 = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf1 = Pointer::buildPtr(REG, "buf", "inst_idx + 8");
|
||||
auto buf2 = Pointer::buildPtr(REG, "buf", "inst_idx + 16");
|
||||
|
||||
metaOps.emplace_back(metaOp);
|
||||
return metaOps;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,257 @@
|
|||
#include "core/graph.h"
|
||||
#include "operators/extend.h"
|
||||
#include "operators/gather.h"
|
||||
#include "operators/reduce_mean.h"
|
||||
#include "operators/transpose.h"
|
||||
|
||||
#include "pfusion/instantiate.h"
|
||||
#include "pfusion/memory_codegen.h"
|
||||
#include "pfusion/pointer.h"
|
||||
#include "pfusion/search_graph.h"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
void exportCode(const std::string &filename, const std::string &code) {
|
||||
// check dir
|
||||
if (std::filesystem::exists("../generated_code")) {
|
||||
assert(std::filesystem::is_directory("../generated_code"));
|
||||
} else {
|
||||
auto ok = std::filesystem::create_directory("../generated_code");
|
||||
assert(ok);
|
||||
}
|
||||
std::string dir = "../generated_code/" + filename;
|
||||
std::ofstream fout(dir);
|
||||
assert(fout.is_open());
|
||||
fout << code;
|
||||
fout.close();
|
||||
system(std::string("clang-format -i " + dir).c_str());
|
||||
}
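A typical driver for this path builds the code in memory, writes it under ../generated_code and runs clang-format on the result; the calls, file names, shapes and permutation below are only examples and assume MemoryCodegen is default-constructible.

#include "pfusion/memory_codegen.h"

int main() {
    infini::MemoryCodegen codegen;
    // Hypothetical calls; buildTransposeOp expects a 3-D shape with perm[0] == 0.
    codegen.exportTranspose("transpose_32x64x128.cu", {32, 64, 128}, {0, 2, 1});
    codegen.exportBias("bias_512x1024.cu", {512, 1024});
    return 0;
}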
|
||||
|
||||
void infini::MemoryCodegen::exportGraph(Graph graph, std::string filename) {
|
||||
std::string code = generateGraph(graph);
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportBert_LN(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportBert_SM(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportBert_GELU(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportViT_LN(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportViT_SM(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportViT_GELU(const std::string &filename) {
|
||||
std::string code = "";
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportBias(const std::string &filename,
|
||||
const std::vector<size_t> &shape) {
|
||||
std::string code = generateBias(shape);
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
void infini::MemoryCodegen::exportTranspose(const std::string &filename,
|
||||
const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm) {
|
||||
std::string code = generateTranspose(shape, perm);
|
||||
exportCode(filename, code);
|
||||
}
|
||||
|
||||
std::vector<size_t> convertShape(const std::vector<int> &_shape) {
|
||||
std::vector<size_t> shape;
|
||||
for (int i = int(_shape.size()); i > 0; i--) {
|
||||
shape.emplace_back(_shape[i - 1]);
|
||||
}
|
||||
return shape;
|
||||
}
|
||||
|
||||
size_t convertIndex(const size_t idx, const size_t size) { return size - idx; }
|
||||
|
||||
std::vector<size_t> convertPerm(const std::vector<int> &_perm) {
|
||||
std::vector<size_t> perm;
|
||||
for (int i = int(_perm.size()); i > 0; i--) {
|
||||
perm.emplace_back(_perm.size() - _perm[i - 1] - 1);
|
||||
}
|
||||
return perm;
|
||||
}
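Both helpers flip from the framework's outermost-first layout to the codegen's innermost-first convention; a small sanity check of the expected results (the values are illustrative) reads:

#include <cassert>
#include <vector>

// Hypothetical check of the conversion helpers above.
void checkConvertHelpers() {
    // dims {2, 3, 4} (outermost first) become {4, 3, 2} (innermost first)
    assert(convertShape({2, 3, 4}) == std::vector<size_t>({4, 3, 2}));
    // perm {0, 2, 1} is remapped axis-by-axis as n - perm[i] - 1, collected in reverse
    assert(convertPerm({0, 2, 1}) == std::vector<size_t>({1, 0, 2}));
}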
|
||||
|
||||
void convertTranspose(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op) {
|
||||
searchGraph->addNode(memb::instantiateTranspose(
|
||||
memb::TRANSPOSE,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getOutputs()[0]->getDims()),
|
||||
convertPerm(infini::as<infini::TransposeObj>(op)->getPerm())));
|
||||
}
|
||||
|
||||
void convertUnary(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op, memb::OpType opType) {
|
||||
searchGraph->addNode(memb::instantiateUnary(
|
||||
opType,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getOutputs()[0]->getDims())));
|
||||
}
|
||||
|
||||
void convertBinary(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op, memb::OpType opType) {
|
||||
searchGraph->addNode(memb::instantiateBinary(
|
||||
opType,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getOutputs()[0]->getDims())));
|
||||
}
|
||||
void convertGather(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op) {
|
||||
searchGraph->addNode(memb::instantiateGather(
|
||||
memb::GATHER,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getInputs()[1]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getInputs()[0]->getDims()),
|
||||
convertShape(op->getInputs()[1]->getDims()),
|
||||
convertShape(op->getOutputs()[0]->getDims()),
|
||||
convertIndex(infini::as<infini::GatherObj>(op)->getAxis(),
|
||||
op->getInputs()[0]->getDims().size())));
|
||||
}
|
||||
void convertReduce(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op, memb::OpType opType) {
|
||||
auto reduceMeanOp = infini::as<infini::ReduceMeanObj>(op);
|
||||
int axis = -1,
|
||||
dimSize = int(reduceMeanOp->getInputs()[0]->getDims().size());
|
||||
for (int i = 0; i < dimSize; i++) {
|
||||
if (reduceMeanOp->isReduced(i)) {
|
||||
if (axis != -1) {
|
||||
IT_ASSERT(false);
|
||||
} else {
|
||||
axis = dimSize - i - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
IT_ASSERT(axis != -1);
|
||||
searchGraph->addNode(memb::instantiateReduce(
|
||||
opType,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getInputs()[0]->getDims()), axis));
|
||||
}
|
||||
|
||||
void convertBroadcast(std::shared_ptr<memb::SearchGraph> searchGraph,
|
||||
infini::Operator op) {
|
||||
auto extendOp = infini::as<infini::ExtendObj>(op);
|
||||
IT_ASSERT(op->getInputs()[0]->getDims()[extendOp->getDim()] == 1);
|
||||
searchGraph->addNode(memb::instantiateBroadcast(
|
||||
memb::BROADCAST,
|
||||
{memb::Pointer::buildPtrByTensorGuid(op->getInputs()[0]->getGuid()),
|
||||
memb::Pointer::buildPtrByTensorGuid(op->getOutputs()[0]->getGuid())},
|
||||
convertShape(op->getInputs()[0]->getDims()), extendOp->getDim(),
|
||||
extendOp->getNum() + 1));
|
||||
}
|
||||
|
||||
std::shared_ptr<memb::SearchGraph> instantiateGraph(infini::Graph graph) {
|
||||
auto searchGraph = std::make_shared<memb::SearchGraph>();
|
||||
std::unordered_map<int, int> opMap;
|
||||
int id = 0;
|
||||
for (auto op : graph->getOperators()) {
|
||||
switch (op->getOpType()) {
|
||||
case infini::OpType::Transpose:
|
||||
convertTranspose(searchGraph, op);
|
||||
break;
|
||||
case infini::OpType::Relu:
|
||||
convertUnary(searchGraph, op, memb::RELU);
|
||||
break;
|
||||
case infini::OpType::Add:
|
||||
convertBinary(searchGraph, op, memb::ADD);
|
||||
break;
|
||||
case infini::OpType::Sub:
|
||||
convertBinary(searchGraph, op, memb::SUB);
|
||||
break;
|
||||
case infini::OpType::Gather:
|
||||
convertGather(searchGraph, op);
|
||||
break;
|
||||
case infini::OpType::ReduceMean:
|
||||
convertReduce(searchGraph, op, memb::REDUCEMEAN);
|
||||
break;
|
||||
case infini::OpType::Extend:
|
||||
convertBroadcast(searchGraph, op);
|
||||
break;
|
||||
default:
|
||||
std::cout << int(op->getOpType()) << std::endl;
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
IT_ASSERT(opMap.find(op->getGuid()) == opMap.end());
|
||||
opMap[op->getGuid()] = id;
|
||||
id++;
|
||||
}
|
||||
for (auto op : graph->getOperators()) {
|
||||
for (auto nextOp : op->getSuccessors()) {
|
||||
assert(opMap.find(op->getGuid()) != opMap.end());
|
||||
assert(opMap.find(nextOp->getGuid()) != opMap.end());
|
||||
searchGraph->addEdge(opMap[op->getGuid()],
|
||||
opMap[nextOp->getGuid()]);
|
||||
}
|
||||
}
|
||||
return searchGraph;
|
||||
}
|
||||
|
||||
std::string infini::MemoryCodegen::generateGraph(Graph graph) {
|
||||
auto searchGraph = instantiateGraph(graph);
|
||||
auto metaGraph = searchGraph->exportFirstMetaGraph();
|
||||
std::string code = "";
|
||||
std::cout << "[INFO] before opt." << std::endl;
|
||||
metaGraph->print();
|
||||
metaGraph->optimize();
|
||||
std::cout << "[INFO] after opt." << std::endl;
|
||||
metaGraph->print();
|
||||
code += metaGraph->genHeader();
|
||||
code += metaGraph->genKernelFuncs();
|
||||
code += metaGraph->genInvokeFuncs();
|
||||
return code;
|
||||
}
|
||||
|
||||
std::string
|
||||
infini::MemoryCodegen::generateBias(const std::vector<size_t> &shape) {
|
||||
auto metaGraph = std::make_shared<memb::MetaGraph>();
|
||||
metaGraph->addOp(memb::MetaOp::buildBiasOp(shape));
|
||||
metaGraph->print();
|
||||
std::string code = "";
|
||||
code += metaGraph->genHeader();
|
||||
code += metaGraph->genKernelFuncs();
|
||||
code += metaGraph->genInvokeFuncs();
|
||||
return code;
|
||||
}
|
||||
|
||||
std::string
|
||||
infini::MemoryCodegen::generateTranspose(const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm) {
|
||||
auto metaGraph = std::make_shared<memb::MetaGraph>();
|
||||
metaGraph->addOp(memb::MetaOp::buildTransposeOp(shape, perm));
|
||||
metaGraph->print();
|
||||
std::string code = "";
|
||||
code += metaGraph->genHeader();
|
||||
code += metaGraph->genKernelFuncs();
|
||||
code += metaGraph->genInvokeFuncs();
|
||||
return code;
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
#include "pfusion/meta_graph.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
void MetaGraph::print() {
|
||||
for (auto op : metaOps) {
|
||||
op->print();
|
||||
}
|
||||
}
|
||||
|
||||
void MetaGraph::optimize() {
|
||||
std::vector<std::shared_ptr<MetaOp>> ops;
|
||||
int numOp = metaOps.size();
|
||||
int cur = 0;
|
||||
for (int i = 1; i < numOp; i++) {
|
||||
auto next = MetaOp::merge(metaOps[cur], metaOps[i]);
|
||||
if (next == nullptr) {
|
||||
ops.emplace_back(metaOps[cur]);
|
||||
cur = i;
|
||||
} else {
|
||||
cur = metaOps.size();
|
||||
metaOps.emplace_back(next);
|
||||
}
|
||||
}
|
||||
ops.emplace_back(metaOps[cur]);
|
||||
metaOps.clear();
|
||||
for (auto op : ops) {
|
||||
op->optimize();
|
||||
metaOps.emplace_back(op);
|
||||
}
|
||||
}
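The loop above is a greedy chain merge: keep folding the next op into the running candidate and flush the candidate the first time MetaOp::merge refuses. A standalone generic sketch of the same control flow follows; the template and names are illustrative, and merge is any pairwise combiner that returns a null-like value (such as an empty shared_ptr) on failure.

#include <vector>

// Greedy chain merge mirroring MetaGraph::optimize / MetaOp::optimize above.
template <typename T, typename Merge>
std::vector<T> greedyMerge(std::vector<T> items, Merge merge) {
    std::vector<T> out;
    if (items.empty())
        return out;
    const size_t n = items.size(); // only the original ops are visited
    size_t cur = 0;
    for (size_t i = 1; i < n; ++i) {
        if (auto fused = merge(items[cur], items[i])) {
            items.emplace_back(fused);    // the fused op becomes the new candidate
            cur = items.size() - 1;
        } else {
            out.emplace_back(items[cur]); // candidate cannot grow any further
            cur = i;
        }
    }
    out.emplace_back(items[cur]);
    return out;
}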
|
||||
|
||||
std::string MetaGraph::genHeader() {
|
||||
std::string code = "#include \"cuda_utils.h\"\n";
|
||||
return code;
|
||||
}
|
||||
|
||||
std::string MetaGraph::genKernelFuncs() {
|
||||
std::string code = "";
|
||||
for (auto metaOp : metaOps) {
|
||||
code += metaOp->genKernelFunc();
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
std::string MetaGraph::genInvokeFuncs() {
|
||||
std::string code = "";
|
||||
for (auto metaOp : metaOps) {
|
||||
code += metaOp->genInvokeFunc();
|
||||
}
|
||||
return code;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,303 @@
|
|||
#include "pfusion/meta_op.h"
|
||||
#include "pfusion/micro_kernel/binary.h"
|
||||
#include "pfusion/micro_kernel/memory.h"
|
||||
|
||||
namespace memb {
|
||||
|
||||
std::string TensorMapping::genOffset() {
|
||||
std::string code = "int " + offset() + " = 0;\n";
|
||||
|
||||
std::string bufName = name + "_buf";
|
||||
code += "int " + bufName + " = loop_idx;\n";
|
||||
for (size_t i = 0; i < map.size(); i++) {
|
||||
code += offset() + " += " + bufName + " % " +
std::to_string(shape[map[i]]) + " * " +
std::to_string(stride[map[i]]) + ";\n";
|
||||
code += bufName + " /= " + std::to_string(shape[map[i]]) + ";\n";
|
||||
}
|
||||
|
||||
return code;
|
||||
}
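For the mapping built in instantiateUnary, that is name "src", shape {256, S} and map {1}, and assuming buildWithMap derives contiguous strides {1, 256} from the shape and offset() expands to "offset_src", the emitted snippet is essentially:

// Illustrative genOffset() output; decodes loop_idx digit by digit into an offset.
int offset_src = 0;
int src_buf = loop_idx;
offset_src += src_buf % S * 256; // extent S, stride 256 of the mapped dimension
src_buf /= S;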
|
||||
|
||||
void MetaOp::optimize() {
|
||||
if (microOps.size() == 0)
|
||||
return;
|
||||
std::vector<std::shared_ptr<MicroOp>> ops;
|
||||
int numOp = microOps.size();
|
||||
int cur = 0;
|
||||
for (int i = 1; i < numOp; i++) {
|
||||
auto next = MicroOp::merge(microOps[cur], microOps[i]);
|
||||
if (next == nullptr) {
|
||||
ops.emplace_back(microOps[cur]);
|
||||
cur = i;
|
||||
} else {
|
||||
cur = microOps.size();
|
||||
microOps.emplace_back(next);
|
||||
}
|
||||
}
|
||||
ops.emplace_back(microOps[cur]);
|
||||
microOps.clear();
|
||||
std::unordered_set<std::string> ptrSet;
|
||||
for (auto op : ops) {
|
||||
for (auto ptr : op->getPtrs()) {
|
||||
ptrSet.emplace(ptr->getName());
|
||||
}
|
||||
if (op->getType() != EMPTY) {
|
||||
microOps.emplace_back(op);
|
||||
}
|
||||
}
|
||||
std::vector<std::shared_ptr<Pointer>> newPtrs;
|
||||
for (auto ptr : ptrs) {
|
||||
if (ptrSet.find(ptr->getName()) != ptrSet.end()) {
|
||||
newPtrs.emplace_back(ptr);
|
||||
}
|
||||
}
|
||||
ptrs.clear();
|
||||
for (auto ptr : newPtrs) {
|
||||
ptrs.emplace_back(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
std::string MetaOp::genKernelFunc() {
|
||||
std::string code = "";
|
||||
code += "// Kernel\n";
|
||||
code += "__global__ void kernel_func_" + std::to_string(id) + "(";
|
||||
IT_ASSERT(ptrs.size() > 0);
|
||||
code += "float *" + ptrs[0]->getName();
|
||||
for (size_t i = 1; i < ptrs.size(); i++) {
|
||||
code += ", float *" + ptrs[i]->getName();
|
||||
}
|
||||
code += ") {\n";
|
||||
code += "int lane_id = threadIdx.x % " + std::to_string(numLanes) + ";\n";
|
||||
code += "int warp_id = threadIdx.x / " + std::to_string(numLanes) + ";\n";
|
||||
code += "int parallel_idx = blockIdx.x * " + std::to_string(numGroups) +
|
||||
" + warp_id;\n";
|
||||
if (numReg != 0) {
|
||||
code += "float buf[" + std::to_string(numReg) + "];\n";
|
||||
}
|
||||
if (numSmem != 0) {
|
||||
code += "__shared__ float smem[" + std::to_string(numSmem * numGroups) +
|
||||
"];\n";
|
||||
}
|
||||
|
||||
code += "for (int loop_idx = parallel_idx; loop_idx < " +
|
||||
std::to_string(main_loop_ed) +
|
||||
"; loop_idx += " + std::to_string(numBlocks * numGroups) + ") {\n";
|
||||
|
||||
// gen offset_src
|
||||
for (auto mapping : mappings) {
|
||||
code += mapping->genOffset();
|
||||
}
|
||||
|
||||
for (auto microOp : microOps) {
|
||||
code += microOp->generate();
|
||||
}
|
||||
code += "}\n}\n";
|
||||
return code;
|
||||
}
|
||||
|
||||
std::string MetaOp::genInvokeFunc() {
|
||||
std::string code = "";
|
||||
code += "void invoke_func_" + std::to_string(id) + "(";
|
||||
IT_ASSERT(ptrs.size() > 0);
|
||||
code += "float *" + ptrs[0]->getName();
|
||||
for (size_t i = 1; i < ptrs.size(); i++) {
|
||||
code += ", float *" + ptrs[i]->getName();
|
||||
}
|
||||
code += ") {\n";
|
||||
int numThreads = numGroups * numLanes;
|
||||
code += "dim3 gridDim(" + std::to_string(numBlocks) + ", 1);";
|
||||
code += "dim3 blockDim(" + std::to_string(numThreads) + ", 1);";
|
||||
code += "kernel_func_" + std::to_string(id) + "<<<gridDim, blockDim>>>(";
|
||||
IT_ASSERT(ptrs.size() > 0);
|
||||
code += ptrs[0]->getName();
|
||||
for (size_t i = 1; i < ptrs.size(); i++) {
|
||||
code += ", " + ptrs[i]->getName();
|
||||
}
|
||||
code += ");\n";
|
||||
code += "cudaCheckError();\n";
|
||||
code += "}\n";
|
||||
return code;
|
||||
}
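Taken together, a meta-op with id 0 and two pointers emits a kernel/launcher pair of roughly this shape; the listing is illustrative, with the loop bound, register count and per-iteration body substituted from the meta-op's fields and micro-ops.

// Illustrative shape of the generated pair, not verbatim output.
__global__ void kernel_func_0(float *src, float *dst) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int parallel_idx = blockIdx.x * /*numGroups*/ 8 + warp_id;
    float buf[/*numReg*/ 8];
    for (int loop_idx = parallel_idx; loop_idx < /*main_loop_ed*/ N;
         loop_idx += /*numBlocks * numGroups*/ 108 * 8) {
        // TensorMapping offsets, then the micro-op bodies
    }
}

void invoke_func_0(float *src, float *dst) {
    dim3 gridDim(/*numBlocks*/ 108, 1);
    dim3 blockDim(/*numGroups * numLanes*/ 256, 1);
    kernel_func_0<<<gridDim, blockDim>>>(src, dst);
    cudaCheckError();
}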
|
||||
|
||||
std::shared_ptr<MetaOp> MetaOp::merge(std::shared_ptr<MetaOp> metaOp0,
|
||||
std::shared_ptr<MetaOp> metaOp1) {
|
||||
IT_ASSERT(metaOp0->checkValid());
|
||||
IT_ASSERT(metaOp1->checkValid());
|
||||
// Check unmergeable
|
||||
if (metaOp0->main_loop_st != metaOp1->main_loop_st ||
|
||||
metaOp0->main_loop_ed != metaOp1->main_loop_ed ||
|
||||
metaOp0->numBlocks != metaOp1->numBlocks ||
|
||||
metaOp0->numGroups != metaOp1->numGroups ||
|
||||
metaOp0->numLanes != metaOp1->numLanes) {
|
||||
return nullptr;
|
||||
}
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = metaOp0->main_loop_st;
|
||||
metaOp->main_loop_ed = metaOp0->main_loop_ed;
|
||||
metaOp->numBlocks = metaOp0->numBlocks;
|
||||
metaOp->numGroups = metaOp0->numGroups;
|
||||
metaOp->numLanes = metaOp0->numLanes;
|
||||
metaOp->numReg = metaOp0->numReg + metaOp1->numReg;
|
||||
metaOp->numSmem = metaOp0->numSmem + metaOp1->numSmem;
|
||||
|
||||
// Merge ptr
|
||||
std::unordered_set<size_t> ptrSet;
|
||||
for (auto ptr : metaOp0->ptrs) {
|
||||
IT_ASSERT(ptrSet.find(ptr->getHash()) == ptrSet.end());
|
||||
metaOp->ptrs.emplace_back(ptr);
|
||||
ptrSet.emplace(ptr->getHash());
|
||||
}
|
||||
for (auto ptr : metaOp1->ptrs) {
|
||||
if (ptrSet.find(ptr->getHash()) == ptrSet.end()) {
|
||||
metaOp->ptrs.emplace_back(ptr);
|
||||
ptrSet.emplace(ptr->getHash());
|
||||
}
|
||||
}
|
||||
|
||||
// Merge mapping
|
||||
std::unordered_set<size_t> mappingSet;
|
||||
for (auto mapping : metaOp0->mappings) {
|
||||
IT_ASSERT(mappingSet.find(mapping->getHash()) == mappingSet.end());
|
||||
metaOp->mappings.emplace_back(mapping);
|
||||
mappingSet.emplace(mapping->getHash());
|
||||
}
|
||||
for (auto mapping : metaOp1->mappings) {
|
||||
if (mappingSet.find(mapping->getHash()) == mappingSet.end()) {
|
||||
metaOp->mappings.emplace_back(mapping);
|
||||
mappingSet.emplace(mapping->getHash());
|
||||
}
|
||||
}
|
||||
|
||||
// Merge microOps.
|
||||
// TODO: make it a graph.
|
||||
for (auto microOp : metaOp0->microOps) {
|
||||
metaOp->microOps.emplace_back(microOp);
|
||||
}
|
||||
for (auto microOp : metaOp1->microOps) {
|
||||
metaOp->microOps.emplace_back(microOp);
|
||||
}
|
||||
for (auto microOp : metaOp->microOps) {
|
||||
microOp->print();
|
||||
}
|
||||
// TODO: eliminate microOps.
|
||||
|
||||
return metaOp;
|
||||
}
|
||||
|
||||
std::shared_ptr<MetaOp> MetaOp::buildBiasOp(const std::vector<size_t> &shape) {
|
||||
IT_ASSERT(shape.size() == 2);
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = shape[1] * (shape[0] / 32 / 4);
|
||||
metaOp->numBlocks = 80;
|
||||
metaOp->numGroups = 4;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 4;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->mappings.emplace_back(TensorMapping::build(
|
||||
std::string("input"),
|
||||
std::vector<size_t>({32 * 4, (shape[0] - 1) / (32 * 4) + 1, shape[1]}),
|
||||
std::vector<size_t>({1, 32 * 4, shape[0]}),
|
||||
std::vector<size_t>({1, 2})));
|
||||
metaOp->mappings.emplace_back(TensorMapping::buildWithMap(
|
||||
std::string("bias"), std::vector<size_t>({shape[0], shape[1]}),
|
||||
std::vector<size_t>({1})));
|
||||
|
||||
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
|
||||
auto &ptrs = metaOp->ptrs;
|
||||
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
|
||||
ptrs.emplace_back(Pointer::buildPtr(DRAM, "bias"));
|
||||
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
|
||||
|
||||
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf_bias = Pointer::buildPtr(REG, "buf", "4");
|
||||
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
|
||||
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ,
|
||||
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id"),
|
||||
buf_input, 4, 32));
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
|
||||
READ, Pointer::buildPtr(ptrs[1], "offset_bias"), buf_bias, 1, 32));
|
||||
metaOp->microOps.emplace_back(std::make_shared<BinaryOp>(
|
||||
ADD, buf_input, buf_bias, buf_output, 4, 32));
|
||||
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
|
||||
metaOp->microOps.emplace_back(MemoryOp::build(
WRITE, buf_output,
Pointer::buildPtr(ptrs[2], "offset_input + inst_idx * 32 + lane_id"), 4,
32));
|
||||
return metaOp;
|
||||
}
|
||||
|
||||
std::shared_ptr<MetaOp>
|
||||
MetaOp::buildTransposeOp(const std::vector<size_t> &shape,
|
||||
const std::vector<size_t> &perm) {
|
||||
IT_ASSERT(perm[0] == 0 && shape[0] >= 32);
|
||||
IT_ASSERT(shape.size() == 3);
|
||||
auto metaOp = std::make_shared<MetaOp>();
|
||||
size_t numInst, extraDim;
|
||||
std::vector<size_t> map_shape, map_stride;
|
||||
if (shape[0] <= 4 * 32) {
|
||||
numInst = (shape[0] - 1) / 32 + 1;
|
||||
extraDim = 1;
|
||||
metaOp->mappings.emplace_back(TensorMapping::build(
|
||||
std::string("input"),
|
||||
std::vector<size_t>({shape[0], shape[perm[1]], shape[perm[2]]}),
|
||||
std::vector<size_t>({1, shape[0], shape[0] * shape[perm[1]]}),
|
||||
std::vector<size_t>({perm[1], perm[2]})));
|
||||
metaOp->mappings.emplace_back(TensorMapping::build(
|
||||
std::string("output"),
|
||||
std::vector<size_t>({shape[0], shape[1], shape[2]}),
|
||||
std::vector<size_t>({1, shape[0], shape[0] * shape[1]}),
|
||||
std::vector<size_t>({1, 2})));
|
||||
// cond: local_id < shape[0];
|
||||
} else {
|
||||
numInst = 4;
|
||||
extraDim = (shape[0] - 1) / 128 + 1;
|
||||
metaOp->mappings.emplace_back(TensorMapping::build(
|
||||
std::string("input"),
|
||||
std::vector<size_t>(
|
||||
{128, extraDim, shape[perm[1]], shape[perm[2]]}),
|
||||
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[perm[1]]}),
|
||||
std::vector<size_t>({1, perm[1] + 1, perm[2] + 1})));
|
||||
metaOp->mappings.emplace_back(TensorMapping::build(
|
||||
std::string("output"),
|
||||
std::vector<size_t>({128, extraDim, shape[1], shape[2]}),
|
||||
std::vector<size_t>({1, 128, shape[0], shape[0] * shape[1]}),
|
||||
std::vector<size_t>({1, 2, 3})));
|
||||
// cond loop_idx % extraDim * 128 + local_id < shape[0];
|
||||
}
|
||||
metaOp->main_loop_st = 0;
|
||||
metaOp->main_loop_ed = shape[1] * shape[2] * extraDim;
|
||||
metaOp->numBlocks = 80;
|
||||
metaOp->numGroups = 4;
|
||||
metaOp->numLanes = 32;
|
||||
metaOp->numReg = 4;
|
||||
metaOp->numSmem = 0;
|
||||
|
||||
metaOp->ptrs = std::vector<std::shared_ptr<Pointer>>();
|
||||
auto &ptrs = metaOp->ptrs;
|
||||
ptrs.emplace_back(Pointer::buildPtr(DRAM, "input"));
|
||||
ptrs.emplace_back(Pointer::buildPtr(DRAM, "output"));
|
||||
|
||||
auto buf_input = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
auto buf_output = Pointer::buildPtr(REG, "buf", "inst_idx");
|
||||
|
||||
// @cond group_id * 4 * 32 + inst_idx * 32 + lane_id < shape[0]
|
||||
std::vector<size_t> cond = {shape[0], extraDim, 128};
|
||||
auto inPtr =
|
||||
Pointer::buildPtr(ptrs[0], "offset_input + inst_idx * 32 + lane_id");
|
||||
auto opRead = MemoryOp::build(READ, inPtr, buf_input, numInst, 32, cond);
|
||||
auto outPtr =
|
||||
Pointer::buildPtr(ptrs[1], "offset_output + inst_idx * 32 + lane_id");
|
||||
auto opWrite =
|
||||
MemoryOp::build(WRITE, buf_output, outPtr, numInst, 32, cond);
|
||||
metaOp->microOps = std::vector<std::shared_ptr<MicroOp>>({opRead, opWrite});
|
||||
|
||||
return metaOp;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,33 @@
|
|||
#include "pfusion/micro_kernel/binary.h"
|
||||
|
||||
namespace memb {
|
||||
std::string BinaryOp::generate() {
|
||||
std::string code;
|
||||
|
||||
if (width < 32) {
|
||||
code += "if (lane_id < " + std::to_string(width) + ") {\n";
|
||||
}
|
||||
|
||||
code += "#pragma unroll\n";
|
||||
code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
|
||||
"; inst_idx++) {\n";
|
||||
std::string opFunc = getDst()->generate() + " = " + getSrc0()->generate();
|
||||
if (opType == ADD) {
|
||||
opFunc += " + ";
|
||||
} else if (opType == SUB) {
|
||||
opFunc += " - ";
|
||||
} else {
|
||||
IT_ASSERT(false);
|
||||
}
|
||||
opFunc += getSrc1()->generate() + ";\n";
|
||||
code += opFunc;
|
||||
code += "}\n";
|
||||
|
||||
if (width < 32) {
|
||||
code += "}\n";
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
} // namespace memb
|
|
@ -0,0 +1,99 @@
#include "pfusion/micro_kernel/memory.h"

namespace memb {

// Emits the copy with an edge guard: iterations where
// loop_idx % cond[1] == cond[1] - 1 take the tail path sized by
// cond[0] % cond[2]; all other iterations take the full-width path below.
std::string MemoryOp::generateWithCond() {
    IT_ASSERT(cond.size() == 3);
    std::string code = "";
    int edge_length = cond[0] % cond[2];
    int edge_num = edge_length / width;
    int edge_width = edge_length % width;

    if (edge_num > 0 || edge_width > 0) {
        code += "if (loop_idx % " + std::to_string(cond[1]) +
                " == " + std::to_string(cond[1] - 1) + ") {\n";
    }

    if (edge_num > 0) {
        code += "#pragma unroll\n";
        code += "for (int inst_idx = 0; inst_idx < " +
                std::to_string(edge_num) + "; inst_idx++) {\n";
        if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
             getDst()->getType() == MemType::REG) ||
            (opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
             getDst()->getType() != MemType::REG)) {
            code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
        } else {
            IT_ASSERT(false);
        }
        code += "}\n";
    }

    if (edge_width > 0) {
        code += "if (lane_id < " + std::to_string(edge_width) + ") {\n";
        if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
             getDst()->getType() == MemType::REG) ||
            (opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
             getDst()->getType() != MemType::REG)) {
            code += getDst()->generateWithInstIdx(std::to_string(edge_num)) +
                    " = " +
                    getSrc()->generateWithInstIdx(std::to_string(edge_num)) +
                    ";\n";
        } else {
            IT_ASSERT(false);
        }
        code += "}\n";
    }

    if (edge_num > 0 || edge_width > 0) {
        code += "} else {\n";
    }

    code += "#pragma unroll\n";
    code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
            "; inst_idx++) {\n";
    if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
         getDst()->getType() == MemType::REG) ||
        (opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
         getDst()->getType() != MemType::REG)) {
        code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
    } else {
        IT_ASSERT(false);
    }

    code += "}\n";
    if (edge_num > 0 || edge_width > 0) {
        code += "}\n";
    }
    return code;
}

std::string MemoryOp::generate() {
    if (cond.size() != 0) {
        return generateWithCond();
    }
    std::string code;
    if (width < 32) {
        code += "if (lane_id < " + std::to_string(width) + ") {\n";
    }

    code += "#pragma unroll\n";
    code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
            "; inst_idx++) {\n";
    if ((opType == OpType::READ && getSrc()->getType() != MemType::REG &&
         getDst()->getType() == MemType::REG) ||
        (opType == OpType::WRITE && getSrc()->getType() == MemType::REG &&
         getDst()->getType() != MemType::REG)) {
        code += getDst()->generate() + " = " + getSrc()->generate() + ";\n";
    } else {
        IT_ASSERT(false);
    }

    code += "}\n";
    if (width < 32) {
        code += "}\n";
    }
    return code;
}
} // namespace memb
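As a reading aid, a hedged sketch (not verbatim codegen output) of the kind of fast-path fragment MemoryOp::generate() produces for a READ with num = 2 and width = 32; the src/dst expressions normally come from the pointer generate() calls, so the names below are assumptions:

// Illustrative stand-in for the emitted fast-path copy (READ, num = 2,
// width = 32). The real codegen splices its own pointer expressions here.
__global__ void memory_read_sketch(const float *__restrict__ src,
                                   float *__restrict__ dst) {
    const int lane_id = threadIdx.x % 32;
    float reg[2];
#pragma unroll
    for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
        reg[inst_idx] = src[inst_idx * 32 + lane_id];
    }
#pragma unroll
    for (int inst_idx = 0; inst_idx < 2; inst_idx++) {
        dst[inst_idx * 32 + lane_id] = reg[inst_idx];
    }
}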
@ -0,0 +1,30 @@
#include "pfusion/micro_kernel/reduce.h"

namespace memb {
std::string ReduceOp::generate() {
    std::string code;
    IT_ASSERT(width == 32);

    // Per-lane accumulation of `num` elements; REDUCEMEAN divides by
    // num * width at the end.
    code += getBuf()->generate() + " = 0;\n";
    code += "#pragma unroll\n";
    code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
            "; inst_idx++) {\n";
    std::string opFunc = getBuf()->generate() + " = " + getBuf()->generate();
    if (opType == REDUCEMEAN) {
        opFunc += " + ";
    } else {
        IT_ASSERT(false);
    }
    opFunc += getSrc()->generate() + ";\n";
    code += opFunc;
    code += "}\n";

    if (opType == REDUCEMEAN) {
        code += getBuf()->generate() + " = " + getBuf()->generate() +
                " / float(" + std::to_string(num * width) + ");\n";
    }

    return code;
}

} // namespace memb
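A similar hedged sketch of the REDUCEMEAN pattern for num = 4 and width = 32. Only the per-lane partial sum and the final division are emitted by this micro-kernel, so any cross-lane combination is assumed to come from the buffer expression or a later micro-op; the names below are illustrative:

// Illustrative per-lane REDUCEMEAN partial (num = 4, width = 32); the
// cross-lane sum is assumed to happen elsewhere (e.g. a shuffle reduction).
__device__ float reduce_mean_sketch(const float *__restrict__ src,
                                    int lane_id) {
    float buf = 0;
#pragma unroll
    for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
        buf = buf + src[inst_idx * 32 + lane_id];
    }
    buf = buf / float(4 * 32);
    return buf;
}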
@ -0,0 +1,29 @@
#include "pfusion/micro_kernel/unary.h"

namespace memb {
std::string UnaryOp::generate() {
    std::string code;

    if (width < 32) {
        code += "if (lane_id < " + std::to_string(width) + ") {\n";
    }

    code += "#pragma unroll\n";
    code += "for (int inst_idx = 0; inst_idx < " + std::to_string(num) +
            "; inst_idx++) {\n";
    if (opType == RELU) {
        code += getDst()->generate() + " = (" + getSrc()->generate() +
                " > 0) ? " + getSrc()->generate() + " : 0;\n";
    } else {
        IT_ASSERT(false);
    }
    code += "}\n";

    if (width < 32) {
        code += "}\n";
    }

    return code;
}

} // namespace memb
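And the RELU case, again with assumed register names (num = 4); a sketch of the emitted loop, not the verbatim codegen output:

// Illustrative emitted RELU loop (num = 4); reg_src/reg_dst stand in for the
// generated pointer expressions.
__device__ void relu_sketch(const float *reg_src, float *reg_dst) {
#pragma unroll
    for (int inst_idx = 0; inst_idx < 4; inst_idx++) {
        reg_dst[inst_idx] = (reg_src[inst_idx] > 0) ? reg_src[inst_idx] : 0;
    }
}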
@ -0,0 +1,22 @@
#include "pfusion/micro_op.h"
#include "pfusion/micro_kernel/empty.h"
#include "pfusion/micro_kernel/memory.h"

namespace memb {
std::shared_ptr<MicroOp> MicroOp::merge(std::shared_ptr<MicroOp> op0,
                                        std::shared_ptr<MicroOp> op1) {
    if (op0->getType() == WRITE && op1->getType() == READ) {
        auto memOp0 = std::dynamic_pointer_cast<MemoryOp>(op0);
        auto memOp1 = std::dynamic_pointer_cast<MemoryOp>(op1);
        if (memOp0->getDst()->getHash() == memOp1->getSrc()->getHash()) {
            if (memOp0->getSrc()->getHash() == memOp1->getDst()->getHash()) {
                return std::make_shared<EmptyOp>();
            } else {
                // TODO: gen reg to reg.
                // IT_ASSERT(false);
            }
        }
    }
    return nullptr;
}
} // namespace memb
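In words: a WRITE immediately followed by a READ of the same buffer is collapsed; when the READ would just restore the value to the register it came from, the pair becomes an EmptyOp, and the reg-to-reg case is still a TODO. A hedged caller-side sketch of how merge() might be used in a peephole pass; it assumes merge is callable as a static-style helper, as the definition suggests:

#include <memory>
#include <vector>
#include "pfusion/micro_op.h"

// Hypothetical peephole step: try to collapse the adjacent pair at position i.
inline void collapse_pair(std::vector<std::shared_ptr<memb::MicroOp>> &ops,
                          size_t i) {
    if (i + 1 < ops.size()) {
        if (auto merged = memb::MicroOp::merge(ops[i], ops[i + 1])) {
            ops[i] = merged; // possibly an EmptyOp
            ops.erase(ops.begin() + i + 1);
        }
    }
}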
@ -0,0 +1,15 @@
#include "pfusion/search_graph.h"

namespace memb {
std::shared_ptr<MetaGraph> SearchGraph::exportFirstMetaGraph() {
    auto metaGraph = std::make_shared<MetaGraph>();
    for (const auto &node : nodes) {
        metaGraph->addOp(node.metaOps[0]);
    }
    for (const auto &edge : edges) {
        metaGraph->addEdge(nodes[edge.first].metaOps[0],
                           nodes[edge.second].metaOps[0]);
    }
    return metaGraph;
}
} // namespace memb
@ -0,0 +1,265 @@
#include "core/graph_builder.h"
#include "test.h"

namespace infini {

TEST(GraphBuilder, ops) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    { // conv without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
        auto weight =
            make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
        auto conv = gf->conv(input, weight, 1, 1);
        EXPECT_EQ(conv->getOutput()->getDims(), (Shape{1, 2, 4, 4}));
    }
    { // conv with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
        auto weight =
            make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::UInt32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{1, 2, 4, 4}, DataType::UInt32, runtime);
        auto conv = gf->conv(input, weight, output, 1, 1);
    }
    { // matmul without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
        auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
        auto matmul = gf->matmul(A, B);
        EXPECT_EQ(matmul->getOutput()->getDims(), (Shape{1, 3, 2}));
    }
    { // matmul with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto A = make_ref<TensorObj>(Shape{1, 3, 5}, DataType::UInt32, runtime);
        auto B = make_ref<TensorObj>(Shape{1, 5, 2}, DataType::UInt32, runtime);
        auto C = make_ref<TensorObj>(Shape{1, 3, 2}, DataType::UInt32, runtime);
        auto matmul = gf->matmul(A, B, C);
    }
    { // convtrans without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
        auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
                                          DataType::UInt32, runtime);
        auto convtrans = gf->convTrans(input, weight, 0, 0);
        EXPECT_EQ(convtrans->getOutput()->getDims(), (Shape{1, 448, 2, 2}));
    }
    { // convtrans with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 228, 1, 1}, DataType::UInt32, runtime);
        auto weight = make_ref<TensorObj>(Shape{228, 448, 2, 2},
                                          DataType::UInt32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{1, 448, 2, 2}, DataType::UInt32, runtime);
        auto convtrans = gf->convTrans(input, weight, output, 0, 0);
    }
    { // pad without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
                                         DataType::UInt32, runtime);
        vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
        auto pad = gf->pad(input, pads, std::nullopt);
        EXPECT_EQ(pad->getOutput()->getDims(), (Shape{3, 84, 164, 172}));
    }
    { // pad with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
                                         DataType::UInt32, runtime);
        auto output = make_ref<TensorObj>(Shape{3, 84, 164, 172},
                                          DataType::UInt32, runtime);
        vector<int> pads = {2, 10, 1, 5, 0, 10, 1, 5};
        auto pad = gf->pad(input, output, pads, std::nullopt);
    }
    { // slice without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
                                         DataType::UInt32, runtime);
        vector<int> starts = {2, 10, 1, 5};
        vector<int> ends = {3, 10, 100, 100};
        auto slice = gf->slice(input, starts, ends, std::nullopt, std::nullopt);
        EXPECT_EQ(slice->getOutput()->getDims(), (Shape{2, 1, 100, 96}));
    }
    { // slice with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{10, 64, 162, 162},
                                         DataType::UInt32, runtime);
        auto output = make_ref<TensorObj>(Shape{2, 1, 100, 96},
                                          DataType::UInt32, runtime);
        vector<int> starts = {2, 10, 1, 5};
        vector<int> ends = {3, 10, 100, 100};
        auto slice =
            gf->slice(input, output, starts, ends, std::nullopt, std::nullopt);
    }
    { // concat without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto t1 =
            make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
        auto t2 =
            make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
        auto concat = gf->concat(TensorVec{t1, t2}, 3);
        EXPECT_EQ(concat->getOutput()->getDims(), (Shape{1, 3, 2, 9}));
    }
    { // concat with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto t1 =
            make_ref<TensorObj>(Shape{1, 3, 2, 4}, DataType::Float32, runtime);
        auto t2 =
            make_ref<TensorObj>(Shape{1, 3, 2, 5}, DataType::Float32, runtime);
        auto o0 =
            make_ref<TensorObj>(Shape{1, 3, 2, 9}, DataType::Float32, runtime);
        auto concat = gf->concat(TensorVec{t1, t2}, o0, 3);
    }
    { // split without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
        auto split = gf->split(input, 3, 4);
        EXPECT_EQ(split->numOutputs(), 4);
        EXPECT_EQ(split->getOutputs().size(), (size_t)4);
        EXPECT_EQ(split->getOutput(0)->getDims(), (Shape{1, 3, 2, 3}));
        EXPECT_EQ(split->getOutput(1)->getDims(), (Shape{1, 3, 2, 3}));
        EXPECT_EQ(split->getOutput(2)->getDims(), (Shape{1, 3, 2, 3}));
        EXPECT_EQ(split->getOutput(3)->getDims(), (Shape{1, 3, 2, 6}));
    }
    { // split with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 2, 15}, DataType::Float32, runtime);
        auto output0 =
            make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
        auto output1 =
            make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
        auto output2 =
            make_ref<TensorObj>(Shape{1, 3, 2, 3}, DataType::Float32, runtime);
        auto output3 =
            make_ref<TensorObj>(Shape{1, 3, 2, 6}, DataType::Float32, runtime);
        auto split = gf->split(
            input, TensorVec{output0, output1, output2, output3}, 3, 4);
    }
    { // extend without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto extend = gf->extend(input, 2, 1);
        EXPECT_EQ(extend->getOutput()->getDims(), (Shape{2, 3, 6, 4}));
    }
    { // extend with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{2, 3, 6, 4}, DataType::UInt32, runtime);
        auto extend = gf->extend(input, output, 2, 1);
    }
    { // maxpool without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
                                         DataType::UInt32, runtime);
        const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
                  sw = 2;
        auto maxpool = gf->maxpool(input, kh, kw, dh, dw, ph, pw, sh, sw);
        EXPECT_EQ(maxpool->getOutput()->getDims(), (Shape{1, 64, 80, 80}));
    }
    { // maxpool with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input = make_ref<TensorObj>(Shape{1, 64, 162, 162},
                                         DataType::UInt32, runtime);
        auto output = make_ref<TensorObj>(Shape{1, 64, 80, 80},
                                          DataType::UInt32, runtime);
        const int kh = 3, kw = 3, dh = 1, dw = 1, ph = 0, pw = 0, sh = 2,
                  sw = 2;
        auto maxpool =
            gf->maxpool(input, output, kh, kw, dh, dw, ph, pw, sh, sw);
    }
    { // add without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input0 =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto input1 =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto add = gf->add(input0, input1);
        EXPECT_EQ(add->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
    }
    { // add with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input0 =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto input1 =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::UInt32, runtime);
        auto add = gf->add(input0, input1, output);
    }
    { // gather without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
        auto index =
            make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
        auto gather = gf->gather(input, index, 1);
        EXPECT_EQ(gather->getOutput()->getDims(), (Shape{1, 2, 1, 2, 4, 4}));
    }
    { // gather with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::UInt32, runtime);
        auto index =
            make_ref<TensorObj>(Shape{2, 1, 2}, DataType::UInt32, runtime);
        auto output = make_ref<TensorObj>(Shape{1, 2, 1, 2, 4, 4},
                                          DataType::UInt32, runtime);
        auto gather = gf->gather(input, index, output, 1);
    }
    { // reshape without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        vector<int> dims = {3, 2, 4, 3};
        auto reshape = gf->reshape(input, dims);
        EXPECT_EQ(reshape->getOutput()->getDims(), (Shape{3, 2, 4, 3}));
    }
    { // reshape with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        vector<int> dims = {3, 2, 4, 3};
        auto output =
            make_ref<TensorObj>(Shape{3, 2, 4, 3}, DataType::Float32, runtime);
        auto reshape = gf->reshape(input, output, dims);
    }
    { // flatten without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        auto flatten = gf->flatten(input);
        EXPECT_EQ(flatten->getOutput()->getDims(), (Shape{72}));
    }
    { // flatten with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{72}, DataType::Float32, runtime);
        auto flatten = gf->flatten(input, output);
    }
    { // identity without output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        auto identity = gf->identity(input);
        EXPECT_EQ(identity->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
    }
    { // identity with output
        GraphBuilder gf = make_ref<GraphBuilderObj>(runtime);
        auto input =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        auto output =
            make_ref<TensorObj>(Shape{2, 3, 3, 4}, DataType::Float32, runtime);
        auto identity = gf->identity(input, output);
    }
}

} // namespace infini
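The expected dims in the checks above follow the usual floor-mode output-size arithmetic; two of them worked out explicitly as a verification aid (not part of the test):

// out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1, integer division
//   conv    : in = 4,   kernel = 3, pad = 1, stride = 1 -> (4 + 2 - 3) / 1 + 1 = 4
//   maxpool : in = 162, kernel = 3, pad = 0, stride = 2 -> (162 - 3) / 2 + 1 = 80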
@ -0,0 +1,32 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"

#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, bert_layernorm) {
    MemoryCodegen codegen;
    codegen.exportBert_LN("bert_layernorm.cu");
}

TEST(Graph, bert_softmax) {
    MemoryCodegen codegen;
    codegen.exportBert_SM("bert_softmax.cu");
}

TEST(Graph, bert_gelu) {
    MemoryCodegen codegen;
    codegen.exportBert_GELU("bert_gelu.cu");
}

} // namespace infini
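The diff does not show the kernel body that exportBert_GELU writes into bert_gelu.cu; for orientation only, the standard tanh-approximation GELU that such a kernel typically evaluates per element looks like this (illustrative, not the verbatim generated code):

// Illustrative per-element GELU (tanh approximation); not the output of
// exportBert_GELU.
__device__ __forceinline__ float gelu_approx(float x) {
    const float c = 0.7978845608f; // sqrt(2 / pi)
    return 0.5f * x * (1.0f + tanhf(c * (x + 0.044715f * x * x * x)));
}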
@ -0,0 +1,26 @@
#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, bias_0) {
    MemoryCodegen codegen;
    codegen.exportBias("bias_0.cu", std::vector<size_t>({28 * 28, 24}));
}

TEST(Graph, bias_1) {
    MemoryCodegen codegen;
    codegen.exportBias("bias_1.cu", std::vector<size_t>({28 * 28, 58}));
}

TEST(Graph, bias_2) {
    MemoryCodegen codegen;
    codegen.exportBias("bias_2.cu", std::vector<size_t>({14 * 14, 116}));
}

TEST(Graph, bias_3) {
    MemoryCodegen codegen;
    codegen.exportBias("bias_3.cu", std::vector<size_t>({7 * 7, 232}));
}

} // namespace infini
@ -0,0 +1,38 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, build_and_run) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1024, 32, 32}, DataType::Float32);
    Tensor t1 = g->addTensor({1024, 32, 32}, DataType::Float32);
    Tensor t2 = g->addTensor({1024, 32, 32}, DataType::Float32);
    g->dataMalloc();
    g->addOpWithOutputs<AbsObj>(t0, t1);
    g->addOpWithOutputs<ReluObj>(t1, t2);
    MemoryCodegen codegen;
    codegen.exportGraph(g, "test.cu");
}

TEST(Graph, transpose) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({32, 31, 33, 32}, DataType::Float32);
    Tensor t1 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
    Tensor t2 = g->addTensor({33, 32, 32, 31}, DataType::Float32);
    g->dataMalloc();
    g->addOpWithOutputs<TransposeObj>(t0, t1, Shape{2, 0, 3, 1});
    g->addOpWithOutputs<ReluObj>(t1, t2);
    MemoryCodegen codegen;
    codegen.exportGraph(g, "transpose.cu");
}

} // namespace infini
@ -0,0 +1,29 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/extend.h"
#include "operators/gather.h"
#include "operators/matmul.h"
#include "operators/reduce_mean.h"
#include "operators/transpose.h"
#include "operators/unary.h"

#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, reduce_mean) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1, 128, 512}, DataType::Float32);
    Tensor t1 = g->addTensor({1, 128, 1}, DataType::Float32);
    g->dataMalloc();
    g->addOpWithOutputs<ReduceMeanObj>(t0, t1, Shape({2}));

    MemoryCodegen codegen;
    codegen.exportGraph(g, "reduce_mean.cu");
}

} // namespace infini
@ -0,0 +1,41 @@
#include "core/blob.h"
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/element_wise.h"
#include "operators/matmul.h"
#include "operators/transpose.h"
#include "operators/unary.h"
#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, SAR_DRN_0) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
    Tensor t1 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
    Tensor t2 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
    Tensor t3 = g->addTensor({1, 64, 512, 512}, DataType::Float32);
    g->dataMalloc();
    g->addOpWithOutputs<ReluObj>(t0, t1);
    g->addOpWithOutputs<AddObj>(t1, t2, t3);
    MemoryCodegen codegen;
    codegen.exportGraph(g, "sar_drn_0.cu");
}

TEST(Graph, SAR_DRN_1) {
    Runtime runtime = CpuRuntimeObj::getInstance();
    Graph g = make_ref<GraphObj>(runtime);
    Tensor t0 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
    Tensor t1 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
    Tensor t2 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
    Tensor t3 = g->addTensor({1, 1, 512, 512}, DataType::Float32);
    g->dataMalloc();
    g->addOpWithOutputs<ReluObj>(t0, t1);
    g->addOpWithOutputs<SubObj>(t1, t2, t3);
    MemoryCodegen codegen;
    codegen.exportGraph(g, "sar_drn_1.cu");
}

} // namespace infini
@ -0,0 +1,21 @@
#include "pfusion/memory_codegen.h"
#include "test.h"

namespace infini {

TEST(Graph, transpose_0) {
    MemoryCodegen codegen;
    codegen.exportTranspose("transpose_0.cu", {28 * 28, 58, 2}, {0, 2, 1});
}

TEST(Graph, transpose_1) {
    MemoryCodegen codegen;
    codegen.exportTranspose("transpose_1.cu", {14 * 14, 116, 2}, {0, 2, 1});
}

TEST(Graph, transpose_2) {
    MemoryCodegen codegen;
    codegen.exportTranspose("transpose_2.cu", {7 * 7, 232, 2}, {0, 2, 1});
}

} // namespace infini
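For reference, the {0, 2, 1} permutation used in all three cases swaps the last two axes. A plain host-side reference loop with the same semantics is sketched below; the generated kernel is assumed to be functionally equivalent, though its actual layout and tiling will differ:

#include <cstddef>
#include <vector>

// Reference semantics for perm {0, 2, 1} on a row-major {d0, d1, d2} tensor:
// out[i0][i2][i1] = in[i0][i1][i2].
inline void transpose_021_ref(const std::vector<float> &in,
                              std::vector<float> &out,
                              size_t d0, size_t d1, size_t d2) {
    for (size_t i0 = 0; i0 < d0; ++i0)
        for (size_t i1 = 0; i1 < d1; ++i1)
            for (size_t i2 = 0; i2 < d2; ++i2)
                out[(i0 * d2 + i2) * d1 + i1] = in[(i0 * d1 + i1) * d2 + i2];
}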
@ -0,0 +1,4 @@
. /home/spack/spack/share/spack/setup-env.sh
spack load /yb2wg5g # cuda@10.2.89
spack load /3bfwma4 # cudnn@7.6.5.32-10.2
export CUDAHOSTCXX=/home/spack/spack/opt/spack/linux-ubuntu22.04-haswell/gcc-7.5.0/gcc-7.5.0-sti65cu3zunc4p4kfylgweim6mqan3mk/bin/gcc