add conv_transpose

OdinaryWord 2024-04-02 16:38:40 +08:00
parent a5ccf06551
commit dddb40cd93
3 changed files with 158 additions and 1 deletion

@@ -447,7 +447,9 @@ void init_graph_builder(py::module &m) {
 #ifdef USE_ASCEND
     py::class_<ASCENDRuntimeObj, std::shared_ptr<ASCENDRuntimeObj>, RuntimeObj>(
-        m, "ASCENDRuntime");
+        m, "ASCENDRuntime")
+        .def(py::init<int>(), py::arg("device") = 0)
+        .def("init_comm", &ASCENDRuntimeObj::initComm);
 #endif
     py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
                                                       py::buffer_protocol())

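Aside: the fix above works because pybind11's class_::def returns a reference to the class_ object itself, so the constructor and method bindings chain onto a single declaration. A minimal self-contained sketch of the same pattern (Dummy is an illustrative stand-in, not part of this commit):

#include <pybind11/pybind11.h>
namespace py = pybind11;

struct Dummy { // illustrative stand-in for ASCENDRuntimeObj
    explicit Dummy(int device) {}
    void initComm() {}
};

PYBIND11_MODULE(example, m) {
    // def returns the class_ object, so bindings chain.
    py::class_<Dummy>(m, "Dummy")
        .def(py::init<int>(), py::arg("device") = 0)
        .def("init_comm", &Dummy::initComm);
}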
@@ -0,0 +1,98 @@
#include "aclnnop/level2/aclnn_convolution.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
#include "operators/conv.h"

#include <cassert>

namespace infini {

class ConvTransAclnn : public ASCENDKernelWithoutConfig {
    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvTransposed2dObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [oph, opw] = op->getOutputPadding();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        std::vector<int64_t> pads = {ph, pw};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> dilation = {dh, dw};
        std::vector<int64_t> outputPadding = {oph, opw};

        aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
        aclIntArray *convstride =
            aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *convdilation =
            aclCreateIntArray(dilation.data(), dilation.size());
        aclIntArray *convOutputpadding =
            aclCreateIntArray(outputPadding.data(), outputPadding.size());

        void *const aData = op->getInputs(0)->getRawDataPtr<void *>();
        void *const bData = op->getInputs(1)->getRawDataPtr<void *>();
        void *const cData = op->getOutput()->getRawDataPtr<void *>();

        auto inputD = op->getInputs(0)->getDims();
        auto inputS = op->getInputs(0)->getStride();
        auto weightD = op->getInputs(1)->getDims();
        auto weightS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // aclnn expects 64-bit shape/stride arrays.
        std::vector<int64_t> inputDim = castTo64(inputD);
        std::vector<int64_t> inputStride = castTo64(inputS);
        std::vector<int64_t> weightDim = castTo64(weightD);
        std::vector<int64_t> weightStride = castTo64(weightS);
        std::vector<int64_t> outputDim = castTo64(outD);
        std::vector<int64_t> outputStride = castTo64(outS);

        auto inputTensor =
            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            inputDim.data(), inputDim.size(), aData);
        auto weightTensor =
            aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
                            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            weightDim.data(), weightDim.size(), bData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            outputDim.data(), outputDim.size(), cData);

        // Phase 1: query the workspace size and build an executor.
        // transposed=true selects the transposed-convolution path of
        // aclnnConvolution.
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;
        auto ret = aclnnConvolutionGetWorkspaceSize(
            inputTensor, weightTensor, nullptr, convstride, convpads,
            convdilation, true, convOutputpadding, int64_t(g), outputTensor,
            int8_t(1), &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);

        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }

        // Phase 2: launch on the runtime's stream, then synchronize before
        // the caller reads the output.
        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
                               context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // Note: the aclTensor/aclIntArray descriptors created above are not
        // destroyed here, so they leak on every invocation.
        // aclDestroyTensor(inputTensor);
        // aclDestroyTensor(weightTensor);
        // aclDestroyTensor(outputTensor);
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::ConvTranspose, ConvTransAclnn,
                "ConvTrans_ASCEND_float");
} // namespace infini
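Because the aclDestroyTensor calls are commented out, the descriptors above are leaked on every kernel invocation. If cleanup is reinstated later, a small RAII guard keeps it exception-safe; this is an illustrative sketch under the usual aclCreateTensor/aclDestroyTensor pairing, not part of the commit:

// Illustrative RAII guard for aclTensor descriptors (not part of this commit).
struct AclTensorGuard {
    aclTensor *t;
    explicit AclTensorGuard(aclTensor *t) : t(t) {}
    ~AclTensorGuard() {
        if (t)
            aclDestroyTensor(t); // released on scope exit
    }
    AclTensorGuard(const AclTensorGuard &) = delete;
    AclTensorGuard &operator=(const AclTensorGuard &) = delete;
};
// Usage: AclTensorGuard inputGuard(inputTensor);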

@@ -0,0 +1,56 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"

#include "test.h"

namespace infini {

void testConvTransposedAclnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    std::vector<float> ansVec) {
    const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
    const int stride = 1, padding = 0, dilation = 1;

    // Construct runtimes and graphs for CPU and NPU
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime npu = make_ref<ASCENDRuntimeObj>();
    Graph gNpu = make_ref<GraphObj>(npu);

    // Set input data on CPU in a CPU graph
    Tensor i0Cpu = gCpu->addTensor({N, F, H, W}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({F, C, R, S}, DataType::Float32);

    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);

    // Copy input tensors from CPU to NPU
    Tensor i0Npu = gNpu->cloneTensor(i0Cpu);
    Tensor w0Npu = gNpu->cloneTensor(w0Cpu);

    // Build NPU graph
    auto conv = gNpu->addOp<ConvTransposed2dObj>(i0Npu, w0Npu, nullptr, padding,
                                                 padding, stride, stride,
                                                 dilation, dilation);
    gNpu->dataMalloc();
    i0Npu->setData(generator);
    w0Npu->setData(generator);

    // Execute on NPU
    npu->run(gNpu);

    // Copy output from NPU back to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());

    // Check results on CPU
    o0Cpu->printData();
    EXPECT_TRUE(o0Cpu->equalData(ansVec));
}

TEST(ascend_ConvTransposed, run) {
    testConvTransposedAclnn(
        IncrementalGenerator(),
        std::vector<float>{0., 0., 1., 2., 3., 0., 6., 12., 18.,
                           16., 8., 30., 36., 42., 32., 16., 54., 60.,
                           66., 48., 24., 62., 67., 72., 45.});
}
} // namespace infini
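For reference, the 25-entry ansVec is a 5x5 output map: with the standard transposed-convolution size formula and the parameters above (H = W = 2, R = S = 4, stride 1, padding 0, dilation 1, no output padding), each spatial dimension comes out to 5. A quick self-contained check:

#include <cassert>

// Output spatial extent of a 2-D transposed convolution (standard formula).
int convTransposeOutDim(int in, int stride, int pad, int dilation, int kernel,
                        int outputPadding) {
    return (in - 1) * stride - 2 * pad + dilation * (kernel - 1) +
           outputPadding + 1;
}

int main() {
    // (2 - 1) * 1 - 0 + 1 * (4 - 1) + 0 + 1 == 5, so the output is 5 x 5 = 25.
    assert(convTransposeOutDim(2, 1, 0, 1, 4, 0) == 5);
    return 0;
}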