forked from jiuyuan/InfiniTensor

Commit dddb40cd93 (parent a5ccf06551): add conv_transpose
@@ -447,7 +447,10 @@ void init_graph_builder(py::module &m) {
 
 #ifdef USE_ASCEND
     py::class_<ASCENDRuntimeObj, std::shared_ptr<ASCENDRuntimeObj>, RuntimeObj>(
-        m, "ASCENDRuntime");
+        m, "ASCENDRuntime")
+        .def(py::init<int>(), py::arg("device") = 0)
+        .def("init_comm", &ASCENDRuntimeObj::initComm);
+    ;
 #endif
     py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor",
                                                       py::buffer_protocol())
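This hunk makes the ASCEND runtime constructible from Python and exposes its communicator setup. A minimal usage sketch, assuming the pybind11 module is built under the name `backend` as in upstream InfiniTensor; the `init_comm` argument list is not shown in this diff:

    import backend  # the pybind11 module that init_graph_builder populates

    # Construct the runtime; `device` defaults to 0 via py::arg("device") = 0.
    runtime = backend.ASCENDRuntime(device=0)
    # runtime.init_comm(...)  # bound from ASCENDRuntimeObj::initComm;
    #                         # its arguments are not visible in this diff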
New file (the aclnn ConvTranspose kernel for ASCEND):

@@ -0,0 +1,98 @@
#include "aclnnop/level2/aclnn_convolution.h"
#include "ascend/ascend_kernel_without_config.h"
#include "ascend/ascend_runtime.h"
#include "operators/conv.h"

namespace infini {

class ConvTransAclnn : public ASCENDKernelWithoutConfig {

    void compute(const Operator &_op,
                 const RuntimeObj *_context) const override {
        auto op = as<ConvTransposed2dObj>(_op);
        auto context = dynamic_cast<const ASCENDRuntimeObj *>(_context);

        // Hyper-parameters of the transposed convolution.
        const auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
        const auto [oph, opw] = op->getOutputPadding();
        const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
        const int cpg = op->getChannelPerGroup();
        const int g = c / cpg;

        std::vector<int64_t> pads = {ph, pw};
        std::vector<int64_t> stride = {sh, sw};
        std::vector<int64_t> dilation = {dh, dw};
        std::vector<int64_t> outputPadding = {oph, opw};

        aclIntArray *convpads = aclCreateIntArray(pads.data(), pads.size());
        aclIntArray *convstride =
            aclCreateIntArray(stride.data(), stride.size());
        aclIntArray *convdilation =
            aclCreateIntArray(dilation.data(), dilation.size());
        aclIntArray *convOutputpadding =
            aclCreateIntArray(outputPadding.data(), outputPadding.size());

        // Raw device pointers of input, weight, and output; bias is not
        // supported yet, so nullptr is passed to aclnn below.
        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const bData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const cData = (op->getOutput()->getRawDataPtr<void *>());

        auto inputD = op->getInputs(0)->getDims();
        auto inputS = op->getInputs(0)->getStride();
        auto weightD = op->getInputs(1)->getDims();
        auto weightS = op->getInputs(1)->getStride();
        auto outD = op->getOutput()->getDims();
        auto outS = op->getOutput()->getStride();

        // aclnn expects 64-bit shape/stride arrays.
        std::vector<int64_t> inputDim = castTo64(inputD);
        std::vector<int64_t> inputStride = castTo64(inputS);
        std::vector<int64_t> weightDim = castTo64(weightD);
        std::vector<int64_t> weightStride = castTo64(weightS);
        std::vector<int64_t> outputDim = castTo64(outD);
        std::vector<int64_t> outputStride = castTo64(outS);

        auto inputTensor =
            aclCreateTensor(inputDim.data(), inputDim.size(), ACL_FLOAT,
                            inputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            inputDim.data(), inputDim.size(), aData);
        auto weightTensor =
            aclCreateTensor(weightDim.data(), weightDim.size(), ACL_FLOAT,
                            weightStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            weightDim.data(), weightDim.size(), bData);
        auto outputTensor =
            aclCreateTensor(outputDim.data(), outputDim.size(), ACL_FLOAT,
                            outputStride.data(), 0, aclFormat::ACL_FORMAT_NCHW,
                            outputDim.data(), outputDim.size(), cData);

        // aclnn two-phase call: query the workspace size first, then launch.
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;

        auto ret = aclnnConvolutionGetWorkspaceSize(
            inputTensor, weightTensor, nullptr, convstride, convpads,
            convdilation, true, convOutputpadding, int64_t(g), outputTensor,
            int8_t(1), &workspaceSize, &executor);
        assert(ret == ACL_SUCCESS);
        void *workspaceAddr = nullptr;
        if (workspaceSize > 0) {
            workspaceAddr = context->getWorkspace(workspaceSize);
        }
        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor,
                               context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        ret = aclrtSynchronizeStream(context->ASCENDHandle());
        assert(ret == ACL_SUCCESS);

        // aclDestroyTensor(inputTensor);
        // aclDestroyTensor(weightTensor);
        // aclDestroyTensor(outputTensor);
    }
};

REGISTER_KERNEL(Device::ASCEND, OpType::ConvTranspose, ConvTransAclnn,
                "ConvTrans_ASCEND_float");
}; // namespace infini
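The kernel passes `transposed = true` together with the output-padding array to `aclnnConvolution`, so the output spatial size follows the standard transposed-convolution formula. A small sketch of that formula (the usual definition, not code from this commit):

    def conv_transpose_out_dim(in_dim, kernel, stride, padding, dilation, output_padding):
        # Standard transposed-convolution output size.
        return (in_dim - 1) * stride - 2 * padding + dilation * (kernel - 1) + output_padding + 1

    # The test below uses H = W = 2, R = S = 4, stride 1, padding 0, dilation 1:
    print(conv_transpose_out_dim(2, 4, 1, 0, 1, 0))  # -> 5, i.e. a 5x5 output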
New file (the ASCEND ConvTranspose test):

@@ -0,0 +1,56 @@
#include "ascend/ascend_runtime.h"
#include "core/graph.h"
#include "core/kernel.h"
#include "core/runtime.h"
#include "operators/conv.h"

#include "test.h"

namespace infini {

void testConvTransposedAclnn(
    const std::function<void(void *, size_t, DataType)> &generator,
    std::vector<float> ansVec) {
    const auto &[N, C, H, W, F, R, S] = tuple{1, 1, 2, 2, 1, 4, 4};
    const int stride = 1, padding = 0, dilation = 1;
    // Construct runtimes and graphs for CPU and NPU
    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPU runtime is a singleton
    Graph gCpu = make_ref<GraphObj>(cpu);
    Runtime npu = make_ref<ASCENDRuntimeObj>();
    Graph gNpu = make_ref<GraphObj>(npu);
    // Set input data on CPU in a CPU graph
    Tensor i0Cpu = gCpu->addTensor({N, F, H, H}, DataType::Float32);
    Tensor w0Cpu = gCpu->addTensor({F, C, R, S}, DataType::Float32);
    // Malloc data for all tensors in a graph. Do we need implicit allocation?
    gCpu->dataMalloc();
    i0Cpu->setData(generator);
    w0Cpu->setData(generator);

    // Copy input tensors from CPU to NPU
    Tensor i0Npu = gNpu->cloneTensor(i0Cpu);
    Tensor w0Npu = gNpu->cloneTensor(w0Cpu);
    // Build the NPU graph
    auto conv = gNpu->addOp<ConvTransposed2dObj>(i0Npu, w0Npu, nullptr, padding,
                                                 padding, stride, stride,
                                                 dilation, dilation);
    gNpu->dataMalloc();
    i0Npu->setData(generator);
    w0Npu->setData(generator);
    // Execute on the NPU
    npu->run(gNpu);
    // Copy the output from NPU to CPU
    auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
    // Check results on CPU
    o0Cpu->printData();
    EXPECT_TRUE(o0Cpu->equalData(ansVec));
}

TEST(ascend_ConvTransposed, run) {
    testConvTransposedAclnn(
        IncrementalGenerator(),
        std::vector<float>{0.,  0.,  1.,  2.,  3.,  0.,  6.,  12., 18.,
                           16., 8.,  30., 36., 42., 32., 16., 54., 60.,
                           66., 48., 24., 62., 67., 72., 45.});
}

} // namespace infini
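The expected vector in the test can be reproduced with a naive reference: each input element scatters a scaled copy of the kernel into the output. A NumPy sketch of that check (not project code):

    import numpy as np

    x = np.arange(4, dtype=np.float32).reshape(2, 2)    # IncrementalGenerator input
    w = np.arange(16, dtype=np.float32).reshape(4, 4)   # IncrementalGenerator weight
    out = np.zeros((5, 5), dtype=np.float32)            # (2 - 1) * 1 + 4 = 5 per side
    for i in range(2):
        for j in range(2):
            # Stride 1, no padding: place x[i, j] * w at offset (i, j).
            out[i:i + 4, j:j + 4] += x[i, j] * w
    print(out.flatten())  # matches ansVec in the test above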