forked from jiuyuan/InfiniTensor
Compare commits
1 Commits
master
...
broadcast_
Author | SHA1 | Date |
---|---|---|
![]() |
0fd8c4fbc5 |
|
@ -3,8 +3,12 @@
|
||||||
#include "operators/element_wise.h"
|
#include "operators/element_wise.h"
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void div_kernel(float *a, float *b, float *c, int num);
|
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||||
void pow_kernel(float *a, float *b, float *c, int num);
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3);
|
||||||
|
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||||
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3);
|
||||||
|
|
||||||
void element_wise_kernel(const Operator &_op) {
|
void element_wise_kernel(const Operator &_op) {
|
||||||
auto op = as<ElementWiseObj>(_op);
|
auto op = as<ElementWiseObj>(_op);
|
||||||
|
@ -12,12 +16,17 @@ void element_wise_kernel(const Operator &_op) {
|
||||||
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
|
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
|
||||||
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
|
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
|
||||||
|
|
||||||
auto dim = op->getInputs(0)->getDims();
|
auto aDim = op->getInputs(0)->getDims();
|
||||||
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
auto bDim = op->getInputs(1)->getDims();
|
||||||
|
auto cDim = op->getOutput()->getDims();
|
||||||
if (op->getOpType() == OpType::Div)
|
if (op->getOpType() == OpType::Div)
|
||||||
div_kernel(aData, bData, cData, n * c * h * w);
|
div_kernel(aData, bData, cData, aDim[0], aDim[1], aDim[2], aDim[3],
|
||||||
|
bDim[0], bDim[1], bDim[2], bDim[3],
|
||||||
|
cDim[0], cDim[1], cDim[2], cDim[3]);
|
||||||
else if (op->getOpType() == OpType::Pow)
|
else if (op->getOpType() == OpType::Pow)
|
||||||
pow_kernel(aData, bData, cData, n * c * h * w);
|
pow_kernel(aData, bData, cData, aDim[0], aDim[1], aDim[2], aDim[3],
|
||||||
|
bDim[0], bDim[1], bDim[2], bDim[3],
|
||||||
|
cDim[0], cDim[1], cDim[2], cDim[3]);
|
||||||
else
|
else
|
||||||
IT_TODO_HALT();
|
IT_TODO_HALT();
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,34 +5,73 @@ constexpr unsigned int num_threads() { return 32 * 4; }
|
||||||
constexpr int thread_work_size() { return 4; }
|
constexpr int thread_work_size() { return 4; }
|
||||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||||
|
|
||||||
__global__ void _div_kernel(float *x, float *y, float *z, int n) {
|
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||||
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3) {
|
||||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
int stride = blockDim.x * gridDim.x;
|
int stride = blockDim.x * gridDim.x;
|
||||||
|
int n = c0 * c1 * c2 * c3;
|
||||||
|
|
||||||
for (int i = index; i < n; i += stride) {
|
for (int i = index; i < n; i += stride) {
|
||||||
z[i] = x[i] / y[i];
|
int c0_index = i/ (c1 * c2 * c3);
|
||||||
|
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||||
|
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||||
|
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||||
|
|
||||||
|
int a0_index = c0_index % a0;
|
||||||
|
int a1_index = c1_index % a1;
|
||||||
|
int a2_index = c2_index % a2;
|
||||||
|
int a3_index = c3_index % a3;
|
||||||
|
|
||||||
|
int b0_index = c0_index % b0;
|
||||||
|
int b1_index = c1_index % b1;
|
||||||
|
int b2_index = c2_index % b2;
|
||||||
|
int b3_index = c3_index % b3;
|
||||||
|
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] + y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void _pow_kernel(float *x, float *y, float *z, int n) {
|
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||||
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3) {
|
||||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
int stride = blockDim.x * gridDim.x;
|
int stride = blockDim.x * gridDim.x;
|
||||||
|
int n = c0 * c1 * c2 * c3;
|
||||||
|
|
||||||
for (int i = index; i < n; i += stride) {
|
for (int i = index; i < n; i += stride) {
|
||||||
z[i] = pow(x[i], y[i]);
|
int c0_index = i/ (c1 * c2 * c3);
|
||||||
|
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||||
|
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||||
|
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||||
|
|
||||||
|
int a0_index = c0_index % a0;
|
||||||
|
int a1_index = c1_index % a1;
|
||||||
|
int a2_index = c2_index % a2;
|
||||||
|
int a3_index = c3_index % a3;
|
||||||
|
|
||||||
|
int b0_index = c0_index % b0;
|
||||||
|
int b1_index = c1_index % b1;
|
||||||
|
int b2_index = c2_index % b2;
|
||||||
|
int b3_index = c3_index % b3;
|
||||||
|
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace infini {
|
namespace infini {
|
||||||
void div_kernel(float *a, float *b, float *c, int num) {
|
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||||
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3) {
|
||||||
|
|
||||||
int blocksize = block_work_size();
|
int blocksize = block_work_size();
|
||||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
int gridsize = (c0*c1*c2*c3 + block_work_size() - 1) / block_work_size();
|
||||||
_div_kernel<<<blocksize, gridsize>>>(a, b, c, num);
|
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||||
}
|
}
|
||||||
void pow_kernel(float *a, float *b, float *c, int num) {
|
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||||
|
int b0, int b1, int b2, int b3,
|
||||||
|
int c0, int c1, int c2, int c3) {
|
||||||
int blocksize = block_work_size();
|
int blocksize = block_work_size();
|
||||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
int gridsize = (c0*c1*c2*c3 + block_work_size() - 1) / block_work_size();
|
||||||
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, num);
|
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||||
}
|
}
|
||||||
|
|
||||||
}; // namespace infini
|
}; // namespace infini
|
|
@ -12,12 +12,10 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const {
|
||||||
// For now,we only process the same dims here, broardcast will be considered
|
// For now,we only process the same dims here, broardcast will be considered
|
||||||
// in the opt layer.
|
// in the opt layer.
|
||||||
const auto A = inputs[0], B = inputs[1];
|
const auto A = inputs[0], B = inputs[1];
|
||||||
if (A->getDims().size() != B->getDims().size() ||
|
if (A->getDims().size() != B->getDims().size())
|
||||||
A->getDims() != B->getDims())
|
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
return {{A->getDims()}};
|
//return {{A->getDims()}};
|
||||||
/*
|
|
||||||
int n = A->getDims().size();
|
int n = A->getDims().size();
|
||||||
Shape shape;
|
Shape shape;
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
|
@ -28,7 +26,7 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const {
|
||||||
auto dimI = dimA > dimB ? dimA : dimB;
|
auto dimI = dimA > dimB ? dimA : dimB;
|
||||||
shape.emplace_back(dimI);
|
shape.emplace_back(dimI);
|
||||||
}
|
}
|
||||||
return {{shape}};*/
|
return {{shape}};
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ElementWiseObj::toString() const {
|
std::string ElementWiseObj::toString() const {
|
||||||
|
|
|
@ -14,69 +14,25 @@ TEST(ElementWise, ShapeInference) {
|
||||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||||
{
|
{
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
Graph g = make_ref<GraphObj>(runtime);
|
||||||
Tensor i0 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
|
Tensor i0 = g->addTensor({1, 4, 1, 6}, DataType::UInt32);
|
||||||
Tensor i1 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
|
Tensor i1 = g->addTensor({3, 1, 5, 1}, DataType::UInt32);
|
||||||
auto op = g->addOp<AddObj>(i0, i1, nullptr);
|
auto op = g->addOp<AddObj>(i0, i1, nullptr);
|
||||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
|
EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 4, 5, 6}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
template <typename T>
|
|
||||||
void test_element_wise(
|
|
||||||
const std::function<void(void *, size_t, DataType)> &generator,
|
|
||||||
const vector<uint32_t> &ans) {
|
|
||||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
|
||||||
Graph g = make_ref<GraphObj>(runtime);
|
|
||||||
Tensor i0 = g->addTensor({1, 3, 2, 2}, DataType::UInt32);
|
|
||||||
Tensor i1 = g->addTensor({2, 3, 1, 2}, DataType::UInt32);
|
|
||||||
auto op = g->addOp<T>(i0, i1, nullptr);
|
|
||||||
|
|
||||||
g->dataMalloc();
|
|
||||||
i0->setData(generator);
|
|
||||||
i1->setData(generator);
|
|
||||||
runtime->run(g, true, true);
|
|
||||||
// check answer
|
|
||||||
EXPECT_TRUE(op->getOutput()->equalData(ans));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(ElementWise, NaiveCPU) {
|
|
||||||
test_element_wise<AddObj>(IncrementalGenerator(),
|
|
||||||
vector<uint32_t>{0, 2, 2, 4, 6, 8, 8, 10,
|
|
||||||
12, 14, 14, 16, 6, 8, 8, 10,
|
|
||||||
12, 14, 14, 16, 18, 20, 20, 22});
|
|
||||||
test_element_wise<SubObj>(
|
|
||||||
IncrementalGenerator(),
|
|
||||||
vector<uint32_t>{0, 0, 2, 2,
|
|
||||||
2, 2, 4, 4,
|
|
||||||
4, 4, 6, 6,
|
|
||||||
4294967290, 4294967290, 4294967292, 4294967292,
|
|
||||||
4294967292, 4294967292, 4294967294, 4294967294,
|
|
||||||
4294967294, 4294967294, 0, 0});
|
|
||||||
test_element_wise<MulObj>(
|
|
||||||
IncrementalGenerator(),
|
|
||||||
vector<uint32_t>{0, 1, 0, 3, 8, 15, 12, 21, 32, 45, 40, 55,
|
|
||||||
0, 7, 12, 21, 32, 45, 48, 63, 80, 99, 100, 121});
|
|
||||||
test_element_wise<DivObj>(OneGenerator(),
|
|
||||||
vector<uint32_t>{
|
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
||||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
void testElementWiseCudnn(
|
void testElementWiseCudnn(
|
||||||
const std::function<void(void *, size_t, DataType)> &generator,
|
const std::function<void(void *, size_t, DataType)> &generator) {
|
||||||
const Shape &shape, const ExpectOutput &ansVec) {
|
|
||||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||||
|
|
||||||
// Build input data on CPU
|
// Build input data on CPU
|
||||||
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
Tensor acpu = make_ref<TensorObj>(Shape{1, 1, 1, 4}, DataType::Float32, cpuRuntime);
|
||||||
acpu->dataMalloc();
|
acpu->dataMalloc();
|
||||||
acpu->setData(generator);
|
acpu->setData(generator);
|
||||||
|
|
||||||
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
Tensor bcpu = make_ref<TensorObj>(Shape{1, 1, 3, 1}, DataType::Float32, cpuRuntime);
|
||||||
bcpu->dataMalloc();
|
bcpu->dataMalloc();
|
||||||
bcpu->setData(generator);
|
bcpu->setData(generator);
|
||||||
|
|
||||||
|
@ -85,6 +41,7 @@ void testElementWiseCudnn(
|
||||||
auto a = g->cloneTensor(acpu);
|
auto a = g->cloneTensor(acpu);
|
||||||
auto b = g->cloneTensor(bcpu);
|
auto b = g->cloneTensor(bcpu);
|
||||||
auto op = g->addOp<T>(a, b, nullptr);
|
auto op = g->addOp<T>(a, b, nullptr);
|
||||||
|
auto bop = g->addOp<DivObj>(a, b, nullptr);
|
||||||
|
|
||||||
// allocate CUDA memory
|
// allocate CUDA memory
|
||||||
g->dataMalloc();
|
g->dataMalloc();
|
||||||
|
@ -94,29 +51,20 @@ void testElementWiseCudnn(
|
||||||
|
|
||||||
// clone CUDA output to CPU
|
// clone CUDA output to CPU
|
||||||
auto c = op->getOutput();
|
auto c = op->getOutput();
|
||||||
|
auto bcast = bop->getOutput();
|
||||||
auto ccpu = c->clone(cpuRuntime);
|
auto ccpu = c->clone(cpuRuntime);
|
||||||
|
acpu->printData();
|
||||||
|
bcpu->printData();
|
||||||
|
ccpu->printData();
|
||||||
|
auto bcastcpu = bcast->clone(cpuRuntime);
|
||||||
|
bcastcpu->printData();
|
||||||
// cudaPrintTensor(c);
|
// cudaPrintTensor(c);
|
||||||
// check results on CPU
|
// check results on CPU
|
||||||
EXPECT_TRUE(ccpu->equalData(ansVec));
|
EXPECT_TRUE(ccpu->equalData(bcastcpu));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ElementWise, CuDNN) {
|
TEST(ElementWise, CuDNN) {
|
||||||
testElementWiseCudnn<AddObj>(
|
testElementWiseCudnn<AddObj>(IncrementalGenerator());
|
||||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
|
||||||
ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
|
|
||||||
testElementWiseCudnn<SubObj>(
|
|
||||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
|
||||||
ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
|
|
||||||
testElementWiseCudnn<MulObj>(
|
|
||||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
|
||||||
ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
|
|
||||||
|
|
||||||
testElementWiseCudnn<DivObj>(
|
|
||||||
OneGenerator(), Shape{1, 2, 2, 3},
|
|
||||||
ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
|
|
||||||
|
|
||||||
testElementWiseCudnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
|
|
||||||
ExpectOutput{1, 1, 4, 27});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
Loading…
Reference in New Issue