forked from jiuyuan/InfiniTensor
Compare commits
1 Commits
master
...
broadcast_
Author | SHA1 | Date |
---|---|---|
wanghailu | 0fd8c4fbc5 |
|
@ -3,8 +3,12 @@
|
|||
#include "operators/element_wise.h"
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(float *a, float *b, float *c, int num);
|
||||
void pow_kernel(float *a, float *b, float *c, int num);
|
||||
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3);
|
||||
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3);
|
||||
|
||||
void element_wise_kernel(const Operator &_op) {
|
||||
auto op = as<ElementWiseObj>(_op);
|
||||
|
@ -12,12 +16,17 @@ void element_wise_kernel(const Operator &_op) {
|
|||
float *const bData = (op->getInputs(1)->getRawDataPtr<float *>());
|
||||
float *const cData = (op->getOutput()->getRawDataPtr<float *>());
|
||||
|
||||
auto dim = op->getInputs(0)->getDims();
|
||||
int n = dim[0], c = dim[1], h = dim[2], w = dim[3];
|
||||
auto aDim = op->getInputs(0)->getDims();
|
||||
auto bDim = op->getInputs(1)->getDims();
|
||||
auto cDim = op->getOutput()->getDims();
|
||||
if (op->getOpType() == OpType::Div)
|
||||
div_kernel(aData, bData, cData, n * c * h * w);
|
||||
div_kernel(aData, bData, cData, aDim[0], aDim[1], aDim[2], aDim[3],
|
||||
bDim[0], bDim[1], bDim[2], bDim[3],
|
||||
cDim[0], cDim[1], cDim[2], cDim[3]);
|
||||
else if (op->getOpType() == OpType::Pow)
|
||||
pow_kernel(aData, bData, cData, n * c * h * w);
|
||||
pow_kernel(aData, bData, cData, aDim[0], aDim[1], aDim[2], aDim[3],
|
||||
bDim[0], bDim[1], bDim[2], bDim[3],
|
||||
cDim[0], cDim[1], cDim[2], cDim[3]);
|
||||
else
|
||||
IT_TODO_HALT();
|
||||
}
|
||||
|
|
|
@ -5,34 +5,73 @@ constexpr unsigned int num_threads() { return 32 * 4; }
|
|||
constexpr int thread_work_size() { return 4; }
|
||||
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
|
||||
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int n) {
|
||||
__global__ void _div_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
z[i] = x[i] / y[i];
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
||||
int a0_index = c0_index % a0;
|
||||
int a1_index = c1_index % a1;
|
||||
int a2_index = c2_index % a2;
|
||||
int a3_index = c3_index % a3;
|
||||
|
||||
int b0_index = c0_index % b0;
|
||||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index] + y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int n) {
|
||||
__global__ void _pow_kernel(float *x, float *y, float *z, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
int n = c0 * c1 * c2 * c3;
|
||||
|
||||
for (int i = index; i < n; i += stride) {
|
||||
z[i] = pow(x[i], y[i]);
|
||||
int c0_index = i/ (c1 * c2 * c3);
|
||||
int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
|
||||
int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
|
||||
int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
|
||||
|
||||
int a0_index = c0_index % a0;
|
||||
int a1_index = c1_index % a1;
|
||||
int a2_index = c2_index % a2;
|
||||
int a3_index = c3_index % a3;
|
||||
|
||||
int b0_index = c0_index % b0;
|
||||
int b1_index = c1_index % b1;
|
||||
int b2_index = c2_index % b2;
|
||||
int b3_index = c3_index % b3;
|
||||
z[i] = pow(x[a0_index*a1*a2*a3 + a1_index*a2*a3 + a2_index*a3 + a3_index], y[b0_index*b1*b2*b3 + b1_index*b2*b3 + b2_index*b3 + b3_index]);
|
||||
}
|
||||
}
|
||||
|
||||
namespace infini {
|
||||
void div_kernel(float *a, float *b, float *c, int num) {
|
||||
void div_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_div_kernel<<<blocksize, gridsize>>>(a, b, c, num);
|
||||
int gridsize = (c0*c1*c2*c3 + block_work_size() - 1) / block_work_size();
|
||||
_div_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
}
|
||||
void pow_kernel(float *a, float *b, float *c, int num) {
|
||||
|
||||
void pow_kernel(float *a, float *b, float *c, int a0, int a1, int a2, int a3,
|
||||
int b0, int b1, int b2, int b3,
|
||||
int c0, int c1, int c2, int c3) {
|
||||
int blocksize = block_work_size();
|
||||
int gridsize = (num + block_work_size() - 1) / block_work_size();
|
||||
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, num);
|
||||
int gridsize = (c0*c1*c2*c3 + block_work_size() - 1) / block_work_size();
|
||||
_pow_kernel<<<blocksize, gridsize>>>(a, b, c, a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3);
|
||||
}
|
||||
|
||||
}; // namespace infini
|
|
@ -12,12 +12,10 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const {
|
|||
// For now,we only process the same dims here, broardcast will be considered
|
||||
// in the opt layer.
|
||||
const auto A = inputs[0], B = inputs[1];
|
||||
if (A->getDims().size() != B->getDims().size() ||
|
||||
A->getDims() != B->getDims())
|
||||
if (A->getDims().size() != B->getDims().size())
|
||||
return {};
|
||||
|
||||
return {{A->getDims()}};
|
||||
/*
|
||||
//return {{A->getDims()}};
|
||||
int n = A->getDims().size();
|
||||
Shape shape;
|
||||
for (int i = 0; i < n; i++) {
|
||||
|
@ -28,7 +26,7 @@ ElementWiseObj::inferShape(const TensorVec &inputs) const {
|
|||
auto dimI = dimA > dimB ? dimA : dimB;
|
||||
shape.emplace_back(dimI);
|
||||
}
|
||||
return {{shape}};*/
|
||||
return {{shape}};
|
||||
}
|
||||
|
||||
std::string ElementWiseObj::toString() const {
|
||||
|
|
|
@ -14,69 +14,25 @@ TEST(ElementWise, ShapeInference) {
|
|||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
{
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor i0 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
|
||||
Tensor i1 = g->addTensor({2, 3, 3, 4}, DataType::UInt32);
|
||||
Tensor i0 = g->addTensor({1, 4, 1, 6}, DataType::UInt32);
|
||||
Tensor i1 = g->addTensor({3, 1, 5, 1}, DataType::UInt32);
|
||||
auto op = g->addOp<AddObj>(i0, i1, nullptr);
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{2, 3, 3, 4}));
|
||||
EXPECT_EQ(op->getOutput()->getDims(), (Shape{3, 4, 5, 6}));
|
||||
}
|
||||
}
|
||||
/*
|
||||
template <typename T>
|
||||
void test_element_wise(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const vector<uint32_t> &ans) {
|
||||
Runtime runtime = CpuRuntimeObj::getInstance();
|
||||
Graph g = make_ref<GraphObj>(runtime);
|
||||
Tensor i0 = g->addTensor({1, 3, 2, 2}, DataType::UInt32);
|
||||
Tensor i1 = g->addTensor({2, 3, 1, 2}, DataType::UInt32);
|
||||
auto op = g->addOp<T>(i0, i1, nullptr);
|
||||
|
||||
g->dataMalloc();
|
||||
i0->setData(generator);
|
||||
i1->setData(generator);
|
||||
runtime->run(g, true, true);
|
||||
// check answer
|
||||
EXPECT_TRUE(op->getOutput()->equalData(ans));
|
||||
}
|
||||
|
||||
TEST(ElementWise, NaiveCPU) {
|
||||
test_element_wise<AddObj>(IncrementalGenerator(),
|
||||
vector<uint32_t>{0, 2, 2, 4, 6, 8, 8, 10,
|
||||
12, 14, 14, 16, 6, 8, 8, 10,
|
||||
12, 14, 14, 16, 18, 20, 20, 22});
|
||||
test_element_wise<SubObj>(
|
||||
IncrementalGenerator(),
|
||||
vector<uint32_t>{0, 0, 2, 2,
|
||||
2, 2, 4, 4,
|
||||
4, 4, 6, 6,
|
||||
4294967290, 4294967290, 4294967292, 4294967292,
|
||||
4294967292, 4294967292, 4294967294, 4294967294,
|
||||
4294967294, 4294967294, 0, 0});
|
||||
test_element_wise<MulObj>(
|
||||
IncrementalGenerator(),
|
||||
vector<uint32_t>{0, 1, 0, 3, 8, 15, 12, 21, 32, 45, 40, 55,
|
||||
0, 7, 12, 21, 32, 45, 48, 63, 80, 99, 100, 121});
|
||||
test_element_wise<DivObj>(OneGenerator(),
|
||||
vector<uint32_t>{
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
});
|
||||
}
|
||||
*/
|
||||
|
||||
template <class T>
|
||||
void testElementWiseCudnn(
|
||||
const std::function<void(void *, size_t, DataType)> &generator,
|
||||
const Shape &shape, const ExpectOutput &ansVec) {
|
||||
const std::function<void(void *, size_t, DataType)> &generator) {
|
||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
||||
|
||||
// Build input data on CPU
|
||||
Tensor acpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
Tensor acpu = make_ref<TensorObj>(Shape{1, 1, 1, 4}, DataType::Float32, cpuRuntime);
|
||||
acpu->dataMalloc();
|
||||
acpu->setData(generator);
|
||||
|
||||
Tensor bcpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
|
||||
Tensor bcpu = make_ref<TensorObj>(Shape{1, 1, 3, 1}, DataType::Float32, cpuRuntime);
|
||||
bcpu->dataMalloc();
|
||||
bcpu->setData(generator);
|
||||
|
||||
|
@ -85,6 +41,7 @@ void testElementWiseCudnn(
|
|||
auto a = g->cloneTensor(acpu);
|
||||
auto b = g->cloneTensor(bcpu);
|
||||
auto op = g->addOp<T>(a, b, nullptr);
|
||||
auto bop = g->addOp<DivObj>(a, b, nullptr);
|
||||
|
||||
// allocate CUDA memory
|
||||
g->dataMalloc();
|
||||
|
@ -94,29 +51,20 @@ void testElementWiseCudnn(
|
|||
|
||||
// clone CUDA output to CPU
|
||||
auto c = op->getOutput();
|
||||
auto bcast = bop->getOutput();
|
||||
auto ccpu = c->clone(cpuRuntime);
|
||||
acpu->printData();
|
||||
bcpu->printData();
|
||||
ccpu->printData();
|
||||
auto bcastcpu = bcast->clone(cpuRuntime);
|
||||
bcastcpu->printData();
|
||||
// cudaPrintTensor(c);
|
||||
// check results on CPU
|
||||
EXPECT_TRUE(ccpu->equalData(ansVec));
|
||||
EXPECT_TRUE(ccpu->equalData(bcastcpu));
|
||||
}
|
||||
|
||||
TEST(ElementWise, CuDNN) {
|
||||
testElementWiseCudnn<AddObj>(
|
||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
||||
ExpectOutput{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22});
|
||||
testElementWiseCudnn<SubObj>(
|
||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
||||
ExpectOutput{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
|
||||
testElementWiseCudnn<MulObj>(
|
||||
IncrementalGenerator(), Shape{1, 2, 2, 3},
|
||||
ExpectOutput{0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121});
|
||||
|
||||
testElementWiseCudnn<DivObj>(
|
||||
OneGenerator(), Shape{1, 2, 2, 3},
|
||||
ExpectOutput{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
|
||||
|
||||
testElementWiseCudnn<PowObj>(IncrementalGenerator(), Shape{1, 2, 2, 1},
|
||||
ExpectOutput{1, 1, 4, 27});
|
||||
testElementWiseCudnn<AddObj>(IncrementalGenerator());
|
||||
}
|
||||
|
||||
} // namespace infini
|
Loading…
Reference in New Issue