forked from jiuyuan/InfiniTensor
Simplify tensor transfer between CPU and CUDA (#10)
* Add: OP infers data type & Graph clones tensor * Fix: vecToString format * Add: static assert for Tensor methods * Rename: getDataRawPtr -> getRawDataPtr Co-authored-by: Liyan Zheng <liyan-zheng@outlook.com>
This commit is contained in:
parent
af08df32d2
commit
93f86d3f4d
|
@ -63,8 +63,9 @@ template <typename T> std::string vecToString(const std::vector<T> &vec) {
|
||||||
ret.append("[");
|
ret.append("[");
|
||||||
for (auto d : vec) {
|
for (auto d : vec) {
|
||||||
ret.append(std::to_string(d));
|
ret.append(std::to_string(d));
|
||||||
ret.append(", ");
|
ret.append(",");
|
||||||
}
|
}
|
||||||
|
if (!vec.empty())
|
||||||
ret.pop_back();
|
ret.pop_back();
|
||||||
ret.append("]");
|
ret.append("]");
|
||||||
return ret;
|
return ret;
|
||||||
|
|
|
@ -17,6 +17,12 @@ class GraphObj : public Object {
|
||||||
string toString() const override;
|
string toString() const override;
|
||||||
|
|
||||||
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
|
Tensor addTensor(Shape dim, DataType dtype = DataType::UInt32);
|
||||||
|
Tensor cloneTensor(const Tensor &tensor) {
|
||||||
|
auto ret = addTensor(tensor->getDims(), tensor->getDType());
|
||||||
|
ret->dataMalloc();
|
||||||
|
ret->copyData(tensor);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Add an operator and create its outputs. Output tensor arguments
|
* @brief Add an operator and create its outputs. Output tensor arguments
|
||||||
|
|
|
@ -138,6 +138,7 @@ class OperatorObj : public Object {
|
||||||
: type(opType), inputs(inputs), outputs(outputs) {}
|
: type(opType), inputs(inputs), outputs(outputs) {}
|
||||||
virtual optional<vector<Shape>>
|
virtual optional<vector<Shape>>
|
||||||
inferShape(const TensorVec &inputs) const = 0;
|
inferShape(const TensorVec &inputs) const = 0;
|
||||||
|
virtual vector<DataType> inferDataType(const TensorVec &inputs) const;
|
||||||
/**
|
/**
|
||||||
* @brief Constructs outputs (if required) and checks whether the operator is
|
* @brief Constructs outputs (if required) and checks whether the operator is
|
||||||
* valid.
|
* valid.
|
||||||
|
@ -180,6 +181,7 @@ class OperatorObj : public Object {
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
optional<vector<Shape>> inferShape() const;
|
optional<vector<Shape>> inferShape() const;
|
||||||
|
vector<DataType> inferDataType() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -24,7 +24,7 @@ class TensorObj : public TensorBaseObj {
|
||||||
size_t getOffset(const Shape &ds) const;
|
size_t getOffset(const Shape &ds) const;
|
||||||
using TensorBaseObj::getData;
|
using TensorBaseObj::getData;
|
||||||
VType getData(const Shape &pos) const;
|
VType getData(const Shape &pos) const;
|
||||||
void dataMalloc(const Runtime &runtime);
|
void dataMalloc();
|
||||||
|
|
||||||
template <typename T> void copyData(const T *dptr) {
|
template <typename T> void copyData(const T *dptr) {
|
||||||
IT_ASSERT(DataType::get<T>() == dtype);
|
IT_ASSERT(DataType::get<T>() == dtype);
|
||||||
|
@ -45,7 +45,8 @@ class TensorObj : public TensorBaseObj {
|
||||||
copyData(dataVector.data());
|
copyData(dataVector.data());
|
||||||
}
|
}
|
||||||
|
|
||||||
void copyData(const Tensor &src) { runtime->copyBlob(this, src.get()); }
|
void copyData(const TensorObj *src);
|
||||||
|
void copyData(const Tensor &src) { copyData(src.get()); }
|
||||||
void setData(
|
void setData(
|
||||||
const std::function<void(void *, size_t, DataType)> &generator) const {
|
const std::function<void(void *, size_t, DataType)> &generator) const {
|
||||||
generator(data->getPtr<void *>(), size(), dtype);
|
generator(data->getPtr<void *>(), size(), dtype);
|
||||||
|
@ -54,11 +55,33 @@ class TensorObj : public TensorBaseObj {
|
||||||
void printData() const;
|
void printData() const;
|
||||||
bool equalData(const Tensor &rhs) const;
|
bool equalData(const Tensor &rhs) const;
|
||||||
|
|
||||||
|
template <typename T> bool equalData(const vector<T> &dataVector) {
|
||||||
|
IT_ASSERT(DataType::get<T>() == dtype);
|
||||||
|
IT_ASSERT(size() == dataVector.size());
|
||||||
|
return equalDataImpl(getRawDataPtr<T *>(), dataVector.data(), size());
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void printDataFloat() const;
|
void printDataFloat() const;
|
||||||
void printDataUint32_t() const;
|
void printDataUint32_t() const;
|
||||||
template <typename T> bool equalDataInt(const Tensor &rhs) const;
|
|
||||||
template <typename T> bool equalDataFloat(const Tensor &rhs) const;
|
template <typename T>
|
||||||
|
bool equalDataImpl(const T *a, const T *b, size_t size) const {
|
||||||
|
for (size_t i = 0; i < size; ++i) {
|
||||||
|
if constexpr (std::is_integral_v<T>) {
|
||||||
|
if (a[i] != b[i])
|
||||||
|
return false;
|
||||||
|
} else if constexpr (std::is_floating_point_v<T>) {
|
||||||
|
if (fabs(a[i] - b[i]) / std::max(fabs(a[i]), fabs(b[i])) >
|
||||||
|
1e-6) {
|
||||||
|
printf("Error on %lu: %f %f\n", i, a[i], b[i]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
static_assert(!sizeof(T), "Unsupported data type");
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
// void setDims(const Dim &dms) { dims = dms; }
|
// void setDims(const Dim &dms) { dims = dms; }
|
||||||
|
|
||||||
// bool dataRand(int seed = 0) {
|
// bool dataRand(int seed = 0) {
|
||||||
|
|
|
@ -32,8 +32,10 @@ class TensorBaseObj : public Object {
|
||||||
IT_ASSERT(data == nullptr);
|
IT_ASSERT(data == nullptr);
|
||||||
data = blob;
|
data = blob;
|
||||||
}
|
}
|
||||||
Blob getDataPtr() const { return data; }
|
Blob getDataBlob() const { return data; }
|
||||||
template <typename T> T getDataRawPtr() const {
|
template <typename T> T getRawDataPtr() const {
|
||||||
|
static_assert(std::is_pointer_v<T>,
|
||||||
|
"Raw data pointer has a type of pointer");
|
||||||
IT_ASSERT(data != nullptr);
|
IT_ASSERT(data != nullptr);
|
||||||
return data->getPtr<T>();
|
return data->getPtr<T>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ namespace infini {
|
||||||
void cudaPrintFloat(float *x, int len);
|
void cudaPrintFloat(float *x, int len);
|
||||||
|
|
||||||
void cudaPrintTensor(const Tensor &tensor) {
|
void cudaPrintTensor(const Tensor &tensor) {
|
||||||
cudaPrintFloat(tensor->getDataRawPtr<float *>(), tensor->size());
|
cudaPrintFloat(tensor->getRawDataPtr<float *>(), tensor->size());
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
|
@ -36,7 +36,7 @@ class ConvObj : public OperatorObj {
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||||
|
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
int numInputs() const override { return 3; }
|
int numInputs() const override { return 2; }
|
||||||
int numOutputs() const override { return 1; }
|
int numOutputs() const override { return 1; }
|
||||||
|
|
||||||
Tensor getBias() const { return inputs[2]; }
|
Tensor getBias() const { return inputs[2]; }
|
||||||
|
|
|
@ -33,7 +33,7 @@ class MatmulObj : public OperatorObj {
|
||||||
std::string toString() const override;
|
std::string toString() const override;
|
||||||
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
|
||||||
|
|
||||||
int numInputs() const override { return 3; }
|
int numInputs() const override { return 2; }
|
||||||
int numOutputs() const override { return 1; }
|
int numOutputs() const override { return 1; }
|
||||||
|
|
||||||
Tensor getBias() const { return inputs[2]; }
|
Tensor getBias() const { return inputs[2]; }
|
||||||
|
|
|
@ -6,6 +6,10 @@ void GraphObj::updateConnection() { IT_TODO_HALT(); }
|
||||||
|
|
||||||
string GraphObj::toString() const {
|
string GraphObj::toString() const {
|
||||||
std::ostringstream oss;
|
std::ostringstream oss;
|
||||||
|
oss << "Graph Tensors:\n";
|
||||||
|
for (const auto &tensor : tensors)
|
||||||
|
oss << tensor << "\n";
|
||||||
|
|
||||||
oss << "Graph operators:\n";
|
oss << "Graph operators:\n";
|
||||||
for (const auto &op : ops)
|
for (const auto &op : ops)
|
||||||
oss << op << "\n";
|
oss << op << "\n";
|
||||||
|
@ -14,7 +18,7 @@ string GraphObj::toString() const {
|
||||||
|
|
||||||
void GraphObj::dataMalloc() {
|
void GraphObj::dataMalloc() {
|
||||||
for (auto &tensor : tensors) {
|
for (auto &tensor : tensors) {
|
||||||
tensor->dataMalloc(runtime);
|
tensor->dataMalloc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -57,9 +57,10 @@ bool OperatorObj::checkValid(GraphObj *graph) {
|
||||||
if (shapes.size() != outputs.size())
|
if (shapes.size() != outputs.size())
|
||||||
return false;
|
return false;
|
||||||
if (graph) { // if graph != nullptr, outputs should be created
|
if (graph) { // if graph != nullptr, outputs should be created
|
||||||
|
auto dataTypes = inferDataType();
|
||||||
for (size_t i = 0; i < outputs.size(); i++) {
|
for (size_t i = 0; i < outputs.size(); i++) {
|
||||||
IT_ASSERT(!outputs[i]);
|
IT_ASSERT(!outputs[i]);
|
||||||
outputs[i] = graph->addTensor(shapes[i]);
|
outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
|
||||||
}
|
}
|
||||||
} else { // if graph is nullptr, check outputs match inferred shapes
|
} else { // if graph is nullptr, check outputs match inferred shapes
|
||||||
for (size_t i = 0; i < shapes.size(); ++i) {
|
for (size_t i = 0; i < shapes.size(); ++i) {
|
||||||
|
@ -74,4 +75,15 @@ optional<vector<Shape>> OperatorObj::inferShape() const {
|
||||||
return inferShape(inputs);
|
return inferShape(inputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vector<DataType> OperatorObj::inferDataType(const TensorVec &inputs) const {
|
||||||
|
auto dataType = inputs[0]->getDType();
|
||||||
|
for (const auto &tensor : inputs)
|
||||||
|
IT_ASSERT(dataType == tensor->getDType());
|
||||||
|
return vector(numOutputs(), dataType);
|
||||||
|
}
|
||||||
|
|
||||||
|
vector<DataType> OperatorObj::inferDataType() const {
|
||||||
|
return inferDataType(inputs);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
|
@ -116,8 +116,8 @@ Blob RuntimeObj::allocBlob(size_t size) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
|
void RuntimeObj::copyBlob(const TensorObj *dst, const TensorObj *src) const {
|
||||||
void *dstPtr = dst->getDataRawPtr<void *>();
|
void *dstPtr = dst->getRawDataPtr<void *>();
|
||||||
void *srcPtr = src->getDataRawPtr<void *>();
|
void *srcPtr = src->getRawDataPtr<void *>();
|
||||||
size_t bytes = dst->getBytes();
|
size_t bytes = dst->getBytes();
|
||||||
auto dstRuntime = dst->getRuntime();
|
auto dstRuntime = dst->getRuntime();
|
||||||
auto srcRuntime = src->getRuntime();
|
auto srcRuntime = src->getRuntime();
|
||||||
|
|
|
@ -11,7 +11,9 @@ VType TensorObj::getData(const Shape &pos) const {
|
||||||
return getData(getOffset(pos));
|
return getData(getOffset(pos));
|
||||||
}
|
}
|
||||||
|
|
||||||
string TensorObj::toString() const { return "Tensor " + std::to_string(guid); }
|
string TensorObj::toString() const {
|
||||||
|
return "Tensor " + std::to_string(guid) + " shape " + vecToString(shape);
|
||||||
|
}
|
||||||
|
|
||||||
size_t TensorObj::getOffset(const Shape &pos) const {
|
size_t TensorObj::getOffset(const Shape &pos) const {
|
||||||
auto nDim = pos.size();
|
auto nDim = pos.size();
|
||||||
|
@ -103,50 +105,28 @@ void TensorObj::printDataUint32_t() const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> bool TensorObj::equalDataInt(const Tensor &rhs) const {
|
|
||||||
auto ptr = data->getPtr<uint32_t *>();
|
|
||||||
auto ptrRhs = rhs->data->getPtr<uint32_t *>();
|
|
||||||
if (shape != rhs->getDims())
|
|
||||||
return false;
|
|
||||||
size_t sz = size();
|
|
||||||
for (size_t i = 0; i < sz; ++i)
|
|
||||||
if (ptr[i] != ptrRhs[i])
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T> bool TensorObj::equalDataFloat(const Tensor &rhs) const {
|
|
||||||
IT_ASSERT(data != nullptr);
|
|
||||||
IT_ASSERT(rhs->data != nullptr);
|
|
||||||
// TODO: deal with data type
|
|
||||||
auto ptr = data->getPtr<T *>();
|
|
||||||
auto ptrRhs = rhs->data->getPtr<T *>();
|
|
||||||
if (shape != rhs->getDims())
|
|
||||||
return false;
|
|
||||||
size_t sz = size();
|
|
||||||
for (size_t i = 0; i < sz; ++i)
|
|
||||||
if (fabs(ptr[i] - ptrRhs[i]) / std::max(fabs(ptr[i]), fabs(ptrRhs[i])) >
|
|
||||||
1e-6) {
|
|
||||||
printf("Error on %lu: %f %f\n", i, ptr[i], ptrRhs[i]);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TensorObj::equalData(const Tensor &rhs) const {
|
bool TensorObj::equalData(const Tensor &rhs) const {
|
||||||
IT_ASSERT(data != nullptr);
|
IT_ASSERT(data != nullptr);
|
||||||
IT_ASSERT(rhs->data != nullptr);
|
IT_ASSERT(rhs->data != nullptr);
|
||||||
IT_ASSERT(getDType() == rhs->getDType());
|
IT_ASSERT(getDType() == rhs->getDType());
|
||||||
|
IT_ASSERT(runtime->isCpu());
|
||||||
|
IT_ASSERT(rhs->getRuntime()->isCpu());
|
||||||
|
if (shape != rhs->getDims())
|
||||||
|
return false;
|
||||||
if (getDType() == DataType::UInt32)
|
if (getDType() == DataType::UInt32)
|
||||||
return equalDataInt<uint32_t>(rhs);
|
return equalDataImpl(getRawDataPtr<uint32_t *>(),
|
||||||
|
rhs->getRawDataPtr<uint32_t *>(), size());
|
||||||
else if (getDType() == DataType::Float32)
|
else if (getDType() == DataType::Float32)
|
||||||
return equalDataInt<float>(rhs);
|
return equalDataImpl(getRawDataPtr<float *>(),
|
||||||
|
rhs->getRawDataPtr<float *>(), size());
|
||||||
else
|
else
|
||||||
IT_TODO_HALT();
|
IT_TODO_HALT();
|
||||||
}
|
}
|
||||||
|
|
||||||
void TensorObj::dataMalloc(const Runtime &runtime) {
|
void TensorObj::dataMalloc() {
|
||||||
IT_ASSERT(data == nullptr);
|
if (data != nullptr)
|
||||||
|
return;
|
||||||
|
// IT_ASSERT(data == nullptr);
|
||||||
size_t bytesPerElement;
|
size_t bytesPerElement;
|
||||||
if (getDType() == DataType::Float32)
|
if (getDType() == DataType::Float32)
|
||||||
bytesPerElement = sizeof(float);
|
bytesPerElement = sizeof(float);
|
||||||
|
@ -155,4 +135,10 @@ void TensorObj::dataMalloc(const Runtime &runtime) {
|
||||||
data = runtime->allocBlob(size() * bytesPerElement);
|
data = runtime->allocBlob(size() * bytesPerElement);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void TensorObj::copyData(const TensorObj *src) {
|
||||||
|
IT_ASSERT(dtype == src->getDType());
|
||||||
|
IT_ASSERT(size() == src->size());
|
||||||
|
runtime->copyBlob(this, src);
|
||||||
|
}
|
||||||
|
|
||||||
}; // namespace infini
|
}; // namespace infini
|
|
@ -7,9 +7,9 @@ template <typename T> class NaiveConv : public Kernel {
|
||||||
void compute(const Operator &_op, const PerfRecord &record,
|
void compute(const Operator &_op, const PerfRecord &record,
|
||||||
const RuntimeObj *context) const override {
|
const RuntimeObj *context) const override {
|
||||||
auto op = as<ConvObj>(_op);
|
auto op = as<ConvObj>(_op);
|
||||||
T *iptr = op->getInputs(0)->getDataRawPtr<T *>();
|
T *iptr = op->getInputs(0)->getRawDataPtr<T *>();
|
||||||
T *wptr = op->getInputs(1)->getDataRawPtr<T *>();
|
T *wptr = op->getInputs(1)->getRawDataPtr<T *>();
|
||||||
T *optr = op->getOutput()->getDataRawPtr<T *>();
|
T *optr = op->getOutput()->getRawDataPtr<T *>();
|
||||||
auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||||
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
auto [ph, pw, sh, sw, dh, dw] = op->getPadStrideDilation();
|
||||||
int cpg = op->getChannelPerGroup();
|
int cpg = op->getChannelPerGroup();
|
||||||
|
|
|
@ -7,9 +7,10 @@ template <typename T> class NaiveMatmul : public Kernel {
|
||||||
void compute(const Operator &_op, const PerfRecord &record,
|
void compute(const Operator &_op, const PerfRecord &record,
|
||||||
const RuntimeObj *context) const override {
|
const RuntimeObj *context) const override {
|
||||||
auto op = as<MatmulObj>(_op);
|
auto op = as<MatmulObj>(_op);
|
||||||
T *A = op->getInputs(0)->getDataRawPtr<T *>();
|
IT_ASSERT(op->getInputs().size() == 2, "Bias is not supported yet.");
|
||||||
T *B = op->getInputs(1)->getDataRawPtr<T *>();
|
T *A = op->getInputs(0)->getRawDataPtr<T *>();
|
||||||
T *C = op->getOutput()->getDataRawPtr<T *>();
|
T *B = op->getInputs(1)->getRawDataPtr<T *>();
|
||||||
|
T *C = op->getOutput()->getRawDataPtr<T *>();
|
||||||
IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
|
IT_ASSERT(op->getTransA() == false && op->getTransB() == false);
|
||||||
IT_ASSERT(op->getAct() == ActType::None);
|
IT_ASSERT(op->getAct() == ActType::None);
|
||||||
IT_ASSERT(op->getB() == 1);
|
IT_ASSERT(op->getB() == 1);
|
||||||
|
|
|
@ -26,12 +26,12 @@ class convCudnn : public Kernel {
|
||||||
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
|
bool cuDNNUnfused(const Ref<ConvObj> &op, const ConvCuDnnPerfRecord &record,
|
||||||
const CudaRuntimeObj *context) const {
|
const CudaRuntimeObj *context) const {
|
||||||
cudnnStatus_t stat;
|
cudnnStatus_t stat;
|
||||||
void *const inData = (op->getInputs(0)->getDataRawPtr<void *>());
|
void *const inData = (op->getInputs(0)->getRawDataPtr<void *>());
|
||||||
void *const knData = (op->getInputs(1)->getDataRawPtr<void *>());
|
void *const knData = (op->getInputs(1)->getRawDataPtr<void *>());
|
||||||
if (op->getInputs(2) != nullptr)
|
if (op->getInputs().size() > 2) // Bias is not supported yet
|
||||||
IT_TODO_HALT();
|
IT_TODO_HALT();
|
||||||
// void *const biasData = (op->getInputs(2)->getDataRawPtr<void *>());
|
// void *const biasData = (op->getInputs(2)->getRawDataPtr<void *>());
|
||||||
void *const outData = (op->getOutput()->getDataRawPtr<void *>());
|
void *const outData = (op->getOutput()->getRawDataPtr<void *>());
|
||||||
|
|
||||||
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
const auto [n, c, h, w, f, r, s] = op->getNCHWFRS();
|
||||||
const int cpg = op->getChannelPerGroup();
|
const int cpg = op->getChannelPerGroup();
|
||||||
|
|
|
@ -3,20 +3,19 @@
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||||
int ph, int pw, int sh, int sw, int dh, int dw, Tensor bias,
|
int ph, int pw, int sh, int sw, int dh, int dw,
|
||||||
ActType act)
|
[[maybe_unused]] Tensor bias, ActType act)
|
||||||
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(ph),
|
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(ph), pw(pw),
|
||||||
pw(pw), sh(sh), sw(sw), dh(dh), dw(dw), act(act),
|
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(PaddingMode::Other) {
|
||||||
padding(PaddingMode::Other) {
|
|
||||||
setAuxilaryAttributes(PaddingMode::Other);
|
setAuxilaryAttributes(PaddingMode::Other);
|
||||||
IT_ASSERT(checkValid(graph));
|
IT_ASSERT(checkValid(graph));
|
||||||
}
|
}
|
||||||
|
|
||||||
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
ConvObj::ConvObj(GraphObj *graph, Tensor input, Tensor weight, Tensor output,
|
||||||
PaddingMode mode, int sh, int sw, int dh, int dw, Tensor bias,
|
PaddingMode mode, int sh, int sw, int dh, int dw,
|
||||||
ActType act)
|
[[maybe_unused]] Tensor bias, ActType act)
|
||||||
: OperatorObj(OpType::Conv, {input, weight, bias}, {output}), ph(-1),
|
: OperatorObj(OpType::Conv, {input, weight}, {output}), ph(-1), pw(-1),
|
||||||
pw(-1), sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
|
sh(sh), sw(sw), dh(dh), dw(dw), act(act), padding(mode) {
|
||||||
IT_ASSERT(mode != PaddingMode::Other);
|
IT_ASSERT(mode != PaddingMode::Other);
|
||||||
setAuxilaryAttributes(mode);
|
setAuxilaryAttributes(mode);
|
||||||
IT_ASSERT(checkValid(graph));
|
IT_ASSERT(checkValid(graph));
|
||||||
|
|
|
@ -3,9 +3,9 @@
|
||||||
namespace infini {
|
namespace infini {
|
||||||
|
|
||||||
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
|
MatmulObj::MatmulObj(GraphObj *graph, Tensor A, Tensor B, Tensor C, bool transA,
|
||||||
bool transB, Tensor bias, ActType act)
|
bool transB, [[maybe_unused]] Tensor bias, ActType act)
|
||||||
: OperatorObj(OpType::Matmul, {A, B, bias}, {C}), transA(transA),
|
: OperatorObj(OpType::Matmul, {A, B}, {C}), transA(transA), transB(transB),
|
||||||
transB(transB), act(act), b(A->getDims()[0]),
|
act(act), b(A->getDims()[0]),
|
||||||
m(transA ? A->getDims()[2] : A->getDims()[1]),
|
m(transA ? A->getDims()[2] : A->getDims()[1]),
|
||||||
n(transB ? B->getDims()[1] : B->getDims()[2]),
|
n(transB ? B->getDims()[1] : B->getDims()[2]),
|
||||||
k(transA ? A->getDims()[1] : A->getDims()[2]) {
|
k(transA ? A->getDims()[1] : A->getDims()[2]) {
|
||||||
|
|
|
@ -19,7 +19,7 @@ TEST(Graph, build_and_run) {
|
||||||
runtime->run(g);
|
runtime->run(g);
|
||||||
// check answer
|
// check answer
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc();
|
||||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||||
EXPECT_TRUE(o0->equalData(ans));
|
EXPECT_TRUE(o0->equalData(ans));
|
||||||
}
|
}
|
||||||
|
@ -41,7 +41,7 @@ TEST(Graph, perf_engine) {
|
||||||
EXPECT_LT(perfTime, 0.01);
|
EXPECT_LT(perfTime, 0.01);
|
||||||
// check answer
|
// check answer
|
||||||
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
auto ans = make_ref<TensorObj>(Shape{1, 2, 4}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc();
|
||||||
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
ans->copyData(vector<uint32_t>{38, 44, 50, 56, 83, 98, 113, 128});
|
||||||
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
EXPECT_TRUE(matmul->getOutput()->equalData(ans));
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,7 +60,7 @@ TEST(Conv, NaiveCPU) {
|
||||||
// check answer
|
// check answer
|
||||||
auto ans =
|
auto ans =
|
||||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
|
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::UInt32, runtime);
|
||||||
ans->dataMalloc(runtime);
|
ans->dataMalloc();
|
||||||
ans->copyData(
|
ans->copyData(
|
||||||
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
vector<uint32_t>{4794, 4386, 8199, 7506, 11274, 10542, 20835, 19656});
|
||||||
EXPECT_TRUE(conv->getOutput()->equalData(ans));
|
EXPECT_TRUE(conv->getOutput()->equalData(ans));
|
||||||
|
@ -69,52 +69,35 @@ TEST(Conv, NaiveCPU) {
|
||||||
void testConvCudnn(
|
void testConvCudnn(
|
||||||
const std::function<void(void *, size_t, DataType)> &generator,
|
const std::function<void(void *, size_t, DataType)> &generator,
|
||||||
vector<float> ansVec) {
|
vector<float> ansVec) {
|
||||||
Runtime cpuRuntime = CpuRuntimeObj::getInstance();
|
// Construct Runtime and graph for CPU and CUDA
|
||||||
auto cudaRuntime = make_ref<CudaRuntimeObj>();
|
Runtime cpu = CpuRuntimeObj::getInstance(); // CPUruntime is singleton
|
||||||
|
Graph gCpu = make_ref<GraphObj>(cpu);
|
||||||
|
Runtime cuda = make_ref<CudaRuntimeObj>();
|
||||||
|
Graph gCuda = make_ref<GraphObj>(cuda);
|
||||||
|
// Set input data on CPU in a CPU Graph
|
||||||
|
Tensor i0Cpu = gCpu->addTensor({1, 3, 4, 4}, DataType::Float32);
|
||||||
|
Tensor w0Cpu = gCpu->addTensor({2, 3, 3, 3}, DataType::Float32);
|
||||||
|
// Malloc data for all tensors in a graph. Do we need implicit allocation?
|
||||||
|
gCpu->dataMalloc();
|
||||||
|
i0Cpu->setData(generator);
|
||||||
|
w0Cpu->setData(generator);
|
||||||
|
|
||||||
|
// Copy input tensors from CPU to CUDA
|
||||||
|
Tensor i0Cuda = gCuda->cloneTensor(i0Cpu);
|
||||||
|
Tensor w0Cuda = gCuda->cloneTensor(w0Cpu);
|
||||||
// Build CUDA graph
|
// Build CUDA graph
|
||||||
Graph g = make_ref<GraphObj>(cudaRuntime);
|
auto conv =
|
||||||
Tensor i0 = g->addTensor({1, 3, 4, 4}, DataType::Float32);
|
gCuda->addOp<ConvObj>(i0Cuda, w0Cuda, nullptr, 1, 1, 2, 1, 1, 2);
|
||||||
Tensor w0 = g->addTensor({2, 3, 3, 3}, DataType::Float32);
|
|
||||||
auto conv = g->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 2, 1, 1, 2);
|
|
||||||
|
|
||||||
// allocate CUDA memory
|
// allocate CUDA memory
|
||||||
g->dataMalloc();
|
gCuda->dataMalloc();
|
||||||
|
|
||||||
// Build input and output data on CPU
|
|
||||||
auto cpui0 =
|
|
||||||
make_ref<TensorObj>(Shape{1, 3, 4, 4}, DataType::Float32, cpuRuntime);
|
|
||||||
cpui0->dataMalloc(cpuRuntime);
|
|
||||||
cpui0->setData(generator);
|
|
||||||
|
|
||||||
auto cpuw0 =
|
|
||||||
make_ref<TensorObj>(Shape{2, 3, 3, 3}, DataType::Float32, cpuRuntime);
|
|
||||||
cpuw0->dataMalloc(cpuRuntime);
|
|
||||||
cpuw0->setData(generator);
|
|
||||||
|
|
||||||
auto ans =
|
|
||||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
|
||||||
ans->dataMalloc(cpuRuntime);
|
|
||||||
ans->copyData(ansVec);
|
|
||||||
|
|
||||||
// Copy inputs from CPU to CUDA
|
|
||||||
i0->copyData(cpui0);
|
|
||||||
w0->copyData(cpuw0);
|
|
||||||
// Execute on CUDA
|
// Execute on CUDA
|
||||||
cudaRuntime->run(g);
|
cuda->run(gCuda);
|
||||||
// double perfTime = cudaRuntime->getPerfTime(g);
|
// copy output from CUDA to CPU
|
||||||
// // The example Conv takes 0.015ms with one core
|
auto o0Cpu = gCpu->cloneTensor(conv->getOutput());
|
||||||
// EXPECT_GT(perfTime, 0);
|
|
||||||
// EXPECT_LT(perfTime, 0.1);
|
|
||||||
|
|
||||||
// copy CUDA output to CPU
|
|
||||||
auto o0 = conv->getOutput();
|
|
||||||
auto cpuo0 =
|
|
||||||
make_ref<TensorObj>(Shape{1, 2, 2, 2}, DataType::Float32, cpuRuntime);
|
|
||||||
cpuo0->dataMalloc(cpuRuntime);
|
|
||||||
cpuo0->copyData(o0);
|
|
||||||
|
|
||||||
// check results on CPU
|
// check results on CPU
|
||||||
EXPECT_TRUE(cpuo0->equalData(ans));
|
EXPECT_TRUE(o0Cpu->equalData(ansVec));
|
||||||
|
// print a tensor/operator/graph by print()
|
||||||
|
gCuda->print();
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Conv, cuDNN) {
|
TEST(Conv, cuDNN) {
|
||||||
|
|
Loading…
Reference in New Issue