diff --git a/include/core/data_type.h b/include/core/data_type.h
index 654ce1ce..9cf911b9 100644
--- a/include/core/data_type.h
+++ b/include/core/data_type.h
@@ -6,8 +6,9 @@ class DataType {
   public:
     static const DataType Float32;
     static const DataType UInt32;
-    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t)};
-    static constexpr std::string_view names[]{"Float32", "UInt32"};
+    static const DataType Int32;
+    static constexpr size_t sizePerElement[]{sizeof(float), sizeof(uint32_t), sizeof(int32_t)};
+    static constexpr std::string_view names[]{"Float32", "UInt32", "Int32"};
 
   private:
     int index;
@@ -29,9 +30,11 @@ class DataType {
 
 inline const DataType DataType::Float32(0);
 inline const DataType DataType::UInt32(1);
+inline const DataType DataType::Int32(2);
 // Method definitions are out of the declaration due to GCC bug:
 // https://stackoverflow.com/questions/49707184/explicit-specialization-in-non-namespace-scope-does-not-compile-in-gcc
 template <> inline DataType DataType::get<float>() { return Float32; }
 template <> inline DataType DataType::get<uint32_t>() { return UInt32; }
+template <> inline DataType DataType::get<int32_t>() { return Int32; }
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/include/core/operator.h b/include/core/operator.h
index fae0ae5d..b8830264 100644
--- a/include/core/operator.h
+++ b/include/core/operator.h
@@ -81,6 +81,7 @@ enum class OpType {
     Transform,
     AddN,
     MulN,
+    Cast,
     //
     MemBound = 300,
 };
@@ -170,6 +171,7 @@ class OpRegistry {
         FOP(Transform);
         FOP(AddN);
         FOP(MulN);
+        FOP(Cast);
         //
         FOP(MemBound);
     default:
@@ -251,6 +253,7 @@ class OperatorObj : public Object {
      * function.
      */
     bool checkValid(GraphObj *graph);
+    bool checkValid(GraphObj *graph, DataType type);
     OpPerfKey getOpPerfKey() const;
     /**
      * @brief Hash operator attributes. Input and output shapes are not
diff --git a/include/core/tensor.h b/include/core/tensor.h
index cab503b8..1a998e80 100644
--- a/include/core/tensor.h
+++ b/include/core/tensor.h
@@ -72,6 +72,7 @@ class TensorObj : public TensorBaseObj {
   private:
     void printDataFloat() const;
     void printDataUint32_t() const;
+    void printDataInt32_t() const;
 
     template <typename T>
     bool equalDataImpl(const T *a, const T *b, size_t size) const {
diff --git a/include/operators/unary.h b/include/operators/unary.h
index a6935c66..2bf420e6 100644
--- a/include/operators/unary.h
+++ b/include/operators/unary.h
@@ -80,6 +80,34 @@ class TransformObj : public OperatorObj {
     vector<int> getOpAttrVector() const override;
 };
 
+class CastObj : public OperatorObj {
+  public:
+    enum CastType { Float2Half = 0, Float2HalfIEEE754, Float2Double, Float2Int64, Float2Int32, Float2Int16, Float2Int8, Float2Bool,
+                    Half2Float, Half2Int32, Half2Int64, Half2Int16, Half2Int8, Half2Uint8, Half2Bool, Half2FloatInf,
+                    Int322Float, Int322Half, Int322Int8, Int322Int16,
+                    Int162Float, Int162Half, Int162Int32,
+                    Int82Float, Int82Half, Int82Int16, Int82Int32,
+                    Uint82Float, Uint82Half, Uint82Int32, Uint82Int64,
+                    Bool2Float, Bool2Half, Bool2Int32,
+                    Int322Int64, Int322Bool,
+                    Int642Int32, Int642Uint32, Int642Float, Int642Half,
+                    Uint642Uint32,
+                    Uint322Int64, Uint322Uint64,
+                    Double2Float};
+    CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type);
+    optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
+
+    std::string toString() const override;
+    CastType getType() const { return castType; }
+    int numInputs() const override { return 1; }
+    int numOutputs() const override { return 1; }
+
+  private:
+    CastType castType;
+    vector<int> getWorkloadVector() const override;
+    vector<int> getOpAttrVector() const override;
+};
+
 #define DEFINE_UNARY_OBJ(prefix, type)                                         \
     class prefix##Obj : public UnaryObj {                                      \
       public:                                                                  \
diff --git a/src/core/graph.cc b/src/core/graph.cc
index 9b40d58d..00283348 100644
--- a/src/core/graph.cc
+++ b/src/core/graph.cc
@@ -61,4 +61,4 @@ OpVec GraphObj::getComputeOps() const {
     return opList;
 };
 
-} // namespace infini
\ No newline at end of file
+} // namespace infini
diff --git a/src/core/operator.cc b/src/core/operator.cc
index 0d3a816f..db8b6573 100644
--- a/src/core/operator.cc
+++ b/src/core/operator.cc
@@ -82,6 +82,29 @@ bool OperatorObj::checkValid(GraphObj *graph) {
     return true;
 }
 
+bool OperatorObj::checkValid(GraphObj *graph, DataType type) {
+    auto optShapes = inferShape();
+    if (!optShapes) // shape inference failed
+        return false;
+
+    const vector<Shape> &shapes = *optShapes;
+    if (shapes.size() != outputs.size())
+        return false;
+    if (graph) { // if graph != nullptr, outputs should be created
+        auto dataTypes = vector(numOutputs(), type);
+        for (size_t i = 0; i < outputs.size(); i++) {
+            IT_ASSERT(!outputs[i]);
+            outputs[i] = graph->addTensor(shapes[i], dataTypes[i]);
+        }
+    } else { // if outputs have been created, check their shapes
+        for (size_t i = 0; i < shapes.size(); ++i) {
+            if (shapes[i] != outputs[i]->getDims())
+                return false;
+        }
+    }
+    return true;
+}
+
 optional<vector<Shape>> OperatorObj::inferShape() const {
     return inferShape(inputs);
 }
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index 6ea7e7bb..dcc52902 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -69,6 +69,8 @@ void TensorObj::printData() const {
         printDataFloat();
     else if (dtype == DataType::UInt32)
         printDataUint32_t();
+    else if (dtype == DataType::Int32)
+        printDataInt32_t();
     else
         IT_TODO_HALT();
 }
@@ -128,6 +130,34 @@ void TensorObj::printDataUint32_t() const {
     }
 }
 
+void TensorObj::printDataInt32_t() const {
+    IT_ASSERT(data != nullptr);
+    std::cout << "Tensor: " << guid << std::endl;
+    auto numDims = shape.size();
+    auto dimSzVec = std::vector(numDims, 1);
+    auto ptr = data->getPtr<int32_t *>();
+    dimSzVec[numDims - 1] = shape[numDims - 1];
+    for (int i = numDims - 1; i != 0; --i)
+        dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
+    for (size_t i = 0, iEnd = size(); i < iEnd; ++i) {
+        for (size_t j = 0; j < numDims; ++j) {
+            if (i % dimSzVec[j] == 0) {
+                std::cout << "[";
+            }
+        }
+        std::cout << ptr[i];
+        for (size_t j = 0; j < numDims; ++j) {
+            if ((int)i % dimSzVec[j] == dimSzVec[j] - 1) {
+                std::cout << "]";
+            }
+        }
+        if (i != size() - 1)
+            std::cout << ", ";
+        if ((int)i % dimSzVec[numDims - 1] == dimSzVec[numDims - 1] - 1)
+            std::cout << std::endl;
+    }
+}
+
 bool TensorObj::equalData(const Tensor &rhs) const {
     IT_ASSERT(data != nullptr);
     IT_ASSERT(rhs->data != nullptr);
@@ -142,6 +172,9 @@ bool TensorObj::equalData(const Tensor &rhs) const {
     else if (getDType() == DataType::Float32)
         return equalDataImpl(getRawDataPtr<float *>(),
                              rhs->getRawDataPtr<float *>(), size());
+    else if (getDType() == DataType::Int32)
+        return equalDataImpl(getRawDataPtr<int32_t *>(),
+                             rhs->getRawDataPtr<int32_t *>(), size());
     else
         IT_TODO_HALT();
 }
@@ -155,6 +188,8 @@ void TensorObj::dataMalloc() {
         bytesPerElement = sizeof(float);
     else if (getDType() == DataType::UInt32)
         bytesPerElement = sizeof(uint32_t);
+    else if (getDType() == DataType::Int32)
+        bytesPerElement = sizeof(int32_t);
     data = runtime->allocBlob(size() * bytesPerElement);
 }
diff --git a/src/kernels/bang/cast.cc b/src/kernels/bang/cast.cc
new file mode 100644
index 00000000..cc46f90b
--- /dev/null
+++ b/src/kernels/bang/cast.cc
@@ -0,0 +1,116 @@
+#include "bang/bang_kernel_without_config.h"
+#include "bang/bang_runtime.h"
+#include "operators/unary.h"
+
+namespace infini {
+class CastCnnl : public BangKernelWithoutConfig {
+    void compute(const Operator &_op,
+                 const RuntimeObj *_context) const override {
+        auto op = as<CastObj>(_op);
+        auto context = dynamic_cast<const BangRuntimeObj *>(_context);
+
+        void *const aData = (op->getInputs(0)->getRawDataPtr<void *>());
+        void *const cData = (op->getOutput()->getRawDataPtr<void *>());
+
+        cnnlTensorDescriptor_t aDesc, cDesc;
+        auto dim = op->getInputs(0)->getDims();
+        if (dim.size() != 4)
+            IT_TODO_HALT();
+
+        int dim_array[4] = {dim[0], dim[1], dim[2], dim[3]};
+        // get inputs
+        checkCnnlError(cnnlCreateTensorDescriptor(&aDesc));
+        checkCnnlError(cnnlCreateTensorDescriptor(&cDesc));
+        cnnlCastDataType_t NlCastType;
+        CastObj::CastType type = op->getType();
+        switch (type) {
+        case CastObj::Float2Half:
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_HALF, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
+            break;
+        case CastObj::Float2HalfIEEE754:
+        case CastObj::Float2Double:
+        case CastObj::Float2Int64:
+        case CastObj::Float2Int32:
+            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_FLOAT, 4, dim_array));
+            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW, CNNL_DTYPE_INT32, 4, dim_array));
+            NlCastType = CNNL_CAST_FLOAT_TO_INT32;
+            break;
+        case CastObj::Float2Int16:
+        case CastObj::Float2Int8:
+        case CastObj::Float2Bool:
+            // TODO
+            break;
+        case CastObj::Half2Float:
+        case CastObj::Half2Int32:
+        case CastObj::Half2Int64:
+        case CastObj::Half2Int16:
+        case CastObj::Half2Int8:
+        case CastObj::Half2Uint8:
+        case CastObj::Half2Bool:
+        case CastObj::Half2FloatInf:
+            // TODO
+            break;
+        case CastObj::Int322Float:
+        case CastObj::Int322Half:
+        case CastObj::Int322Int8:
+        case CastObj::Int322Int16:
+            // TODO
+            break;
+        case CastObj::Int162Float:
+        case CastObj::Int162Half:
+        case CastObj::Int162Int32:
+            // TODO
+            break;
+        case CastObj::Int82Float:
+        case CastObj::Int82Half:
+        case CastObj::Int82Int16:
+        case CastObj::Int82Int32:
+            // TODO
+            break;
+        case CastObj::Uint82Float:
+        case CastObj::Uint82Half:
+        case CastObj::Uint82Int32:
+        case CastObj::Uint82Int64:
+            // TODO
+            break;
+        case CastObj::Bool2Float:
+        case CastObj::Bool2Half:
+        case CastObj::Bool2Int32:
+            // TODO
+            break;
+        case CastObj::Int322Int64:
+        case CastObj::Int322Bool:
+            // TODO
+            break;
+        case CastObj::Int642Int32:
+        case CastObj::Int642Uint32:
+        case CastObj::Int642Float:
+        case CastObj::Int642Half:
+            // TODO
+            break;
+        case CastObj::Uint642Uint32:
+        case CastObj::Uint322Int64:
+        case CastObj::Uint322Uint64:
+            // TODO
+            break;
+        case CastObj::Double2Float:
+            // TODO
+            break;
+        }
+        cnnlStatus_t stat =
+            cnnlCastDataType(context->cnnlHandle(), aDesc, aData, NlCastType, cDesc, cData);
+        if (stat != CNNL_STATUS_SUCCESS)
+            return;
+
+        // Destroying the descriptors in BANG does not require sync. But CNNL
+        // does not state whether sync is required before the destruction.
+        checkCnnlError(cnnlDestroyTensorDescriptor(aDesc));
+        checkCnnlError(cnnlDestroyTensorDescriptor(cDesc));
+    }
+};
+
+REGISTER_KERNEL(Device::BANG, OpType::Cast, DataType::Float32, CastCnnl,
+                "Cast_cnnl_BANG_Float32");
+
+}; // namespace infini
diff --git a/src/operators/unary.cc b/src/operators/unary.cc
index f077db17..4c782a83 100644
--- a/src/operators/unary.cc
+++ b/src/operators/unary.cc
@@ -150,4 +150,33 @@ vector<int> TransformObj::getOpAttrVector() const {
     return {enum_to_underlying(type)};
 }
 
+CastObj::CastObj(GraphObj *graph, Tensor input, Tensor output, CastType type)
+    : OperatorObj(OpType::Cast, {input}, {output}), castType(type) {
+    IT_ASSERT(checkValid(graph, DataType::Int32));
+}
+
+optional<vector<Shape>> CastObj::inferShape(const TensorVec &inputs) const {
+    const auto A = inputs[0];
+    return {{A->getDims()}};
+}
+
+std::string CastObj::toString() const {
+    std::ostringstream os;
+    os << OpRegistry::getOpName(type) << "[" << getGuid() << "]";
+    os << "(";
+    os << "output=" << outputs[0]->getGuid() << ")";
+    return os.str();
+}
+
+vector<int> CastObj::getWorkloadVector() const {
+    vector<int> ret{enum_to_underlying(type)};
+    const Shape shape = outputs[0]->getDims();
+    ret.insert(ret.end(), shape.begin(), shape.end());
+    return ret;
+}
+
+vector<int> CastObj::getOpAttrVector() const {
+    return {enum_to_underlying(type)};
+}
+
 }; // namespace infini
diff --git a/test/kernels/bang/test_bang_cast.cc b/test/kernels/bang/test_bang_cast.cc
new file mode 100644
index 00000000..6f59b7f9
--- /dev/null
+++ b/test/kernels/bang/test_bang_cast.cc
@@ -0,0 +1,40 @@
+#include "bang/bang_runtime.h"
+#include "core/graph.h"
+#include "core/kernel.h"
+#include "core/runtime.h"
+#include "operators/unary.h"
+
+#include "test.h"
+
+namespace infini {
+
+template <class T>
+void testCast(const std::function<void(void *, size_t, DataType)> &generator,
+              const Shape &shape) {
+    // Runtime
+    Runtime cpuRuntime = CpuRuntimeObj::getInstance();
+    auto bangRuntime = make_ref<BangRuntimeObj>();
+
+    // Build input data on CPU
+    Tensor inputCpu = make_ref<TensorObj>(shape, DataType::Float32, cpuRuntime);
+    inputCpu->dataMalloc();
+    inputCpu->setData(generator);
+
+    // GPU
+    Graph bangGraph = make_ref<GraphObj>(bangRuntime);
+    auto inputGpu = bangGraph->cloneTensor(inputCpu);
+    auto gpuOp = bangGraph->addOp<T>(inputGpu, nullptr, CastObj::Float2Int32);
+    auto outputGpu = gpuOp->getOutput();
+    bangGraph->dataMalloc();
+    bangRuntime->run(bangGraph);
+    auto outputGpu2Cpu = outputGpu->clone(cpuRuntime);
+    inputCpu->printData();
+    outputGpu2Cpu->printData();
+    EXPECT_TRUE(1);
+}
+
+TEST(cnnl_Cast, run) {
+    testCast<CastObj>(IncrementalGenerator(), Shape{1, 2, 2, 3});
+}
+
+} // namespace infini
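
Note: the test above only prints the input and output tensors and then passes unconditionally via EXPECT_TRUE(1). A minimal sketch of a CPU-side check that could replace it is shown below. It is not part of the diff: the helper name verifyFloat2Int32 is hypothetical, it reuses getRawDataPtr and size() as used elsewhere in this change, and it assumes CNNL's FLOAT_TO_INT32 cast agrees with static_cast<int32_t> on the integer-valued data produced by IncrementalGenerator (consult the CNNL documentation for the exact rounding mode).

// Sketch only: hypothetical helper for test_bang_cast.cc, not part of the diff.
// Compares a Float32 input tensor with the Int32 output of a Float2Int32 cast.
bool verifyFloat2Int32(const Tensor &inputCpu, const Tensor &outputCpu) {
    auto *in = inputCpu->getRawDataPtr<float *>();
    auto *out = outputCpu->getRawDataPtr<int32_t *>();
    for (size_t i = 0, n = inputCpu->size(); i < n; ++i) {
        // Assumption: the kernel truncates like static_cast; this is exact for
        // the integer-valued floats produced by IncrementalGenerator.
        if (out[i] != static_cast<int32_t>(in[i]))
            return false;
    }
    return true;
}

// Possible usage inside testCast, replacing EXPECT_TRUE(1):
//     EXPECT_TRUE(verifyFloat2Int32(inputCpu, outputGpu2Cpu));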