Fix: tensor size overflow

This commit is contained in:
Liyan Zheng 2023-04-22 20:28:00 +08:00
parent a732b6f176
commit c451918224
6 changed files with 23 additions and 14 deletions

View File

@ -63,6 +63,7 @@ class GraphObj : public Object {
void optimize();
void dataMalloc();
void dataFree();
/**
* @brief Add an operator and create its outputs. Output tensor arguments

View File

@ -59,7 +59,7 @@ class CudaRuntimeObj : public RuntimeObj {
bool isInCudaGraph() const { return cudaGraphStatus; }
cudaStream_t getStream() const { return stream; }
double timeWithCudaGraph(Graph graph);
double timeWithCudaGraph(Graph graph, int rounds = 1000);
private:
void tune(const Graph &graph, bool profiling) const;

View File

@ -129,6 +129,12 @@ void GraphObj::dataMalloc() {
}
}
void GraphObj::dataFree() {
for (auto &tensor : tensors) {
tensor->freeData();
}
}
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
return tensors.emplace_back(
make_ref<TensorObj>(dim, dtype, runtime, tensorType));

View File

@ -13,7 +13,7 @@ TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
_size(shape.empty()
? 0
: std::accumulate(shape.begin(), shape.end(), 1,
: std::accumulate(shape.begin(), shape.end(), 1lu,
[](auto acc, auto x) { return acc * x; })),
tensorType(tensorType) {}

View File

@ -121,7 +121,7 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
string CudaRuntimeObj::toString() const { return "CUDA Runtime"; }
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
const auto &kernelRegistry = KernelRegistry::getInstance();
auto &perfEngine = PerfEngine::getInstance();
// compile-time computable

View File

@ -283,6 +283,7 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
// dbg("Baseline graph");
// printGraph(g);
// dbg(runtme->getPerfTime(g, true));
g->dataFree();
for (size_t i = 0; i < bestGraphs.size(); i++) {
auto bestGraphCpu = bestGraphs[i];
@ -304,18 +305,19 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
dbg(bestGraph);
dbg(bestGraph->getOutputs());
if (tuning) {
runtime->run(bestGraph, true); // Tune kernels
runtime->run(bestGraph, false); // Execute transformed graph
// if (tuning) {
// runtime->run(bestGraph, true); // Tune kernels
// runtime->run(bestGraph, false); // Execute transformed graph
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
dbg(go0->equalData(bgo0, 1e-3));
dbg(runtime->getPerfTime(bestGraph, true));
dbg(runtime->timeNonCtcOperators(bestGraph));
// dbg(runtime->timeWithCudaGraph(bestGraph));
}
// // FIXME: g is freed
// auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
// auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
// // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
// dbg(go0->equalData(bgo0, 1e-3));
// dbg(runtime->getPerfTime(bestGraph, true));
// dbg(runtime->timeNonCtcOperators(bestGraph));
// // dbg(runtime->timeWithCudaGraph(bestGraph));
// }
// dbg("Best graph");
// printGraph(bestGraph);