forked from jiuyuan/InfiniTensor
Fix: tensor size overflow
This commit is contained in:
parent
a732b6f176
commit
c451918224
|
@ -63,6 +63,7 @@ class GraphObj : public Object {
|
|||
void optimize();
|
||||
|
||||
void dataMalloc();
|
||||
void dataFree();
|
||||
|
||||
/**
|
||||
* @brief Add an operator and create its outputs. Output tensor arguments
|
||||
|
|
|
@ -59,7 +59,7 @@ class CudaRuntimeObj : public RuntimeObj {
|
|||
/// @brief Accessor for the CUDA-graph status flag.
/// @return The current value of the `cudaGraphStatus` member.
bool isInCudaGraph() const {
    return cudaGraphStatus;
}
|
||||
/// @brief Accessor for the stream handle held by this runtime.
/// @return The `stream` member, by value.
cudaStream_t getStream() const {
    return stream;
}
|
||||
|
||||
double timeWithCudaGraph(Graph graph);
|
||||
double timeWithCudaGraph(Graph graph, int rounds = 1000);
|
||||
|
||||
private:
|
||||
void tune(const Graph &graph, bool profiling) const;
|
||||
|
|
|
@ -129,6 +129,12 @@ void GraphObj::dataMalloc() {
|
|||
}
|
||||
}
|
||||
|
||||
void GraphObj::dataFree() {
|
||||
for (auto &tensor : tensors) {
|
||||
tensor->freeData();
|
||||
}
|
||||
}
|
||||
|
||||
Tensor GraphObj::addTensor(Shape dim, DataType dtype, TensorType tensorType) {
|
||||
return tensors.emplace_back(
|
||||
make_ref<TensorObj>(dim, dtype, runtime, tensorType));
|
||||
|
|
|
@ -13,7 +13,7 @@ TensorObj::TensorObj(Shape shape_, DataType dtype, Runtime runtime,
|
|||
: TensorBaseObj(shape_.size(), dtype, runtime), shape(std::move(shape_)),
|
||||
_size(shape.empty()
|
||||
? 0
|
||||
: std::accumulate(shape.begin(), shape.end(), 1,
|
||||
: std::accumulate(shape.begin(), shape.end(), 1lu,
|
||||
[](auto acc, auto x) { return acc * x; })),
|
||||
tensorType(tensorType) {}
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ void CudaRuntimeObj::sync() const { checkCudaError(cudaDeviceSynchronize()); }
|
|||
|
||||
/// @brief Human-readable name of this runtime.
/// @return The fixed string "CUDA Runtime".
string CudaRuntimeObj::toString() const {
    return "CUDA Runtime";
}
|
||||
|
||||
double CudaRuntimeObj::timeWithCudaGraph(Graph graph) {
|
||||
double CudaRuntimeObj::timeWithCudaGraph(Graph graph, int rounds) {
|
||||
const auto &kernelRegistry = KernelRegistry::getInstance();
|
||||
auto &perfEngine = PerfEngine::getInstance();
|
||||
// compile-time computable
|
||||
|
|
|
@ -283,6 +283,7 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
|
|||
// dbg("Baseline graph");
|
||||
// printGraph(g);
|
||||
// dbg(runtime->getPerfTime(g, true));
|
||||
g->dataFree();
|
||||
|
||||
for (size_t i = 0; i < bestGraphs.size(); i++) {
|
||||
auto bestGraphCpu = bestGraphs[i];
|
||||
|
@ -304,18 +305,19 @@ Graph optimizeGraph(Graph g, Runtime _runtime, bool tuning, NMutator::Mode mode,
|
|||
dbg(bestGraph);
|
||||
dbg(bestGraph->getOutputs());
|
||||
|
||||
if (tuning) {
|
||||
runtime->run(bestGraph, true); // Tune kernels
|
||||
runtime->run(bestGraph, false); // Execute transfomraed graph
|
||||
// if (tuning) {
|
||||
// runtime->run(bestGraph, true); // Tune kernels
|
||||
// runtime->run(bestGraph, false); // Execute transformed graph
|
||||
|
||||
auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
dbg(go0->equalData(bgo0, 1e-3));
|
||||
dbg(runtime->getPerfTime(bestGraph, true));
|
||||
dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||
// dbg(runtime->timeWithCudaGraph(bestGraph));
|
||||
}
|
||||
// // FIXME: g is freed
|
||||
// auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
|
||||
// auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
|
||||
// // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
|
||||
// dbg(go0->equalData(bgo0, 1e-3));
|
||||
// dbg(runtime->getPerfTime(bestGraph, true));
|
||||
// dbg(runtime->timeNonCtcOperators(bestGraph));
|
||||
// // dbg(runtime->timeWithCudaGraph(bestGraph));
|
||||
// }
|
||||
|
||||
// dbg("Best graph");
|
||||
// printGraph(bestGraph);
|
||||
|
|
Loading…
Reference in New Issue