Add: print derivation steps for conv2gemm

2023-11-10 23:16:44 +08:00
6 changed files with 86 additions and 7 deletions
--- a/6
+++ b/6
@ -1,13 +1,13 @@
 .PHONY : build clean format install-python test-cpp test-onnx

 TYPE ?= Release
-CUDA ?= OFF
+CUDA ?= ON
 BANG ?= OFF
 KUNLUN ?= OFF
 INTELCPU ?= off
 BACKTRACE ?= ON
 TEST ?= ON
-NNET ?= OFF
+NNET ?= ON
 FORMAT_ORIGIN ?=
 # Docker build options
 DOCKER_NAME ?= infinitensor
@ -37,7 +37,7 @@ endif

 build:
 	mkdir -p build/$(TYPE)
-	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j8
+	cd build/$(TYPE) && cmake $(CMAKE_OPT) ../.. && make -j99

 clean:
 	rm -rf build
--- a/include/nnet/common.h
+++ b/include/nnet/common.h
@ -68,8 +68,8 @@ static inline HashType genhash(string s) {
 #define nnet_unimplemented_halt()                                              \
    { IT_TODO_HALT(); }

-#define nnet_unimplemented_continue()                                          \
-    { dbg("Unimplemented"); }
+#define nnet_unimplemented_continue()                                          
+    // { dbg("Unimplemented"); }

 #define nnet_assert(expr, msg) assert(((void)(msg), (expr)))

--- a/run.sh
+++ b/run.sh
@ -0,0 +1,2 @@
+./test_mutator --gtest_filter="NMutator.Conv3x3_to_gemm"
+./test_conv2gemm --gtest_filter="*.NCHW_FCRS_ruleBased"
--- a/src/nnet/derivator.cc
+++ b/src/nnet/derivator.cc
@ -410,7 +410,8 @@ void Derivator::appendCanddiate(const Tensor &tensor, int depth) {
    // dbg("!!!!!!!!!!!!!!!Success!!!!!!!!!!!!!!!");
    if (enableEquivalenceCheck)
        checkDerivationEquivalence();
-    // printIntermediateStates();
+    if (candidates.size() > 2)
+        printIntermediateStates();
 }

 bool Derivator::checkOOB(const RangeOp &rangeOp, bool halt) {
--- a/test/nnet/test_conv2gemm.cc
+++ b/test/nnet/test_conv2gemm.cc
@ -25,7 +25,7 @@ TEST(Conv2gemm, NCHW_FCRS_ruleBased) {

    // Derivation
    Formula conv_1x1_nhwc_fc(range, 0);
-    Derivator derivator(12);
+    Derivator derivator(12, true, nnet::Derivator::LogMode::DumpFristCandiate);
    // const vector<int> rules = {3, 2, 2, 5, 2, 2, 6, 6};
    const vector<int> rules = {3, 2, 2, 5, 8, 8, 6, 90};
    // derivator.ruleBasedDFS(conv_1x1_nhwc_fc, 0, rules);
--- a/test/nnet/test_mutator.cc
+++ b/test/nnet/test_mutator.cc
@ -126,6 +126,82 @@ TEST(Mutator, InfoGAN_TConv_3_correctness) {
    }
 }

+TEST(NMutator, Conv3x3_to_gemm) {
+    const bool useMutatorDirectly = true;
+    Runtime runtime = make_ref<CudaRuntimeObj>();
+    Graph original_graph = make_ref<GraphObj>(runtime);
+    Runtime cpu = NativeCpuRuntimeObj::getInstance(); // CPUruntime is singleton
+    Graph gCpu = make_ref<GraphObj>(cpu);
+
+    const int n = 1, c = 32, h = 64, w = 64, f = 32, r = 3, s = 3;
+    // // Minimum config for test
+    // const int n = 1, c = 1, h = 2, w = 2, f = 1, r = 4, s = 4;
+    // const int n = 1, c = 2, h = 2, w = 2, f = 2, r = 4, s = 4;
+
+    auto i0 = original_graph->addTensor({n, c, h, w}, DataType::Float32);
+                                        // TensorType::Input);
+    auto w0 = original_graph->addTensor({f, c, r, s}, DataType::Float32);
+                                        // TensorType::Initialized);
+    original_graph->addOp<ConvObj>(i0, w0, nullptr, 1, 1, 1, 1, 1, 1);
+
+    auto mutator = make_ref<NMutator>(NMutator::Mode::RuleBased,
+                                      vector<int>{3, 2, 2, 5, 8, 8, 6, 90});
+    // // Translate OP to membound without derivation
+    // mutator->setToNaiveMembound();
+
+    vector<Graph> bestGraphs;
+    if (useMutatorDirectly) { // Use mutator results
+        bestGraphs = mutator->run(original_graph);
+    } else { // Use search engine results
+        SearchEngine searchEngine(runtime, mutator);
+        bestGraphs.emplace_back(searchEngine.run(original_graph));
+    }
+    original_graph->dataMalloc();
+    map<UidBaseType, Tensor> fuidToInputTensor;
+    for (auto t : original_graph->getInputs()) {
+        EXPECT_EQ(fuidToInputTensor.count(t->getFuid()), 0);
+        fuidToInputTensor[t->getFuid()] = t;
+    }
+
+    dbg(original_graph);
+    std::cout << "# bestGraphs = " << bestGraphs.size() << std::endl;
+    for (size_t i = 1; i < bestGraphs.size(); i++) {
+        auto bestGraphCpu = bestGraphs[i];
+        auto bestGraph =
+            make_ref<GraphObj>(runtime, bestGraphCpu->getOperators());
+
+        dbg(bestGraph);
+        break;
+
+        // auto gen = RandomGenerator(0.1, 0.1, i);
+        // bestGraph->dataMalloc();
+        // // Initialize inputs with random data
+        // for (auto t : g->getInputs()) {
+        //     t->setData(gen);
+        // }
+        // for (auto t : bestGraph->getInputs()) {
+        //     t->copyData(fuidToInputTensor[t->getFuid()]);
+        // }
+
+        // // Initialize outputs with zeros
+        // for (auto t : g->getOutputs()) {
+        //     t->setData(ZeroGenerator());
+        // }
+        // for (auto t : bestGraph->getOutputs()) {
+        //     t->setData(ZeroGenerator());
+        // }
+
+        // runtime->run(bestGraph, true); // Tune kernels
+        // runtime->run(g);
+        // runtime->run(bestGraph, false); // Execute transfomraed graph
+
+        // auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
+        // auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+        // dbg(runtime->getPerfTime(bestGraph, true));
+        // EXPECT_TRUE(go0->equalData(bgo0, 1e-4));
+    }
+}
+
 // TEST(Mutator, Conv9x9) {
 //     auto g = new tpm::Graph();
 //     auto i0 = g->tensor({1, 1, 224, 224});