From e72fe791689b071c37e364c2c18fc40fdd5e3e9a Mon Sep 17 00:00:00 2001
From: Liyan Zheng <liyan-zheng@outlook.com>
Date: Thu, 13 Apr 2023 19:46:54 +0800
Subject: [PATCH] Add: search engine uses estimated time

---
 include/core/runtime.h            |   5 +-
 include/core/search_engine.h      |  25 +++---
 include/operators/membound.h      |   1 +
 python/cpp_plugin/gen_ansor_so.py | 133 ++++++++++++++++--------------
 src/core/runtime.cc               |  28 ++++---
 src/core/search_engine.cc         |  20 ++---
 src/core/tensor.cc                |   4 +-
 src/nnet/App/test_models.cc       | 124 ++++++++++++++++++++++++++++
 src/nnet/nmutator.cc              |   4 +-
 9 files changed, 243 insertions(+), 101 deletions(-)
 create mode 100644 src/nnet/App/test_models.cc
diff --git a/include/core/runtime.h b/include/core/runtime.h
index 53920fdb..16b9b60b 100644
--- a/include/core/runtime.h
+++ b/include/core/runtime.h
@@ -59,10 +59,11 @@ class RuntimeObj : public std::enable_shared_from_this<RuntimeObj> {
      * execution happens.
      *
      * @param graph
-     * @param profiling Whether to print breakdown of time
+     * @param printProfiling Whether to print breakdown of time
      * @return double Return the sum of perf time for each operator
      */
-    double getPerfTime(const Graph &graph, bool profiling = false) const;
+    double getPerfTime(const Graph &graph, bool printProfiling = false,
+                       bool allowEstimation = false) const;
     Blob allocBlob(size_t size);
     bool isCpu() const {
         return device == Device::CPU || device == Device::INTELCPU;
diff --git a/include/core/search_engine.h b/include/core/search_engine.h
index 55c90bc5..05c4063f 100644
--- a/include/core/search_engine.h
+++ b/include/core/search_engine.h
@@ -11,11 +11,15 @@ class SearchEngine {
   private:
     Runtime runtimeExec;
     Ref<Mutator> mutator;
+    std::function<bool(const Graph &, const Graph &)> graphTimeComparer;
 
   public:
-    SearchEngine(Runtime _runtime, Ref<Mutator> _mutator) {
-        runtimeExec = _runtime;
-        mutator = _mutator;
+    SearchEngine(Runtime runtime, Ref<Mutator> mutator)
+        : runtimeExec(runtime), mutator(mutator) {
+        // Compare graphs by estimated time (captures `this`: avoid copying)
+        graphTimeComparer = [this](const Graph &a, const Graph &b) -> bool {
+            return getEstimatedGraphPerf(a) < getEstimatedGraphPerf(b);
+        };
     }
     ~SearchEngine() {}
 
@@ -24,11 +28,7 @@ class SearchEngine {
         3;                  // cut nodes whose #in + #out >= partitionThreshold
     size_t GRAPH_SIZE = 16; // num of best graphs.
 
-  private: // Composed objects
-    std::shared_ptr<Mutator> mutationEngine;
-
   public:
-    std::shared_ptr<Mutator> getMutationEngine() { return mutationEngine; };
     struct GroupEdge {
         int v, next;
         GroupEdge() = delete;
@@ -38,10 +38,7 @@ class SearchEngine {
         std::shared_ptr<Graph> graph;
         double perf = INFINITY;
     };
-    class MetaGraph { // a graph of subgraphs, for searching.
-      public:
-        MetaGraph() {}
-        ~MetaGraph() {}
+    struct MetaGraph { // a graph of subgraphs, for searching.
         struct Node {
             Graph graph;
             std::vector<int> suc;
@@ -51,7 +48,7 @@ class SearchEngine {
         std::vector<Node> nodes;
     };
 
-    Graph run(const Graph graph);                  // entrance of search engine.
+    Graph run(const Graph graph);                  // entrance to search engine.
     std::vector<Graph> search(const Graph &graph); // search for a partition.
 
   private:
@@ -76,5 +73,13 @@ class SearchEngine {
      * branch.
      */
     bool isMultiBranchMergable(const Graph graph);
+
+    /**
+     * @brief Get the perf time of a graph from the runtime without printing
+     * the profiling breakdown and with time estimation allowed.
+     */
+    double getEstimatedGraphPerf(const Graph &graph) const {
+        return runtimeExec->getPerfTime(graph, false, true);
+    }
 };
 } // namespace infini
diff --git a/include/operators/membound.h b/include/operators/membound.h
index 4a444553..d1707913 100644
--- a/include/operators/membound.h
+++ b/include/operators/membound.h
@@ -30,6 +30,7 @@ class MemBoundObj : public OperatorObj {
     pair<const nnet::Expr, HashType> getSimplifiedNnetExpr() const {
         return {expr, hash};
     }
+    double getEstimatedTime() const { return exec_time; }
 
   private:
     vector<int> getWorkloadVector() const override;
diff --git a/python/cpp_plugin/gen_ansor_so.py b/python/cpp_plugin/gen_ansor_so.py
index d06bc3be..9580b3c7 100644
--- a/python/cpp_plugin/gen_ansor_so.py
+++ b/python/cpp_plugin/gen_ansor_so.py
@@ -1,4 +1,5 @@
-import re
+from contextlib import redirect_stdout
+import time
 
 import numpy as np
 import tvm
@@ -8,99 +9,111 @@ import json
 import logging
 
 USE_CACHE = True
+logging.basicConfig()
 logger = logging.getLogger('InfiniTensor')
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)
 
 
 def gen_ansor_so(input_tensors, input_dtypes, output_tensor, output_dtype,
                  tvm_code, func_name, nnet_expression: str,
                  nnet_simplified_expression: str, hash_code=None):
     assert len(input_tensors) == len(input_dtypes)
-    
-    logging.debug(f'Work on hash {hash_code}')
+
+    logger.debug(f'Work on hash {hash_code}')
     dir_name = os.path.join(".cache", "generated_kernels", str(hash_code))
-    
+
     if not os.path.exists(dir_name):
         os.makedirs(dir_name)
-    
+
     so_fn = os.path.join(dir_name, f"{func_name}.so")
     config_fn = os.path.join(dir_name, "config_so.json")
-    
-    print("Generating Ansor op: ")
-    print(tvm_code)
-    
-    print("Input shape: ")
-    print(input_tensors)
-    print("Output shape: ")
-    print(output_tensor)
-    
+    desc_fn = os.path.join(dir_name, "desc.txt")
+    log_fn = os.path.join(dir_name, f"ansor_{func_name}_log.json")
+    out_fn = os.path.join(dir_name, "out.txt")
+
+    logger.debug(f"Generating Ansor op: {tvm_code}")
+    logger.debug(f"Input shape: {input_tensors}")
+    logger.debug(f"Output shape: {output_tensor}")
+
     if USE_CACHE and hash_code is not None:
         if os.path.exists(dir_name) and \
-            os.path.exists(so_fn) and \
-            os.path.exists(config_fn):
+                os.path.exists(so_fn) and \
+                os.path.exists(config_fn):
             print(f"Use cache in {dir_name}")
             with open(config_fn, "r") as config_fin:
                 config = json.loads(config_fin.read().strip())
                 conv_time = config["conv_time"]
 
-            logger.debug(f'Find tuning log for {hash_code}')
+            logger.info(f'Find tuning log for {hash_code} in {so_fn}')
             return so_fn, conv_time
-    
+    logger.info(f"TVM Tuning kernel with hash {hash_code}. See {out_fn}")
+
+    time_start = time.perf_counter()
+    # Save a description of the task; the file is closed when the block exits
+    if USE_CACHE and hash_code is not None:
+        with open(desc_fn, "w") as desc_fout, redirect_stdout(desc_fout):
+            print("====NNET tensor expression====")
+            print(nnet_expression+"\n")
+            print("====NNET simplified tensor expression====")
+            print(nnet_simplified_expression+"\n")
+            print("====TVM compute====")
+            print(tvm_code+"\n")
+            print("Input shape: ", input_tensors)
+            print("Output shape: ", output_tensor)
+
     @auto_scheduler.register_workload(func_name)
     def compute():
         _locals = locals()
-        exec(tvm_code, {'tvm': tvm, 'te': te, 'tir': tir, 'topi': topi}, _locals)
+        exec(tvm_code, {'tvm': tvm, 'te': te,
+             'tir': tir, 'topi': topi}, _locals)
         return _locals['ret']
-    
+
     target = tvm.target.Target("cuda")
 
     task = auto_scheduler.SearchTask(func=func_name, args=(), target=target)
 
-    # Inspect the computational graph
-    print("Computational DAG:")
-    print(task.compute_dag)
+    with open(out_fn, 'w') as out_fout, redirect_stdout(out_fout):
+        # Inspect the computational graph
+        print("Computational DAG:")
+        print(task.compute_dag)
 
-    log_file = f"ansor_{func_name}_log.json"
-    measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
-    tune_option = auto_scheduler.TuningOptions(
-        num_measure_trials=10,
-        runner=measure_ctx.runner,
-        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-        verbose=2,
-    )
+        measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
+        tune_option = auto_scheduler.TuningOptions(
+            num_measure_trials=10,
+            runner=measure_ctx.runner,
+            measure_callbacks=[auto_scheduler.RecordToFile(log_fn)],
+            verbose=2,
+        )
 
-    # Run auto-tuning (search)
-    task.tune(tune_option)
-    # Apply the best schedule
-    sch, args = task.apply_best(log_file)
+        # Run auto-tuning (search)
+        task.tune(tune_option)
+        # Apply the best schedule
+        sch, args = task.apply_best(log_fn)
+
+        # Kill the measurement process
+        del measure_ctx
+
+        func = tvm.build(sch, args, target, name=func_name)
+        func.export_library(so_fn)
+
+        ctx = tvm.cuda(0)
+        input_a = []
+        for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
+            a_np = np.random.uniform(size=shape).astype(dtype)
+            input_a.append(tvm.nd.array(a_np, ctx))
+        a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
+        func(a_out, *input_a)
+        evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
+        conv_time = evaluator(a_out, *input_a).mean * 1e3
+
+    time_end = time.perf_counter()
 
-    # Kill the measurement process
-    del measure_ctx
-    
-    func = tvm.build(sch, args, target, name=func_name)
-    func.export_library(so_fn)
-    
-    ctx = tvm.cuda(0)
-    input_a = []
-    for i, (shape, dtype) in enumerate(zip(input_tensors, input_dtypes)):
-        a_np = np.random.uniform(size=shape).astype(dtype)
-        input_a.append(tvm.nd.array(a_np, ctx))
-    a_out = tvm.nd.array(np.zeros(output_tensor, dtype=output_dtype), ctx)
-    func(a_out, *input_a)
-    evaluator = func.time_evaluator(func.entry_name, ctx, number=100)
-    conv_time = evaluator(a_out, *input_a).mean * 1e3
-    
-    print("====NNET tensor expression====")
-    print(nnet_expression+"\n")
-    print("====NNET simplified tensor expression====")
-    print(nnet_simplified_expression+"\n")
-    print("====Time====")
-    print(conv_time)
-    
     if USE_CACHE and hash_code is not None:
         with open(config_fn, "w") as config_fout:
             config_fout.write(json.dumps({
                 "conv_time": conv_time,
+                "tuning_time": time_end - time_start,
+                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
             }, ensure_ascii=False, indent=2))
-    
+
     return so_fn, conv_time
diff --git a/src/core/runtime.cc b/src/core/runtime.cc
index 1e1e7c1d..8151a6f0 100644
--- a/src/core/runtime.cc
+++ b/src/core/runtime.cc
@@ -2,6 +2,7 @@
 #include "core/blob.h"
 #include "core/kernel.h"
 #include "core/perf_engine.h"
+#include "operators/membound.h"
 #include "utils/data_generator.h"
 #include <chrono>
 #include <cstring>
@@ -56,7 +57,8 @@ void CpuRuntimeObj::run(const Graph &graph, bool tune, bool profiling) const {
         printProfilingData(totalTime, opTime, opCnt);
 }
 
-double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
+double RuntimeObj::getPerfTime(const Graph &graph, bool profiling,
+                               bool allowEstimation) const {
     const auto &kernelRegistry = KernelRegistry::getInstance();
     auto &perfEngine = PerfEngine::getInstance();
     // Statistics
@@ -70,11 +72,16 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
         auto perfKey = PerfEngine::Key{kernelAttrs, op->getOpPerfKey()};
         auto perfData = perfEngine.getPerfData(perfKey);
 
-        PerfRecord record;
+        double time = -1e9;
         // Tune the kernel if there is no record
-        if (!perfData) {
+        if (perfData) {
+            time = perfData->time;
+        } else if (allowEstimation && op->getOpType() == OpType::MemBound) {
+            time = as<MemBoundObj>(op)->getEstimatedTime();
+        } else {
             // TODO: should tenosrs automatically allocate when access data?
-            // allocate memory for empty tensors and release it after profiling
+            // allocate memory for empty tensors and release it after
+            // profiling
             TensorVec allocatedTensors;
             for (auto t : op->getInputs())
                 if (!t->hasData())
@@ -88,21 +95,20 @@ double RuntimeObj::getPerfTime(const Graph &graph, bool profiling) const {
             }
 
             // Profile operators and record the results
-            record = kernel->tune(op, this);
+            PerfRecord record = kernel->tune(op, this);
+            time = record->time;
             perfEngine.setPerfData(perfKey, record);
 
             // Free allocated memory
             for (auto t : allocatedTensors)
                 t->freeData();
-        } else
-            record = perfData;
+        }
 
-        double t = record->time;
-        totalTime += t;
+        totalTime += time;
         if (profiling) {
             op->print();
-            printf(" op_time %lf\n", t);
-            opTime[op->getOpType()] += t;
+            printf(" op_time %lf\n", time);
+            opTime[op->getOpType()] += time;
             opCnt[op->getOpType()]++;
         }
     }
diff --git a/src/core/search_engine.cc b/src/core/search_engine.cc
index af643686..b1c621f2 100644
--- a/src/core/search_engine.cc
+++ b/src/core/search_engine.cc
@@ -32,8 +32,7 @@ Graph SearchEngine::run(const Graph graph) {
     IT_ASSERT(runtimeExec == graph->getRuntime());
     std::cout << "[INFO] original graph: " << std::endl;
     std::cout << graph->toString();
-    std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(graph)
-              << std::endl;
+    std::cout << "[INFO] perf: " << getEstimatedGraphPerf(graph) << std::endl;
 
     std::vector<Graph> partitions = partitionGraph(graph);
 
@@ -65,9 +64,7 @@ Graph SearchEngine::run(const Graph graph) {
                 nextGraphs.emplace_back(tmp);
             }
         }
-        std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
-            return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
-        });
+        std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
         if (nextGraphs.size() > GRAPH_SIZE) {
             nextGraphs.resize(GRAPH_SIZE);
         }
@@ -81,7 +78,7 @@ Graph SearchEngine::run(const Graph graph) {
     for (size_t i = 0; i < bestGraphs.size(); i++) {
         std::cout << "bestGraph " << i << ":" << std::endl;
         std::cout << bestGraphs[i]->toString();
-        std::cout << "[INFO] perf: " << runtimeExec->getPerfTime(bestGraphs[i])
+        std::cout << "[INFO] perf: " << getEstimatedGraphPerf(bestGraphs[i])
                   << std::endl;
     }
 
@@ -102,9 +99,8 @@ std::vector<Graph> SearchEngine::search(const Graph &graph) {
         }
     }
 
-    sort(results.begin(), results.end(), [&](Graph x, Graph y) {
-        return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
-    }); // compare with perf time
+    // compare with perf time
+    std::sort(results.begin(), results.end(), graphTimeComparer);
     if (results.size() > GRAPH_SIZE) {
         results.resize(GRAPH_SIZE);
     }
@@ -360,9 +356,7 @@ std::vector<Graph> SearchEngine::searchMutation(
         for (auto g : nextGraphs) {
             g->dataMalloc();
         }
-        std::sort(nextGraphs.begin(), nextGraphs.end(), [&](Graph x, Graph y) {
-            return runtimeExec->getPerfTime(x) < runtimeExec->getPerfTime(y);
-        });
+        std::sort(nextGraphs.begin(), nextGraphs.end(), graphTimeComparer);
         if (nextGraphs.size() > GRAPH_SIZE) {
             nextGraphs.resize(GRAPH_SIZE);
         }
@@ -372,7 +366,7 @@ std::vector<Graph> SearchEngine::searchMutation(
 }
 
 bool SearchEngine::isMultiBranchMergable(const Graph graph) {
-    return mutationEngine->isMultiBranchMergable(graph);
+    return mutator->isMultiBranchMergable(graph);
 }
 
 // Split a graph into multiple independt graphs. Search engine will search for
diff --git a/src/core/tensor.cc b/src/core/tensor.cc
index e63039d5..609b1720 100644
--- a/src/core/tensor.cc
+++ b/src/core/tensor.cc
@@ -24,8 +24,7 @@ string TensorObj::toString() const {
         ss << "nullptr data";
     string ret = "Tensor " + std::to_string(guid) + ", Fuid " +
                  std::to_string(fuid) + ", shape " + vecToString(shape) +
-                 ", dtype " + dtype.toString() + ", " + runtime->toString() +
-                 ", " + ss.str() + "\n";
+                 ", dtype " + dtype.toString();
     vector<UidBaseType> targetGuids;
     for (const auto &op : targets)
         targetGuids.emplace_back(op.lock()->getGuid());
@@ -34,6 +33,7 @@ string TensorObj::toString() const {
     else
         ret += ", source None";
     ret += ", targets " + vecToString(targetGuids);
+    ret += ", " + runtime->toString() + ", " + ss.str();
     return ret;
 }
 
diff --git a/src/nnet/App/test_models.cc b/src/nnet/App/test_models.cc
new file mode 100644
index 00000000..570f713c
--- /dev/null
+++ b/src/nnet/App/test_models.cc
@@ -0,0 +1,124 @@
+#include "core/blob.h"
+#include "core/dummy_mutator.h"
+#include "core/graph.h"
+#include "core/runtime.h"
+#include "core/search_engine.h"
+#include "cuda/cuda_runtime.h"
+#include "nnet/nmutator.h"
+#include "operators/conv.h"
+#include "test.h"
+#include <pybind11/stl.h>
+
+namespace infini {
+
+// Build a graph that chains five ConvTransposed2dNHWC ops on a
+// {batch, 1, 1, 228} input tensor. NHWC format
+Graph getInfoGAN(int batch, Runtime runtime) {
+    Graph g = make_ref<GraphObj>(runtime);
+    const vector<tuple<int, int, int, int>> cs{
+        // Channel, kernelSize, pad, stride
+        {448, 2, 0, 1}, {256, 4, 1, 2}, {128, 4, 1, 2},
+        {64, 4, 1, 2},  {32, 4, 1, 2},
+    };
+    Tensor input = g->addTensor({batch, 1, 1, 228});
+    for (auto [channel, kernelSize, pad, stride] : cs) {
+        int f = input->getDims()[3]; // n, h, w, f
+        auto weight =
+            g->addTensor({f, kernelSize, kernelSize, channel}); // f, r, s, c
+        input = g->addOp<ConvTransposed2dNHWCObj>(input, weight, nullptr, pad,
+                                                  pad, stride, stride, 1, 1)
+                    ->getOutput();
+        // TODO: activation
+    }
+    return g;
+}
+
+void printGraph(Graph g) {
+    g->print();
+    puts("============ Data ============");
+    for (auto t : g->getTensors()) {
+        dbg(t);
+        t->printData();
+    }
+}
+
+// Search InfoGAN on CUDA, then run the baseline graph and the searched graph
+// with the same inputs and print whether their first outputs match (1e-3).
+// Returns {baseline output, searched output}; {} only if there is no result.
+vector<Tensor> runInfoGAN() {
+    Runtime cuda = make_ref<CudaRuntimeObj>();
+    Runtime cpu = NativeCpuRuntimeObj::getInstance();
+    Graph gCpu = make_ref<GraphObj>(cpu);
+
+    Graph g = getInfoGAN(1, cuda);
+
+    auto mutator =
+        make_ref<NMutator>(NMutator::Mode::RuleBased,
+                           vector<int>{3, 2, 2, 2, 2, 5, 8, 8, 6, 91, 90});
+    // // Translate OP to membound without derivation
+    // mutator->setToNaiveMembound();
+
+    vector<Graph> bestGraphs;
+    SearchEngine searchEngine(cuda, mutator);
+    bestGraphs.emplace_back(searchEngine.run(g));
+    g->topo_sort();
+    dbg(g, bestGraphs[0], bestGraphs.size());
+    g->print();
+
+    g->dataMalloc();
+    map<UidBaseType, Tensor> fuidToInputTensor;
+    auto gen = RandomGenerator(-1, 1, 0);
+    for (auto t : g->getInputs()) {
+        IT_ASSERT(fuidToInputTensor.count(t->getFuid()) == 0);
+        fuidToInputTensor[t->getFuid()] = t;
+        t->setData(gen);
+    }
+    for (auto t : g->getOutputs()) {
+        t->setData(ZeroGenerator());
+    }
+    cuda->run(g);
+    dbg("Baseline graph");
+    printGraph(g);
+    dbg(cuda->getPerfTime(g, true));
+
+    for (size_t i = 0; i < bestGraphs.size(); i++) {
+        auto bestGraphCpu = bestGraphs[i];
+        auto bestGraph = make_ref<GraphObj>(cuda, bestGraphCpu->getOperators());
+        bestGraph->topo_sort();
+
+        bestGraph->dataMalloc();
+        // Initialize inputs with random data
+        // (at() throws if the baseline has no input with the same Fuid)
+        for (auto t : bestGraph->getInputs()) {
+            t->copyData(fuidToInputTensor.at(t->getFuid()));
+        }
+
+        // Initialize outputs with zeros
+        for (auto t : bestGraph->getOutputs()) {
+            t->setData(ZeroGenerator());
+        }
+
+        dbg(bestGraph);
+        dbg(bestGraph->getOutputs());
+
+        cuda->run(bestGraph, true);  // Tune kernels
+        cuda->run(bestGraph, false); // Execute transformed graph
+
+        auto go0 = gCpu->cloneTensor(g->getOutputs()[0]);
+        auto bgo0 = gCpu->cloneTensor(bestGraph->getOutputs()[0]);
+        // EXPECT_TRUE(go0->equalData(bgo0, 1e-3));
+        std::cout << go0->equalData(bgo0, 1e-3) << std::endl;
+        bgo0->printData();
+        go0->printData();
+        dbg(cuda->getPerfTime(bestGraph, true));
+
+        dbg("Best graph");
+        printGraph(bestGraph);
+        return {g->getOutputs()[0], bestGraph->getOutputs()[0]};
+    }
+    return {};
+}
+
+// TEST(ModelE2E, InfoGAN) { runInfoGAN(); }
+
+} // namespace infini
diff --git a/src/nnet/nmutator.cc b/src/nnet/nmutator.cc
index 50575082..9cc7ef59 100644
--- a/src/nnet/nmutator.cc
+++ b/src/nnet/nmutator.cc
@@ -49,7 +49,7 @@ void NMutator::runSingleOpToNaiveMembound(Graph in_graph,
     assert(computeOps.size() == 1);
     const auto &computeOp = computeOps[0];
     auto g = infini::make_ref<GraphObj>(in_graph->getRuntime());
-    auto expr = opToExpression(computeOp);
+    nnet::Expr expr = opToExpression(computeOp);
     auto inputsN = nnet::GetTensorsVisitor().get(expr);
     dbg(inputsN, expr);
     IT_ASSERT(inputsN.count("B") + inputsN.count("K") == 1,
@@ -258,6 +258,8 @@ nnet::Expr NMutator::opToExpression(Operator op) {
         const auto &[n, c, h, w, f, r, s] = convOp->getNCHWFRS();
         const auto &[ph, pw, sh, sw, dh, dw] = convOp->getPadStrideDilation();
         IT_ASSERT_TODO(convOp->getNumGroups() == 1);
+        if (r != 4)
+            return nullptr;
         IT_ASSERT_TODO(r == 4);
         IT_ASSERT_TODO(ph == pw);
         IT_ASSERT_TODO(tie(sh, sw) == tuple(2, 2));