forked from jiuyuan/InfiniTensor
Compare commits
5 Commits
master ... cuda-trans

Commits:
- 7146294baa
- 73e3f1fc6f
- 86133c8d0a
- 2761d46737
- aa1c3222ed
@@ -13,6 +13,3 @@
 [submodule "example"]
 	path = examples/NNmodel
 	url = git@github.com:wanghailu0717/NNmodel.git
-[submodule "examples/distributed/onnxsim_large_model"]
-	path = examples/distributed/onnxsim_large_model
-	url = git@github.com:luchangli03/onnxsim_large_model.git
@@ -1,7 +1,5 @@
 # Distributed scripts
 
-## Running on the NVIDIA platform
-
 #### 1. Run the PyTorch model and generate inputs and reference outputs; optionally export ONNX
 
 Use `--export_onnx` to set the directory for the exported ONNX model (default: the current path `./`). Without this flag the script only runs the computation and generates the inputs and outputs.
@@ -17,23 +15,3 @@ python run_pytorch.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
 ```bash
 python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
 ```
-
-## Running on the Cambricon platform
-
-**The run scripts `run_pytorch.py` and `cuda_launch.py` above have been adapted for the Cambricon platform; see `run_pytorch_mlu.py` and `bang_launch.py`.**
-
-#### 1. Run the PyTorch model and generate inputs and reference outputs; optionally export ONNX
-
-Use `--export_onnx` to set the directory for the exported ONNX model (default: the current path `./`). Without this flag the script only runs the computation and generates the inputs and outputs.
-
-```bash
-python run_pytorch_mlu.py --model gpt2 --batch_size 1 --length 1 --export_onnx ./
-```
-
-This generates the input and output files `test_inputs.npy` and `test_results.npy` in the current directory; only a single input and a single output are supported for now.
-
-#### 2. Run the InfiniTensor distributed script
-
-```bash
-python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4
-```
@@ -1,249 +0,0 @@
-import argparse
-import torch
-import torch_mlu
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import AlbertModel, AlbertConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import sys
-import os
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.type
-    )
-
-
-def get_model(modelname):
-    match modelname:
-        case "albert":
-            model = AlbertModel.from_pretrained("albert/albert-base-v2")
-            voc_size = AlbertConfig().vocab_size
-        case "bert":
-            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-            voc_size = BertConfig().vocab_size
-        case "gpt2":
-            model = GPT2Model.from_pretrained("GPT2")
-            voc_size = GPT2Config().vocab_size
-        case "opt":
-            model = OPTModel.from_pretrained("facebook/opt-125m")
-            voc_size = OPTConfig().vocab_size
-        case "llama":
-            model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-            voc_size = LlamaConfig().vocab_size
-        case _:
-            raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-
-def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
-    np.save("./data/input_0", data)
-    inputs = torch.from_numpy(data).to("mlu")
-    torch_model = torch_model.to("mlu")
-    if dtype == "fp16":
-        torch_model = torch_model.half()
-
-    n_iter = 20
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.mlu.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.mlu.synchronize()
-            outputs = torch_model(inputs)
-            torch.mlu.synchronize()
-    torch.mlu.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    # torch.mlu.memory.empty_cache()
-    np.save("./data/output", np.array(outputs))
-    print("Save input & output into ./data.")
-
-
-def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
-    data = data.to("mlu")
-    model = model.to("mlu")
-    if dtype == "fp16":
-        model = model.half()
-    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-    if modelname != "llama":
-        # use onnxsim to simplify
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
-        # onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            extern_path = extern_path.split("/")[-1]
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path,
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        # use third party tool to simplify llama
-        # reference: https://github.com/luchangli03/onnxsim_large_model/
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = path
-        if not out_model_path:
-            out_model_path = in_model_path[:-5] + ".sim.onnx"
-        if os.path.isdir(out_model_path):
-            out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print(f"compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                     tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    torch.backends.mlu.matmul.allow_tf32 = False
-    torch.backends.cnnl.allow_tf32 = False
-    modelname, batchsize, seqlen, export_path, dtype = parse_args()
-    if dtype == "tf32":
-        torch.backends.mlu.matmul.allow_tf32 = True
-    else:
-        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
-
-    model, voc_size = get_model(modelname)
-    if export_path is not None:
-        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
-        path = os.path.join(export_path, filename)
-        if not os.path.exists(path):
-            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-            export_onnx(modelname, model, param, path, True, dtype)
-        else:
-            print("Onnx path exists, skipping export.")
-
-    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
-
-
-if __name__ == "__main__":
-    main()
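The removed `run_pytorch_mlu.py` above times the model with an untimed warm-up phase followed by a synchronized, timed loop. A minimal sketch of that pattern, assuming (as the script itself does) that the Cambricon `torch_mlu` extension is installed and provides the `mlu` device and `torch.mlu.synchronize()`; the `benchmark` helper name is ours:

```python
import time

import torch
import torch_mlu  # registers the "mlu" device with PyTorch


def benchmark(model, inputs, n_iter=20, warmup=10):
    model = model.eval().to("mlu")
    inputs = inputs.to("mlu")
    with torch.no_grad():
        for _ in range(warmup):      # warm-up runs, not timed
            model(inputs)
    torch.mlu.synchronize()          # drain queued work before starting the clock
    begin = time.time()
    with torch.no_grad():
        for _ in range(n_iter):
            model(inputs)
    torch.mlu.synchronize()          # wait for the last kernel before stopping the clock
    return (time.time() - begin) / n_iter
```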
@@ -1,39 +1,35 @@
-import sys
-sys.path.append('../')
 
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
-from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model
 
 
 def parse_args():
     parser = argparse.ArgumentParser(description="launch distributed infinitensor")
     parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
     parser.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
     )
     parser.add_argument(
         "--name", type=str, default="test", help="name of this instance."
     )
     parser.add_argument(
-        "--model", type=str, required=True, help="path to the ONNX model file."
+        "--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx",
+        help="path to the ONNX model file."
     )
     parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
     parser.add_argument("--length", type=int, default=1, help="sequence length.")
     parser.add_argument(
         "--gen_std",
+        default=False,
         action="store_true",
         help="whether to generate the standard results.",
     )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
     args = parser.parse_args()
     print("arg setting: ", args)
     return (
@@ -44,46 +40,39 @@ def parse_args():
         args.batch_size,
         args.length,
         args.gen_std,
-        args.type,
     )
 
 
-def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
-    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
     load_inputs(stub, world_size, rank)
     # stub.tune()
     stub.run()
     # get outputs
+    time.sleep(0.01)
    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
 
     # bench
+    begin = time.time()
     for _ in range(n):
         stub.run()
-    begin = time.time()
-    for _ in range(n * 2):
-        stub.run()
     end = time.time()
-    avg_time = (end - begin) / (n * 2)
+    avg_time = (end - begin) / n
     print(f"average time: {avg_time}")
     return outputs
 
-def load_inputs(stub, world_size=1, rank=0):
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = np.load(f"./data/input_{i}.npy")
-        if all(x == y for x,y in zip(input.shape,tensor.shape())):
-            tensor.copyin_numpy(input)
-        else:
-            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
 
-def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
     results = np.load(f"./data/output.npy")
-    outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
-    print("outputs abs mean:", abs(outputs).mean())
-    print("max abs diff:", abs(outputs - results).max())
+    outputs = run_model(model, runtime, world_size, rank)
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
 
 
 def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
 ):
     dist_name = name + "_dist"
     model = parallel_model(model, world_size, rank)
@@ -96,7 +85,7 @@ def start_worker(
         save_as_external_data=True,
         location=extern_path,
     )
-    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
     runtime = backend.BangRuntime(local_rank)
     # print("init comm")
     runtime.init_comm(
@@ -104,12 +93,13 @@ def start_worker(
         world_size,
         rank,
     )
-    run_and_compare(name, model, runtime, world_size, rank, data_type)
+    run_and_compare(name, model, runtime, world_size, rank)
 
 
-def start_single(name, model, data_type):
+def start_single(name, model):
     runtime = backend.BangRuntime(0)
-    run_and_compare(name, model, runtime, data_type=data_type)
+    run_and_compare(name, model, runtime)
 
 
 def generate_input_output(model):
     os.makedirs(os.path.dirname("./data/"), exist_ok=True)
@@ -142,36 +132,55 @@ def generate_input_output(model):
     np.save(f"./data/output", output)
 
 
+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(f"./data/input_{i}.npy")
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\n"
+          f"Max relative difference: {max_relative_diff}")
+    return max_absolute_diff, max_relative_diff
+
+
 def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
-    data_type = "default" if data_type == "fp32" else data_type
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
 
     model = onnx.load(model_path)
 
     # generate standart output
     if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        generate_input_output(model)
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
         return
 
-    if nproc_per_node == 1:
-        # run single process.
-        # use standalone process to isolate bang.
-        print("run model by single MLU.")
-        # p = mp.Process(target=start_single, args=(name, model, data_type))
-        # p.start()
-        # p.join()
-        start_single(name, model, data_type)
+    # run single process.
+    # use standalone process to isolate cuda.
+    print("run model by single MLU.")
+    p = mp.Process(target=start_single, args=(name, model))
+    p.start()
+    p.join()
+    return
 
     # run distributed parallel.
     world_size = nnodes * nproc_per_node
-    print(f"run model by {world_size} MLU in parallel.")
+    print(f"run model by {world_size} MLUs in parallel.")
     workers = [
         mp.Process(
             target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
+            args=(name, world_size, rank, rank % nproc_per_node, model),
        )
         for rank in range(world_size)
     ]
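The `getDiff` helper added above reports the maximum absolute difference and a summed relative difference between a reference array and a test array. An illustrative call with made-up arrays; only the metric computation mirrors the script:

```python
import numpy as np


def get_diff(base, test):
    max_abs = np.max(np.abs(base - test))
    base64 = base.astype(np.float64).ravel()
    test64 = test.astype(np.float64).ravel()
    rel = np.sum(np.abs(base64 - test64)) / (np.sum(np.abs(base64)) + 1e-9)
    return max_abs, rel


reference = np.array([1.0, 2.0, 3.0])
candidate = np.array([1.0, 2.1, 2.9])
print(get_diff(reference, candidate))  # -> roughly (0.1, 0.033)
```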
@@ -1,14 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-models=("bert" "gpt2" "llama")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only
-        done
-    done
-done
@@ -1,280 +0,0 @@
-import sys
-sys.path.append('../')
-
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-from functools import wraps
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=2, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
-    )
-    parser.add_argument(
-        "--model", type=str, default="", help="path to the ONNX model file."
-    )
-    parser.add_argument(
-        "--gen_std",
-        default=False,
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    parser.add_argument(
-        "--run_single",
-        default=False,
-        action="store_true",
-        help="whether run model with single process with standard inputs"
-    )
-    parser.add_argument(
-        "--input_dir",
-        default="./",
-        help="path to save model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        default="./",
-        help="path to save model standard output"
-    )
-    parser.add_argument(
-        "--internal_model_dir",
-        default="./",
-        help="path to save internal onnx model for parallel run"
-    )
-    args = parser.parse_args()
-
-    # check path, mkdir if not exist
-    check_exists(args.input_dir)
-    check_exists(args.result_dir)
-    check_exists(args.internal_model_dir)
-
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model,
-        args.gen_std,
-        args.run_single,
-        args.input_dir,
-        args.result_dir,
-        args.internal_model_dir
-    )
-
-
-"""
-utils function for this scripts
-"""
-def check_exists(path: str):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-def np_assert(base, test, rtol=1e-2, atol=1e-1):
-    # np.testing.assert_allclose(test, base, rtol, atol)
-    print("max abs diff:", abs(base - test).max())
-
-
-"""
-Perf wrapper, run function n times
-then average
-"""
-def perf_it(n):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            # warmup
-            for _ in range(n):
-                func(*args, **kwargs)
-
-            t_total = 0
-            for _ in range(n):
-                t0 = time.time()
-                func(*args, **kwargs)
-                t1 = time.time()
-                t_total += t1 - t0
-            avg_time = (t_total) / n
-            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
-            return avg_time
-        return wrapper
-    return decorator
-
-
-"""
-Run InfiniTensor model with Standard input
-check=True: check with standard output gen by pytorch
-perf=True: run n times to get avg time
-"""
-def run_model(task_name,
-              model,
-              runtime,
-              world_size=1,
-              rank=0,
-              n=10,
-              check=True,
-              perf=True):
-
-    stub = OnnxStub(model, runtime,
-                    use_naive_allocator=True \
-                    if task_name == "llama" else False)
-
-    # load in Onnx model inputs
-    def load_inputs(stub: OnnxStub):
-        # check exists
-        inputs = []
-        for i, (name, tensor) in enumerate(stub.inputs.items()):
-            input_path = os.path.join(input_dir, \
-                    f"{task_name}_input_{i}.npy")
-            print(input_path)
-            if os.path.exists(input_path):
-                input = np.load(input_path)
-            else :
-                raise KeyError(f"{i} th input of model not exists")
-            # check shape
-            if all(x == y for x,y in zip(input.shape, tensor.shape())):
-                tensor.copyin_numpy(input)
-            else:
-                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
-
-    load_inputs(stub)
-    # stub.tune()
-    stub.run()
-    time.sleep(0.01)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-
-    # check output results with standard output
-    if check:
-        st_output_path = os.path.join(result_dir, \
-                f"{task_name}_output.npy")
-        assert os.path.exists(st_output_path) , \
-                "standard output not exists"
-        st_output = np.load(st_output_path)
-        if np.isnan(output).any():
-            print("Nan in output")
-            exit()
-        np_assert(st_output, output)
-
-    # perf
-    if perf:
-        @perf_it(n)
-        def perf_infinitensor(stub: OnnxStub):
-            stub.run()
-        perf_infinitensor(stub)
-
-    return output
-
-
-"""
-Start a worker in Parallel
-"""
-def start_worker(name: str,
-                 world_size: int,
-                 rank: int,
-                 local_rank: int,
-                 model: onnx.ModelProto):
-
-    dist_name = name + "_dist"
-    # partial a onnx model to world_size part
-    model = parallel_model(model, world_size, rank)
-    onnx.save(model, os.path.join(internal_model_dir, \
-            f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
-    runtime = backend.KUNLUNRuntime(local_rank)
-    # print("init comm")
-    runtime.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    run_model(name, model, runtime, world_size, rank)
-
-
-"""
-generate standard input/output with
-sigle card run
-"""
-def gen_standard(task_name: str, model: onnx.ModelProto):
-    runtime = backend.KUNLUNRuntime(0)
-    stub = OnnxStub(model, runtime)
-    position_id = 0
-    # generate random input for model
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = tensor.copyout_numpy()
-        if np.issubdtype(input.dtype, np.integer):
-            if input.size == 1:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-            else:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-        elif input.dtype == np.bool_:
-            input = np.random.randint(0,2,size=input.shape) > 0
-        else:
-            if i == 0:
-                input = np.ones(input.shape).astype(input.dtype)
-                position_id = input.shape[-1] - 1
-            else:
-                input = np.random.rand(*input.shape).astype(input.dtype)
-        tensor.copyin_numpy(input)
-        np.save(os.path.join(input_dir, \
-                f"{task_name}_input_{i}.npy"), input)
-    stub.run()
-    # print(stub.outputs)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-    if np.isnan(output).any():
-        print("Nan in output")
-        exit()
-    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
-
-
-def main():
-
-    global input_dir, result_dir, internal_model_dir
-
-    nnodes, nproc_per_node, task_name, \
-        model_path, gen_std, run_single, \
-        input_dir, result_dir, internal_model_dir = parse_args()
-
-    # load input onnx model
-    model = onnx.load(model_path)
-
-    # generate standart output
-    if gen_std:
-        print("Generate inputs and outputs.")
-        gen_standard(task_name, model)
-        return
-
-    if run_single:
-        print("Run model by one GPU card.")
-        runtime = backend.KUNLUNRuntime(0)
-        run_model(task_name, model, runtime)
-        return
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    print(f"Run model by {world_size} GPU in parallel.")
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(task_name, world_size, rank, rank % nproc_per_node, model),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
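The removed `perf_it` decorator wraps a function so that it is first run `n` times as warm-up and then `n` more times under a timer, printing the average. A self-contained sketch of the same pattern; the `dummy_workload` function is a stand-in, not something from the repository:

```python
import time
from functools import wraps


def perf_it(n):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for _ in range(n):               # warm-up, not timed
                func(*args, **kwargs)
            t_total = 0.0
            for _ in range(n):               # timed runs
                t0 = time.time()
                func(*args, **kwargs)
                t_total += time.time() - t0
            avg = t_total / n
            print(f"Avg runtime over {n} runs: {avg:.6f} s")
            return avg
        return wrapper
    return decorator


@perf_it(5)
def dummy_workload():
    return sum(i * i for i in range(100_000))


dummy_workload()  # prints the averaged runtime
```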
@@ -1,36 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("bert" "gpt2")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            # run pytorch model
-            echo "Run pytorch $model with batch_size=$bs length=$len ."
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
-            for n in "${nproc[@]}"; do
-                # run infinitensor
-                echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
-                python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*.pb" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
@@ -1,35 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("llama")
-batch_size=(1 )
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            echo "Run pytorch llama with batch_size="$bs" and length="$len""
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
-            for n in "${nproc[@]}"; do
-                # run pytorch model
-                echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
-                python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*0c" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
@@ -1,245 +0,0 @@
-import argparse
-import torch
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import os
-import sys
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model output data"
-    )
-    parser.add_argument(
-        "--export_only",
-        action="store_true"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.input_dir,
-        args.result_dir,
-        args.export_only
-    )
-
-
-def get_model(modelname):
-    if modelname == "bert":
-        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-        voc_size = BertConfig().vocab_size
-    elif modelname == "gpt2":
-        model = GPT2Model.from_pretrained("gpt2")
-        voc_size = GPT2Config().vocab_size
-    elif modelname == "opt":
-        model = OPTModel.from_pretrained("./opt-125m")
-        voc_size = OPTConfig().vocab_size
-    elif modelname == "llama":
-        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-        voc_size = LlamaConfig().vocab_size
-    else :
-        raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-
-def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
-    inputs = torch.from_numpy(data).to("cuda")
-    torch_model = torch_model.to("cuda")
-
-    n_iter = 10
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.cuda.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.cuda.synchronize()
-            outputs = torch_model(inputs)
-            #
-            torch.cuda.synchronize()
-    torch.cuda.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    torch.cuda.memory.empty_cache()
-    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
-            np.array(outputs))
-    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
-
-
-def export_onnx(model_name, model, data, path, extern=False):
-    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-
-    if model_name != "llama":
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model,
-                                     skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-                                     # skipped_optimizers=['fuse_qkv'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path.split("/")[-1],
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = in_model_path[:-5] + ".sim.onnx"
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print("compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                     tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    global input_dir, result_dir
-
-    modelname, batchsize, seqlen, \
-        export_path, input_dir, result_dir, export_only = parse_args()
-
-    model, voc_size = get_model(modelname) # pytorch model
-
-    if export_path is not None:
-        os.makedirs(export_path, exist_ok=True)
-        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
-        path = os.path.join(export_path, filename)
-        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-        export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
-        if export_only:
-            return
-
-    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
-
-if __name__ == "__main__":
-    main()
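Both removed `run_pytorch*` scripts externalize large initializers during export: tensors above a size threshold are moved out of the `.onnx` file into a side `.pb` file stored next to it. A hedged sketch of that step with the standard `onnx` API; `"model.onnx"` is a placeholder path, not one from the repository:

```python
import onnx
from onnx.external_data_helper import convert_model_to_external_data

path = "model.onnx"                                        # placeholder path
onnx_model = onnx.load(path)
convert_model_to_external_data(
    onnx_model,
    all_tensors_to_one_file=True,
    location=path.split("/")[-1].replace(".onnx", ".pb"),  # stored relative to the model file
    size_threshold=1024,                                   # only tensors larger than 1 KiB move out
    convert_attribute=False,
)
onnx.save(onnx_model, path)
```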
@@ -0,0 +1,213 @@
+import argparse
+import os
+import time
+import multiprocessing as mp
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
+import numpy as np
+from parallel_opt import parallel_model
+
+st_input_dir = "standard/inputs/"
+st_output_dir = "standard/outputs/"
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
+    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
+    parser.add_argument(
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
+    )
+    parser.add_argument(
+        "--name", type=str, default="test", help="name of this instance."
+    )
+    parser.add_argument(
+        "--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    parser.add_argument("--length", type=int, default=1, help="sequence length.")
+    parser.add_argument(
+        "--gen_std",
+        default=False,
+        action="store_true",
+        help="whether to generate the standard results.",
+    )
+    parser.add_argument(
+        "--run_single",
+        default=False,
+        action="store_true",
+        help="whether run model with single process with standard inputs"
+    )
+    args = parser.parse_args()
+    print("arg setting: ", args)
+    return (
+        args.num_nodes,
+        args.nproc_per_node,
+        args.name,
+        args.model,
+        args.batch_size,
+        args.length,
+        args.gen_std,
+        args.run_single
+    )
+
+
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
+    load_inputs(stub, world_size, rank)
+    # stub.tune()
+    stub.run()
+    # get outputs
+    time.sleep(0.01)
+    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
+
+    # bench
+    begin = time.time()
+    for _ in range(n):
+        stub.run()
+    end = time.time()
+    avg_time = (end - begin) / n
+    print(f"average time: {avg_time}")
+    return outputs
+
+
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
+    results = np.load(os.path.join(st_output_dir,f"output.npy"))
+    outputs = run_model(model, runtime, world_size, rank)
+    print(outputs[:100])
+    if np.isnan(outputs).any():
+        print("Nan in output")
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
+
+
+def start_worker(
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
+):
+    dist_name = name + "_dist"
+    model = parallel_model(model, world_size, rank)
+    extern_path = f"./{dist_name}_rank{rank}.pb"
+    if os.path.exists(extern_path):
+        os.remove(extern_path)
+    onnx.save_model(
+        model,
+        f"./{dist_name}_rank{rank}.onnx",
+        save_as_external_data=True,
+        location=extern_path,
+    )
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    runtime = backend.KUNLUNRuntime(local_rank)
+    # print("init comm")
+    runtime.init_comm(
+        dist_name,
+        world_size,
+        rank,
+    )
+    run_and_compare(name, model, runtime, world_size, rank)
+
+
+def start_single(name, model):
+    runtime = backend.KUNLUNRuntime(0)
+    run_and_compare(name, model, runtime)
+
+
+def generate_input_output(model):
+    runtime = backend.KUNLUNRuntime(0)
+    stub = OnnxStub(model, runtime)
+    position_id = 0
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = tensor.copyout_numpy()
+        if np.issubdtype(input.dtype, np.integer):
+            if input.size == 1:
+                # input = np.array([position_id])
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+            else:
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+        elif input.dtype == np.bool_:
+            input = np.random.randint(0,2,size=input.shape) > 0
+        else:
+            if i == 0:
+                input = np.ones(input.shape).astype(input.dtype)
+                position_id = input.shape[-1] - 1
+            else:
+                input = np.random.rand(*input.shape).astype(input.dtype)
+        tensor.copyin_numpy(input)
+        np.save(os.path.join(st_input_dir, f"input_{i}"), input)
+    stub.run()
+    # print(stub.outputs)
+    time.sleep(0.01)
+    output = next(stub.outputs.values().__iter__()).copyout_numpy()
+    print(output[:100])
+    if np.isnan(output).any():
+        print("Nan in output")
+    np.save(os.path.join(st_output_dir, f"output"), output)
+
+
+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
+
+    return max_absolute_diff, max_relative_diff
+
+
+def main():
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
+
+    model = onnx.load(model_path)
+
+    # generate standart output
+    if gen_std:
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
+        return
+
+    # # run single process.
+    # # use standalone process to isolate cuda.
+    if run_single:
+        print("run model by single GPU.")
+        p = mp.Process(target=start_single, args=(name, model))
+        p.start()
+        p.join()
+        return
+
+    # run distributed parallel.
+    world_size = nnodes * nproc_per_node
+    print(f"run model by {world_size} GPU in parallel.")
+    workers = [
+        mp.Process(
+            target=start_worker,
+            args=(name, world_size, rank, rank % nproc_per_node, model),
+        )
+        for rank in range(world_size)
+    ]
+
+    for w in workers:
+        w.start()
+
+    for w in workers:
+        w.join()
+
+
+if __name__ == "__main__":
+    main()
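The `load_inputs` helper in the new launch script copies a saved input straight in when its shape matches the per-rank tensor, and otherwise splits it column-wise with `np.hsplit` so that each rank receives its slice. A small illustration of that sharding rule with made-up shapes:

```python
import numpy as np

world_size = 4
full_input = np.arange(2 * 8).reshape(2, 8)   # saved, unsharded input

shards = np.hsplit(full_input, world_size)    # four (2, 2) slices along the last axis
for rank, shard in enumerate(shards):
    print(rank, shard.shape)                  # each rank would copy in its own slice
```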
@@ -1 +0,0 @@
-Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
@@ -110,6 +110,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
             s_dim = 0
         elif in_plc.dim == 2:
             s_dim = 1
+
         assert s_dim != -1
         assert out_dims[s_dim] % tp_world_size == 0, out_dims
         out_dims[s_dim] //= tp_world_size
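The hunk above keeps the invariant that the dimension chosen for tensor parallelism must divide evenly by the world size before it is shrunk to the per-rank size. A tiny illustration with hypothetical shapes:

```python
tp_world_size = 4
out_dims = [1, 1024, 4096]   # hypothetical full output shape
s_dim = 2                    # dimension to shard

assert out_dims[s_dim] % tp_world_size == 0, out_dims
out_dims[s_dim] //= tp_world_size
print(out_dims)              # -> [1, 1024, 1024], the per-rank shape
```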
@@ -3,10 +3,11 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3);
 void expandKernel(int dType, void *input, void *output, int nDims,
                   int outputsize, SmallArray inputShape,
                   SmallArray outputShape);
 
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len);
 }; // namespace infini
@@ -1,16 +1,14 @@
 #pragma once
 #include "operators/unary.h"
 #include "utils/small_array.h"
 
 namespace infini {
 
-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize);
-void whereKernel(const half *inputX, const half *inputY,
-                 const uint8_t *condition, half *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int a0, int a1, int a2,
+                 int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                 int c3, int d0, int d1, int d2, int d3);
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                  int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                  SmallArray conditionShape, SmallArray outputShape, int xSize,
                  int ySize, int cSize);
@@ -21,7 +21,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
         ctx = xdnn::create_context();
         // 10GB for Longformer
         // size_t longformerNum = 3lu * (1 << 30);
-        size_t workspaceSize = 2llu << 30; // 2 GB
+        size_t workspaceSize = 3llu << 30; // 3 GB
         KUNLUNPtr wkspacePtr = alloc(workspaceSize);
        workspace =
            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);

@@ -42,7 +42,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
    KUNLUNPtr alloc(size_t size) override {
        void *ptr;
        checkKUNLUNError(
-            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
+            xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
        return ptr;
    }
    void dealloc(void *ptr) override { xpu_free(ptr); }
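Reviewer note (not part of the patch): the workspace literal is a plain byte count, so the bump from `2llu << 30` to `3llu << 30` is exactly one extra GiB. A trivial compile-time check, shown only to make the arithmetic explicit:

```cpp
int main() {
    // 3llu << 30 is 3 * 2^30 bytes, i.e. 3 GiB, the new KUNLUN workspace size.
    static_assert((3llu << 30) == 3ull * 1024 * 1024 * 1024, "expected 3 GiB");
    return 0;
}
```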
@@ -34,8 +34,8 @@ class XcclCommunicatorObj final : public CommunicatorObj {
         auto begin = std::chrono::steady_clock::now();
         while (!std::filesystem::exists(filePath)) {
             auto now = std::chrono::steady_clock::now();
-            _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
-                         "time limit (100s) exceeded.");
+            _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
+                         "time limit (10s) exceeded.");
             std::this_thread::sleep_for(std::chrono::milliseconds(100));
         }
         std::ifstream ifs(filePath, std::ios::binary);
@@ -967,7 +967,7 @@ class OnnxStub:
                     tensors[node.input[0]],
                     tensors.get(node.output[0]),
                 )
-            elif node.op_type in ["Constant", "ConstantOfShape"]:
+            elif node.op_type == "Constant":
                 output_name = node.output[0]
                 attributes = _parse_attribute(node)
                 tensor = attributes["value"]
@@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
                                                    dim.data()));
             NlCastType = CNNL_CAST_UINT32_TO_INT64;
             break;
-        case CastType::Float162Float:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_HALF_TO_FLOAT;
-            break;
-        case CastType::Float2Float16:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
-            break;
         default:
             IT_TODO_HALT();
         }
@@ -19,16 +19,14 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

         auto inDims = op->getInputs(0)->getDims();
-        auto fiterDims = op->getInputs(1)->getDims();
         auto outDims = op->getOutput()->getDims();
+        auto fiterDims = op->getOutput(1)->getDims();

         float eps = op->getEps();
         const int axis = op->getAxis();

-        Shape outMeanDims(outDims);
-        outMeanDims.erase(outMeanDims.begin() + axis);
-
-        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc, outMeanDesc;
+        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;
         checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
         checkCnnlError(cnnlSetTensorDescriptor(
             inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
@@ -41,23 +39,15 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
         checkCnnlError(cnnlSetTensorDescriptor(
             outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
             outDims.size(), outDims.data()));
-        checkCnnlError(cnnlCreateTensorDescriptor(&outMeanDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            outMeanDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
-            outMeanDims.size(), outMeanDims.data()));
         size_t wsSize;
         cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
                                         &wsSize);
         BangPtr wsData = context->getWorkspace(wsSize);
-        size_t meanSize =
-            cnnlGetTensorElementNum(outMeanDesc) * op->getDType().getSize();
-        BangPtr meanData = context->getWorkspace(meanSize);
-        BangPtr rstdData = context->getWorkspace(meanSize);

         cnnlStatus_t stat = cnnlLayerNormForward(
             context->cnnlHandle(), inDesc, inputData, axis, fiterDesc,
             scaleData, biasData, eps, wsData, wsSize, outDesc, outputData,
-            outMeanDesc, meanData, rstdData);
+            inDesc, NULL, NULL);

         if (stat != CNNL_STATUS_SUCCESS)
             return;
@@ -66,13 +66,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
         cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSB, &transB,
                               sizeof(int32_t));

-        std::string computeTypeStr = op->getComputeType();
-        if (computeTypeStr == "tf32") {
-            int32_t tf32 = 1;
-            cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_ALLOW_TF32, &tf32,
-                                  sizeof(int32_t));
-        }
-
         cnnlMatMulAlgo_t bmm_algo;
         cnnlMatMulAlgoCreate(&bmm_algo);
@@ -5,34 +5,42 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+const int repeat = 1;
 template <class T>
 __global__ void _div_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] /
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] / ((T *)y)[yIdx];
     }
 }
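Reviewer note (not part of the patch): the rewritten element-wise kernels replace per-axis modulo bookkeeping with one flat index plus an optional 4-D broadcast remap (`xIdx`/`yIdx`). The host-side C++ sketch below reproduces that index mapping so it can be checked without a GPU; the helper name is illustrative.

```cpp
#include <cassert>

// Map a flat output index i (over shape c0 x c1 x c2 x c3) to the flat index
// into an input of shape a0 x a1 x a2 x a3 that broadcasts to the output,
// mirroring the xIdx computation in the rewritten _div/_add/_pow kernels.
int broadcastIndex(int i, int a0, int a1, int a2, int a3,
                   int c0, int c1, int c2, int c3) {
    int stride1 = c2 * c3;
    int stride0 = c1 * stride1;
    int n = c0 * stride0;
    if (a0 * a1 * a2 * a3 == n) // same total size: identity mapping
        return i;
    if (a0 * a1 * a2 * a3 == 1) // scalar input: always element 0
        return 0;
    int c0_index = i / stride0;
    int c1_index = (i % stride0) / stride1;
    int c2_index = (i % stride1) / c3;
    int c3_index = i % c3;
    return (c0_index % a0) * a1 * a2 * a3 + (c1_index % a1) * a2 * a3 +
           (c2_index % a2) * a3 + c3_index % a3;
}

int main() {
    // Broadcasting a {1,1,3,1} tensor over a {2,2,3,2} output:
    // output element (1,0,2,1) has flat index 17 and should read input
    // element (0,0,2,0), i.e. flat index 2.
    assert(broadcastIndex(17, 1, 1, 3, 1, 2, 2, 3, 2) == 2);
    return 0;
}
```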
@@ -41,28 +49,36 @@ __global__ void _add_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] +
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] + ((T *)y)[yIdx];
     }
 }
@@ -71,29 +87,36 @@ __global__ void _pow_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] =
-            pow(((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                         a2_index * a3 + a3_index],
-                ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                         b2_index * b3 + b3_index]);
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = pow(((T *)x)[xIdx], ((T *)y)[yIdx]);
     }
 }
@@ -102,31 +125,36 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                              int a3, int b0, int b1, int b2, int b3, int c0,
                              int c1, int c2, int c3) {
     int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;
-
-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
-
-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
-
-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((bool *)z)[i] =
-            ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                     a2_index * a3 + a3_index] <
-                    ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                             b2_index * b3 + b3_index]
-                ? true
-                : false;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((bool *)z)[i] = ((T *)x)[xIdx] < ((T *)y)[yIdx] ? true : false;
     }
 }
@@ -176,7 +204,6 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
     default: \
         IT_TODO_HALT(); \
     }

 template <class T>
 __global__ void _div_const_kernel(void const *__restrict__ x,
                                   void const *__restrict__ y,
@@ -269,7 +296,8 @@ void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(div, dType)
 }
 void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@@ -278,7 +306,8 @@ void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(add, dType)
 }
 void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@@ -286,7 +315,8 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                 int c3) {
     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     if (dType == 1) {
         _pow_kernel<float>
             <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
@@ -324,7 +354,8 @@ void less_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                  int c3) {
     int blocksize = block_work_size();
     int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
     SWITCH_DTYPE(less, dType)
 }
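Reviewer note (not part of the patch): every launcher now divides by `repeat * block_work_size()` because each thread processes `repeat` consecutive elements; with `repeat = 1` this reduces to the old formula. A host-side check of the ceiling division, with the constants taken from the file above:

```cpp
#include <cassert>

constexpr unsigned int num_threads() { return 32 * 4; }
constexpr int thread_work_size() { return 4; }
constexpr int block_work_size() { return thread_work_size() * num_threads(); }
const int repeat = 1;

// Enough blocks so that gridsize * repeat * block_work_size() >= num.
int gridFor(int num) {
    return (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
}

int main() {
    assert(block_work_size() == 512);
    assert(gridFor(512) == 1); // exactly one block's worth of work
    assert(gridFor(513) == 2); // one extra element spills into a second block
    return 0;
}
```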
@@ -12,22 +12,33 @@ class ExpandCuda : public CudaKernelWithoutConfig {

         void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
-        const auto &out_Shape = op->getShape();             // output shape
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getOutput()->getDims(); // output shape

-        SmallArray inputShape, outputShape;
-        int nDims = op->getInputs(0)->getDims().size();
-
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1; // the length of the output vector after flatten
-        for (int i = 0; i < nDims; ++i) {
-            outputShape.data[i] = out_Shape[i];
-            inputShape.data[i] = in_Shape[i];
-            outputsize *= out_Shape[i];
-        }
         const int dType = op->getDType().getIndex();
-        expandKernel(dType, inputData, outputData, nDims, outputsize,
-                     inputShape, outputShape);
+        if (a_dim.size() > 4 || b_dim.size() > 4) {
+            SmallArray inputShape, outputShape;
+            int nDims = op->getInputs(0)->getDims().size();
+
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int outputsize = 1; // the length of the output vector after flatten
+            for (int i = 0; i < nDims; ++i) {
+                outputShape.data[i] = b_dim[i];
+                inputShape.data[i] = a_dim[i];
+                outputsize *= b_dim[i];
+            }
+            const int dType = op->getDType().getIndex();
+            expandKernel(dType, inputData, outputData, nDims, outputsize,
+                         inputShape, outputShape);
+
+        } else {
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3],
+                         b[0], b[1], b[2], b[3]);
+        }
     }
 };
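Reviewer note (not part of the patch): the new ≤4-D fast path right-aligns both shapes into `{1, 1, 1, 1}` so the kernel can take eight plain scalars. A standalone sketch of just that padding step, with an illustrative helper name:

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <vector>

// Right-align a shape of rank <= 4 into {1,1,1,1}, as ExpandCuda and
// matmulCublas do before calling the scalar-argument expandKernel overload.
std::array<int, 4> padTo4(const std::vector<int> &dim) {
    std::array<int, 4> out{1, 1, 1, 1};
    std::copy(dim.begin(), dim.end(), out.begin() + (4 - dim.size()));
    return out;
}

int main() {
    auto a = padTo4({3, 5}); // becomes {1, 1, 3, 5}
    assert(a[0] == 1 && a[1] == 1 && a[2] == 3 && a[3] == 5);
    return 0;
}
```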
@@ -6,7 +6,31 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+const int repeat = 1;
+template <class T>
+__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
+                              int a3, int b0, int b1, int b2, int b3) {
+
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+    int stride1 = b2 * b3;
+    int stride0 = b1 * stride1;
+    int n = b0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        if (aIdx) {
+            int b0_index = i / stride0;
+            int b1_index = (i % stride0) / stride1;
+            int b2_index = (i % stride1) / b3;
+            int b3_index = i % b3;
+            xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
+                   (b2_index % a2) * a3 + b3_index % a3;
+        }
+        ((T *)output)[i] = ((T *)input)[xIdx];
+    }
+}
 template <class T>
 __global__ void _expandKernel(void *input, void *output, int nDims,
                               int outputsize, infini::SmallArray inputShape,
@@ -38,7 +62,6 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
         ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
     }
 }

 template <class T>
 static __global__ void _expandRowKernel(void *__restrict__ dst,
                                         void const *__restrict__ src) {
@@ -50,9 +73,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
 namespace infini {

 #define CASE(T) \
-    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize, \
-                                   0, CUDAStream::getCurrentStream()>>>( \
-        input, output, nDims, outputsize, inputShape, outputShape);
+    _expandKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, a0, a1, a2, a3, b0, b1, b2, b3);

 #define SWITCH_DTYPE(DTYPE) \
     switch (DTYPE) { \
@@ -96,14 +119,56 @@ namespace infini {
         IT_TODO_HALT(); \
     }

+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3) {
+    int blocksize = block_work_size();
+    int outputsize = b0 * b1 * b2 * b3;
+    int gridsize = (outputsize + repeat * block_work_size() - 1) /
+                   (repeat * block_work_size());
+    SWITCH_DTYPE(dType)
+}
+#define CASECurrency(T) \
+    _expandKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, nDims, outputsize, inputShape, outputShape);
+
+#define SWITCHCurrency_DTYPE(DTYPE) \
+    switch (DTYPE) { \
+    case 1: \
+        CASECurrency(1) break; \
+    case 2: \
+        CASECurrency(2) break; \
+    case 3: \
+        CASECurrency(3) break; \
+    case 4: \
+        CASECurrency(4) break; \
+    case 5: \
+        CASECurrency(5) break; \
+    case 6: \
+        CASECurrency(6) break; \
+    case 7: \
+        CASECurrency(7) break; \
+    case 10: \
+        CASECurrency(10) break; \
+    case 11: \
+        CASECurrency(11) break; \
+    case 12: \
+        CASECurrency(12) break; \
+    case 13: \
+        CASECurrency(13) break; \
+    case 16: \
+        CASECurrency(16) break; \
+    default: \
+        IT_TODO_HALT(); \
+    }
+
 void expandKernel(int dType, void *input, void *output, int nDims,
                   int outputsize, SmallArray inputShape,
                   SmallArray outputShape) {
     int blocksize = block_work_size();
     int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
-    SWITCH_DTYPE(dType)
+    SWITCHCurrency_DTYPE(dType)
 }

 #define CASE_ROW(T) \
     _expandRowKernel<float> \
         <<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
@@ -150,7 +215,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
         IT_TODO_HALT(); \
     }

-// Optimization for expanding a row vector. The row length must be a multiple of 32
+// Optimization for expanding a row vector. The row length must be a multiple of
+// 32
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                      int row_len) {
     // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
@@ -160,7 +226,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
     // block: b x 32
     auto c = row_len / 32, b = c;
     if (b > 32) {
-        for (b = 32; c % b != 0; --b);
+        for (b = 32; c % b != 0; --b)
+            ;
     }
     auto a = c / b;
     dim3 grid(a, n_rows), block(32, b);
@@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
             beta_naive = 1.f;
             auto inC = op->getInputs(2);
             auto out = op->getOutput();
-            SmallArray inputShape, outputShape;
-            int nDims = out->getRank();
-            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-            // FIXME(constroy): use size_t for outputsize.
-            int outputsize = 1; // the length of the output vector after flatten
-            int offset = nDims - inC->getRank();
-            for (int i = 0; i < offset; ++i)
-                inputShape.data[i] = 1;
-            for (int i = 0; i < nDims; ++i) {
-                outputShape.data[i] = out->getDims()[i];
-                outputsize *= outputShape.data[i];
-                if (i >= offset)
-                    inputShape.data[i] = inC->getDims()[i - offset];
-            }
             const int dType = dataType.getIndex();

             // Bias in linear layer is row vector of (1,n), n is the number of
@@ -111,9 +98,40 @@ class matmulCublas : public Kernel {
                                 out->size() / inC->getDims()[0],
                                 inC->getDims()[0]);
             } else {
-                expandKernel(dType, inC->getRawDataPtr<void *>(),
-                             out->getRawDataPtr<void *>(), nDims, outputsize,
-                             inputShape, outputShape);
+                auto a_dim = out->getDims();
+                auto b_dim = inC->getDims(); // output shape
+
+                if (a_dim.size() > 4 || b_dim.size() > 4) {
+                    SmallArray inputShape, outputShape;
+                    int nDims = out->getRank();
+                    IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+                    // FIXME(constroy): use size_t for outputsize.
+                    int outputsize =
+                        1; // the length of the output vector after flatten
+                    int offset = nDims - inC->getRank();
+                    for (int i = 0; i < offset; ++i)
+                        inputShape.data[i] = 1;
+                    for (int i = 0; i < nDims; ++i) {
+                        outputShape.data[i] = out->getDims()[i];
+                        outputsize *= outputShape.data[i];
+                        if (i >= offset)
+                            inputShape.data[i] = inC->getDims()[i - offset];
+                    }
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), nDims,
+                                 outputsize, inputShape, outputShape);
+
+                } else {
+                    int a[4] = {1, 1, 1, 1};
+                    int b[4] = {1, 1, 1, 1};
+                    std::copy(a_dim.begin(), a_dim.end(),
+                              a + (4 - a_dim.size()));
+                    std::copy(b_dim.begin(), b_dim.end(),
+                              b + (4 - b_dim.size()));
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), a[0], a[1], a[2],
+                                 a[3], b[0], b[1], b[2], b[3]);
+                }
             }
         }
         // TODO:use compute type
@@ -16,31 +16,57 @@ class TransposeCuda : public CudaKernelWithoutConfig {
         void *const outputData = output->getRawDataPtr<void *>();
         const auto &inputShape = input->getDims();
         const auto &outputShape = output->getDims();
-        const auto &perm = op->getPermute();
+        const int dType = op->getDType().getIndex();
         int size = input->size();
         int nDims = input->getDims().size();

-        // Compute strides
-        SmallArray strides, buffer;
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int curStride = 1;
-        for (int i = nDims - 1; i >= 0; --i) {
-            buffer.data[i] = curStride;
-            curStride *= inputShape[i];
-        }
-        for (int i = 0; i < nDims; ++i) {
-            strides.data[i] = buffer.data[perm[i]];
-        }
-
-        SmallArray outputDims;
-        for (int i = 0; i < nDims; ++i) {
-            outputDims.data[i] = outputShape[i];
-        }
-
-        const int dType = op->getDType().getIndex();
-        transpose_kernel(dType, inputData, outputData, nDims, size, strides,
-                         outputDims);
+        //----------------
+        bool condition = true;
+        int gnum = 0;
+        for (int i = 0; i < nDims; i++) {
+            if (inputShape[i] > 1) {
+                while (gnum < nDims) {
+                    if (outputShape[gnum] > 1) {
+                        gnum += 1;
+                        break;
+                    } else {
+                        gnum += 1;
+                    }
+                }
+                if (inputShape[i] != outputShape[gnum - 1]) {
+                    condition = false;
+                    break;
+                }
+            }
+        }
+        //----------------
+        if (condition) {
+            cudaMemcpyAsync(outputData, inputData, op->getInputs(0)->getBytes(),
+                            cudaMemcpyDeviceToDevice,
+                            CUDAStream::getCurrentStream());
+
+        } else {
+            const auto &perm = op->getPermute();
+
+            // Compute strides
+            SmallArray strides, buffer;
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int curStride = 1;
+            for (int i = nDims - 1; i >= 0; --i) {
+                buffer.data[i] = curStride;
+                curStride *= inputShape[i];
+            }
+            for (int i = 0; i < nDims; ++i) {
+                strides.data[i] = buffer.data[perm[i]];
+            }
+
+            SmallArray outputDims;
+            for (int i = 0; i < nDims; ++i) {
+                outputDims.data[i] = outputShape[i];
+            }
+
+            transpose_kernel(dType, inputData, outputData, nDims, size, strides,
+                             outputDims);
+        }
     }
 };
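Reviewer note (not part of the patch): the new transpose path copies the buffer directly when the permutation leaves the sequence of non-unit extents unchanged, since the memory layout is then identical. A simplified host-side sketch of that intent, assuming the check is meant to compare non-unit extents in order:

```cpp
#include <cassert>
#include <vector>

// True when input and output shapes have the same non-unit extents in the
// same order, i.e. the permutation only moves axes of size 1, so a flat
// device-to-device copy is equivalent to the transpose.
bool isPureCopy(const std::vector<int> &in, const std::vector<int> &out) {
    std::vector<int> a, b;
    for (int d : in)
        if (d > 1) a.push_back(d);
    for (int d : out)
        if (d > 1) b.push_back(d);
    return a == b;
}

int main() {
    assert(isPureCopy({1, 8, 1, 16}, {8, 1, 16, 1})); // only size-1 axes move
    assert(!isPureCopy({2, 3}, {3, 2}));              // a real transpose
    return 0;
}
```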
@@ -24,8 +24,8 @@ __global__ void _transpose_kernel(void *input, void *output, int nDims,
 }
 #define CASE(T) \
     _transpose_kernel<DT_CUDA<T>::t> \
-        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>> \
-        (input, output, nDims, size, strides, outputShape);
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            input, output, nDims, size, strides, outputShape);

 #define SWITCH_DTYPE(DTYPE) \
     switch (DTYPE) { \
@@ -1,8 +1,8 @@
 #include "operators/where.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
 #include "cuda/cuda_where.h"
-#include "utils/operator_utils.h"

 namespace infini {

@@ -15,39 +15,50 @@ class WhereCuda : public CudaKernelWithoutConfig {
         void *const inputYData = (op->getInputs(1)->getRawDataPtr<void *>());
         void *const conditionData = (op->getInputs(2)->getRawDataPtr<void *>());
         void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &opInputXShape = op->getInputs(0)->getDims();
-        const auto &opInputYShape = op->getInputs(1)->getDims();
-        const auto &opConditionShape = op->getInputs(2)->getDims();
-        const auto &opOutputShape = op->getOutput()->getDims();
-
-        const int xSize = op->getInputs(0)->getRank();
-        const int ySize = op->getInputs(1)->getRank();
-        const int cSize = op->getInputs(2)->getRank();
-        int nDims = op->getOutput()->getDims().size();
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1;
-        SmallArray inputXShape, inputYShape, conditionShape, outputShape;
-        for (int i = nDims - 1; i >= 0; --i) {
-            outputShape.data[i] = opOutputShape[i];
-            outputsize *= outputShape.data[i];
-        }
-        broadcastShape(opInputXShape, inputXShape, nDims, xSize);
-        broadcastShape(opInputYShape, inputYShape, nDims, ySize);
-        broadcastShape(opConditionShape, conditionShape, nDims, cSize);
-
-        if (op->getDType() == DataType::Float32) {
-            whereKernel((float *)inputXData, (float *)inputYData,
-                        (uint8_t *)conditionData, (float *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else if (op->getDType() == DataType::Float16) {
-            whereKernel((half *)inputXData, (half *)inputYData,
-                        (uint8_t *)conditionData, (half *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else {
-            IT_ASSERT(false);
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getInputs(2)->getDims();
+        auto d_dim = op->getOutput()->getDims();
+        const int dTypeIndex = op->getDType().getIndex();
+        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4 ||
+            d_dim.size() > 4) {
+            const int xSize = op->getInputs(0)->getRank();
+            const int ySize = op->getInputs(1)->getRank();
+            const int cSize = op->getInputs(2)->getRank();
+
+            int nDims = op->getOutput()->getDims().size();
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int outputsize = 1;
+            SmallArray inputXShape, inputYShape, conditionShape, outputShape;
+            for (int i = nDims - 1; i >= 0; --i) {
+                outputShape.data[i] = d_dim[i];
+                outputsize *= outputShape.data[i];
+            }
+            broadcastShape(a_dim, inputXShape, nDims, xSize);
+            broadcastShape(b_dim, inputYShape, nDims, ySize);
+            broadcastShape(c_dim, conditionShape, nDims, cSize);
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, nDims, outputsize,
+                        inputXShape, inputYShape, conditionShape, outputShape,
+                        xSize, ySize, cSize);
+        } else {
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            int c[4] = {1, 1, 1, 1};
+            int d[4] = {1, 1, 1, 1};
+
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
+            std::copy(d_dim.begin(), d_dim.end(), d + (4 - d_dim.size()));
+
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, a[0], a[1], a[2],
+                        a[3], b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3],
+                        d[0], d[1], d[2], d[3]);
         }
     }
 };
@@ -1,6 +1,109 @@
 #include "cuda/cuda_common.h"
+#include "cuda/cuda_utility.h"
 #include "utils/small_array.h"
+const int repeat = 1;
+
+template <typename T>
+__global__ void
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int a0, int a1, int a2, int a3, int b0, int b1, int b2, int b3,
+             int c0, int c1, int c2, int c3, int d0, int d1, int d2, int d3) {
+
+    int stride1 = d2 * d3;
+    int stride0 = d1 * stride1;
+    int n = d0 * stride0;
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int inputXIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int inputYIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+        int conditionIdx = (c0 * c1 * c2 * c3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        bool cIdx = (c0 * c1 * c2 * c3 < n && c0 * c1 * c2 * c3 > 1);
+        if (aIdx || bIdx || cIdx) {
+            int d0_index = i / stride0;
+            int d1_index = (i % stride0) / stride1;
+            int d2_index = (i % stride1) / d3;
+            int d3_index = i % d3;
+            if (aIdx) {
+                int a0_index = d0_index % a0;
+                int a1_index = d1_index % a1;
+                int a2_index = d2_index % a2;
+                int a3_index = d3_index % a3;
+                inputXIdx = a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
+                            a2_index * a3 + a3_index;
+            }
+            if (bIdx) {
+                int b0_index = d0_index % b0;
+                int b1_index = d1_index % b1;
+                int b2_index = d2_index % b2;
+                int b3_index = d3_index % b3;
+                inputYIdx = b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
+                            b2_index * b3 + b3_index;
+            }
+            if (cIdx) {
+                int c0_index = d0_index % c0;
+                int c1_index = d1_index % c1;
+                int c2_index = d2_index % c2;
+                int c3_index = d3_index % c3;
+                conditionIdx = c0_index * c1 * c2 * c3 + c1_index * c2 * c3 +
+                               c2_index * c3 + c3_index;
+            }
+        }
+
+        ((T *)output)[i] = condition[conditionIdx] ? ((T *)inputX)[inputXIdx]
+                                                   : ((T *)inputY)[inputYIdx];
+    }
+}
+#define CASE(T) \
+    _whereKernel<DT_CUDA<T>::t> \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
+            inputX, inputY, condition, output, a0, a1, a2, a3, b0, b1, b2, b3, \
+            c0, c1, c2, c3, d0, d1, d2, d3);
+
+#define SWITCH_DTYPE(DTYPE) \
+    switch (DTYPE) { \
+    case 1: \
+        CASE(1) \
+        break; \
+    case 2: \
+        CASE(2) \
+        break; \
+    case 3: \
+        CASE(3) \
+        break; \
+    case 4: \
+        CASE(4) \
+        break; \
+    case 5: \
+        CASE(5) \
+        break; \
+    case 6: \
+        CASE(6) \
+        break; \
+    case 7: \
+        CASE(7) \
+        break; \
+    case 10: \
+        CASE(10) \
+        break; \
+    case 11: \
+        CASE(11) \
+        break; \
+    case 12: \
+        CASE(12) \
+        break; \
+    case 13: \
+        CASE(13) \
+        break; \
+    case 16: \
+        CASE(16) \
+        break; \
+    default: \
+        IT_TODO_HALT(); \
+    }
 __device__ int inferIndex(infini::SmallArray inputShape,
                           infini::SmallArray outputShape, int nDims, int size,
                           int outputIdx) {
@@ -19,11 +122,10 @@ __device__ int inferIndex(infini::SmallArray inputShape,
 }
 template <typename T>
 __global__ void
-_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
-             T *output, int nDims, int outputsize,
-             infini::SmallArray inputXShape, infini::SmallArray inputYShape,
-             infini::SmallArray conditionShape, infini::SmallArray outputShape,
-             int xSize, int ySize, int cSize) {
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int nDims, int outputsize, infini::SmallArray inputXShape,
+             infini::SmallArray inputYShape, infini::SmallArray conditionShape,
+             infini::SmallArray outputShape, int xSize, int ySize, int cSize) {

     int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
     if (outputIdx < outputsize) {
|
||||||
int inputYIdx =
|
int inputYIdx =
|
||||||
inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);
|
inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);
|
||||||
|
|
||||||
output[outputIdx] =
|
((T *)output)[outputIdx] = condition[conditionIdx]
|
||||||
condition[conditionIdx] ? inputX[inputXIdx] : inputY[inputYIdx];
|
? ((T *)inputX)[inputXIdx]
|
||||||
|
: ((T *)inputY)[inputYIdx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#define CASECurrency(T) \
|
||||||
|
_whereKernel<DT_CUDA<T>::t> \
|
||||||
|
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>( \
|
||||||
|
inputX, inputY, condition, output, nDims, outputsize, inputXShape, \
|
||||||
|
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
||||||
|
|
||||||
|
#define SWITCHCurrency_DTYPE(DTYPE) \
|
||||||
|
switch (DTYPE) { \
|
||||||
|
case 1: \
|
||||||
|
CASECurrency(1) break; \
|
||||||
|
case 2: \
|
||||||
|
CASECurrency(2) break; \
|
||||||
|
case 3: \
|
||||||
|
CASECurrency(3) break; \
|
||||||
|
case 4: \
|
||||||
|
CASECurrency(4) break; \
|
||||||
|
case 5: \
|
||||||
|
CASECurrency(5) break; \
|
||||||
|
case 6: \
|
||||||
|
CASECurrency(6) break; \
|
||||||
|
case 7: \
|
||||||
|
CASECurrency(7) break; \
|
||||||
|
case 10: \
|
||||||
|
CASECurrency(10) break; \
|
||||||
|
case 11: \
|
||||||
|
CASECurrency(11) break; \
|
||||||
|
case 12: \
|
||||||
|
CASECurrency(12) break; \
|
||||||
|
case 13: \
|
||||||
|
CASECurrency(13) break; \
|
||||||
|
case 16: \
|
||||||
|
CASECurrency(16) break; \
|
||||||
|
default: \
|
||||||
|
IT_TODO_HALT(); \
|
||||||
|
}
|
||||||
|
namespace infini {
|
||||||
|
|
||||||
|
void whereKernel(int dTypeIndex, void *inputX, void *inputY,
|
||||||
|
const uint8_t *condition, void *output, int a0, int a1, int a2,
|
||||||
|
int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
|
||||||
|
int c3, int d0, int d1, int d2, int d3) {
|
||||||
|
int blocksize;
|
||||||
|
int outputsize = d0 * d1 * d2 * d3;
|
||||||
|
if (outputsize > 511 * repeat) {
|
||||||
|
blocksize = 1024;
|
||||||
|
} else if (outputsize > 255 * repeat) {
|
||||||
|
blocksize = 512;
|
||||||
|
} else if (outputsize > 127 * repeat) {
|
||||||
|
blocksize = 256;
|
||||||
|
} else if (outputsize > 63 * repeat) {
|
||||||
|
blocksize = 128;
|
||||||
|
} else if (outputsize > 31 * repeat) {
|
||||||
|
blocksize = 64;
|
||||||
|
} else {
|
||||||
|
blocksize = 32;
|
||||||
|
}
|
||||||
|
int gridsize = (outputsize + repeat * blocksize - 1) / (repeat * blocksize);
|
||||||
|
|
||||||
|
SWITCH_DTYPE(dTypeIndex)
|
||||||
|
}
|
||||||
|
|
||||||
namespace infini {
|
void whereKernel(int dTypeIndex, void *inputX, void *inputY,
|
||||||
void whereKernel(const float *inputX, const float *inputY,
|
const uint8_t *condition, void *output, int nDims,
|
||||||
const uint8_t *condition, float *output, int nDims,
|
|
||||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
||||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
||||||
int ySize, int cSize) {
|
int ySize, int cSize) {
|
||||||
|
@ -61,34 +223,8 @@ void whereKernel(const float *inputX, const float *inputY,
|
||||||
blocksize = 32;
|
blocksize = 32;
|
||||||
}
|
}
|
||||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
||||||
_whereKernel<float>
|
|
||||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
SWITCHCurrency_DTYPE(dTypeIndex)
|
||||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
|
||||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
|
||||||
}
|
|
||||||
void whereKernel(const half *inputX, const half *inputY,
|
|
||||||
const uint8_t *condition, half *output, int nDims,
|
|
||||||
int outputsize, SmallArray inputXShape, SmallArray inputYShape,
|
|
||||||
SmallArray conditionShape, SmallArray outputShape, int xSize,
|
|
||||||
int ySize, int cSize) {
|
|
||||||
int blocksize;
|
|
||||||
if (outputsize > 511) {
|
|
||||||
blocksize = 1024;
|
|
||||||
} else if (outputsize > 255) {
|
|
||||||
blocksize = 512;
|
|
||||||
} else if (outputsize > 127) {
|
|
||||||
blocksize = 256;
|
|
||||||
} else if (outputsize > 63) {
|
|
||||||
blocksize = 128;
|
|
||||||
} else if (outputsize > 31) {
|
|
||||||
blocksize = 64;
|
|
||||||
} else {
|
|
||||||
blocksize = 32;
|
|
||||||
}
|
|
||||||
int gridsize = (outputsize + blocksize - 1) / blocksize;
|
|
||||||
_whereKernel<half>
|
|
||||||
<<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
|
|
||||||
inputX, inputY, condition, output, nDims, outputsize, inputXShape,
|
|
||||||
inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace infini
|
} // namespace infini
|
||||||
|
|
|
@@ -97,14 +97,11 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
         auto aDim = op->getInputs(0)->getDims();
         auto bSize = op->getInputs(1)->size();
         auto bDim = op->getInputs(1)->getDims();
+        auto dtype = op->getDType();

-        // op input a, b is scalar while aDim and b Dim is empty
         if (bDim.size() == 0) {
             bDim.push_back(1);
         }
-        if (aDim.size() == 0) {
-            aDim.push_back(1);
-        }

         if (aSize == bSize) {
             // Do ElementWise Sub with no broadcast
@@ -112,9 +109,23 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
                                               (float *)aData, (float *)bData,
                                               (float *)cData, aSize));
         } else {
-            checkKUNLUNError(xdnn::broadcast_div<float>(
-                context->KUNLUNHandle(), (float *)aData, (float *)bData,
-                (float *)cData, aDim, bDim));
+            // Do broadcast div
+            Shape aligned = infer_broadcast(aDim, bDim);
+            if (aligned == aDim) {
+                // BData need to be broadcasted
+                checkKUNLUNError(xdnn::broadcast_div<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)bData,
+                    (float *)cData, aDim, bDim));
+            } else {
+                // Use workspace to broadcast aData
+                KUNLUNPtr wks = context->getWorkspace(bSize * dtype.getSize());
+                checkKUNLUNError(xdnn::broadcast<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)wks, aDim,
+                    bDim));
+                checkKUNLUNError(xdnn::div<float>(context->KUNLUNHandle(),
+                                                  (float *)wks, (float *)bData,
+                                                  (float *)cData, bSize));
+            }
         }
         return;
     }
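Reviewer note (not part of the patch): the `DivXdnn` change decides which operand needs broadcasting by comparing the broadcast result shape with `aDim`: if they match, only `b` is broadcast inside `broadcast_div`; otherwise `a` is first expanded into a workspace and a plain element-wise divide follows. A minimal sketch of that shape comparison; `inferBroadcast` below is an illustrative stand-in for the repository's `infer_broadcast` and assumes the two shapes are broadcast-compatible.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

using Shape = std::vector<int>;

// NumPy-style broadcast of two shapes, right-aligned (compatibility assumed).
Shape inferBroadcast(const Shape &a, const Shape &b) {
    Shape out(std::max(a.size(), b.size()), 1);
    for (size_t i = 0; i < out.size(); ++i) {
        int da = i < out.size() - a.size() ? 1 : a[i - (out.size() - a.size())];
        int db = i < out.size() - b.size() ? 1 : b[i - (out.size() - b.size())];
        out[i] = std::max(da, db);
    }
    return out;
}

int main() {
    Shape a{4, 3}, b{1, 3};
    // Result equals aDim: only b needs broadcasting (broadcast_div path).
    assert(inferBroadcast(a, b) == a);
    Shape a2{1, 3}, b2{4, 3};
    // Result differs from aDim: a is expanded into a workspace first.
    assert(inferBroadcast(a2, b2) != a2);
    return 0;
}
```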
@@ -570,7 +570,6 @@ REGISTER_KERNEL(Device::KUNLUN, OpType::Reciprocal, ReciprocalXdnn,
 REGISTER_KERNEL(Device::KUNLUN, OpType::Reshape, CopyXdnn, "Reshape_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Flatten, CopyXdnn, "Flatten_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Identity, CopyXdnn, "Identity_xdnn");
-REGISTER_KERNEL(Device::KUNLUN, OpType::Squeeze, CopyXdnn, "Squeeze_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Abs, AbsXdnn, "Abs_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Atan, ATanXdnn, "Atan_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Log, LogXdnn, "Log_xdnn");
@@ -84,6 +84,17 @@ void test_whereFp16(
 }

 TEST(CUDA_WhereFp32, run) {
+    test_whereFp32(
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
+        Shape{2, 3, 1, 2}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
+        vector<float>{0., 1.,  2.,  0., 0., 0., 6.,  7.,  0., 9.,  10., 11.,
+                      0., 13., 14., 0., 0., 0., 18., 19., 0., 21., 22., 23.});
     test_whereFp32(
         Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
         Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},