memcopy instead of special kernel

add currency operator
modified expand
2024-05-06 14:49:39 +08:00 · 2024-04-10 15:01:22 +08:00 · 2024-04-10 11:16:54 +08:00 · 2024-04-10 10:51:35 +08:00 · 2024-04-10 10:17:45 +08:00
35 changed files with 834 additions and 1211 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,3 @@
 [submodule "example"]
 	path = examples/NNmodel
 	url = git@github.com:wanghailu0717/NNmodel.git
-[submodule "examples/distributed/onnxsim_large_model"]
-	path = examples/distributed/onnxsim_large_model
-	url = git@github.com:luchangli03/onnxsim_large_model.git
--- a/examples/distributed/README.md
+++ b/examples/distributed/README.md
@@ -1,7 +1,5 @@
 # 分布式脚本

-## 英伟达平台运行方式
-
 #### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx

 使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
@@ -17,23 +15,3 @@ python run_pytorch.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
 ```bash
 python cuda_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
 ```
-
-## 寒武纪平台运行方式
-
-**将上述运行脚本 `run_pytorch.py` 以及 `cuda_launch.py` 针对寒武纪平台做了相应的适配，具体见 `run_pytorch_mlu.py` 以及 `bang_launch.py`。**
-
-#### 1. 运行pytorch模型并生成输入和标准输出，可选择导出onnx
-
-使用 `--export_onnx` 设置导出onnx的目录，默认为当前路径 `./`，不使用这个flag则只进行计算和生成输入输出。
-
-```bash
-python run_pytorch_mlu.py --model gpt2  --batch_size 1  --length 1 --export_onnx ./
-```
-
-会在当前目录下生成输入输出文件`test_inputs.npy` 和 `test_results.npy`，目前只支持单一输入输出。
-
-#### 2. 运行InfiniTensor分布式脚本
-
-```bash
-python bang_launch.py --model "/XXX/XXX.onnx" --nproc_per_node 4 
-```
--- a/examples/distributed/init.py
+++ b/examples/distributed/init.py
--- a/examples/distributed/bang/run_pytorch_mlu.py
+++ b/examples/distributed/bang/run_pytorch_mlu.py
@@ -1,249 +0,0 @@
-import argparse
-import torch
-import torch_mlu
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import AlbertModel, AlbertConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import sys
-import os
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama", "albert"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], required=True, help="model data type"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.type
-    )
-
-
-def get_model(modelname):
-    match modelname:
-        case "albert":
-            model = AlbertModel.from_pretrained("albert/albert-base-v2")
-            voc_size = AlbertConfig().vocab_size
-        case "bert":
-            model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-            voc_size = BertConfig().vocab_size
-        case "gpt2":
-            model = GPT2Model.from_pretrained("GPT2")
-            voc_size = GPT2Config().vocab_size
-        case "opt":
-            model = OPTModel.from_pretrained("facebook/opt-125m")
-            voc_size = OPTConfig().vocab_size
-        case "llama":
-            model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-            voc_size = LlamaConfig().vocab_size
-        case _:
-            raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-def run_pytorch(torch_model, voc_size, batchsize, len, dtype="fp32"):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
-    np.save("./data/input_0", data)
-    inputs = torch.from_numpy(data).to("mlu")
-    torch_model = torch_model.to("mlu")
-    if dtype == "fp16":
-        torch_model = torch_model.half()
-
-    n_iter = 20
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.mlu.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.mlu.synchronize()
-            outputs = torch_model(inputs)
-            torch.mlu.synchronize()
-    torch.mlu.synchronize()
-    end = time.time()
-    
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    # torch.mlu.memory.empty_cache()
-    np.save("./data/output", np.array(outputs))
-    print("Save input & output into ./data.")
-
-
-def export_onnx(modelname, model, data, path, extern=False, dtype="fp32"):
-    data = data.to("mlu")
-    model = model.to("mlu")
-    if dtype == "fp16":
-        model = model.half()
-    torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-    if modelname != "llama":
-        # use onnxsim to simplify
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=['eliminate_duplicate_initializer'])
-        # onnx_model, check = simplify(onnx_model, skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            extern_path = extern_path.split("/")[-1]
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path,
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        # use third party tool to simplify llama
-        # reference: https://github.com/luchangli03/onnxsim_large_model/
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = path
-        if not out_model_path:
-            out_model_path = in_model_path[:-5] + ".sim.onnx"
-        if os.path.isdir(out_model_path):
-            out_model_path = os.path.join(out_model_path, os.path.basename(in_model_path))
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print(f"compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                    tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    torch.backends.mlu.matmul.allow_tf32 = False
-    torch.backends.cnnl.allow_tf32 = False
-    modelname, batchsize, seqlen, export_path, dtype = parse_args()
-    if dtype == "tf32":
-        torch.backends.mlu.matmul.allow_tf32 = True
-    else:
-        os.environ["CAMBRICON_TF32_OVERRIDE"] = "0"
-
-    model, voc_size = get_model(modelname)
-    if export_path is not None:
-        filename = "{}_{}_{}_{}.onnx".format(modelname, batchsize, seqlen, dtype)
-        path = os.path.join(export_path, filename)
-        if not os.path.exists(path):
-            param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-            export_onnx(modelname, model, param, path, True, dtype)
-        else:
-            print("Onnx path exists, skipping export.")
-
-    run_pytorch(model, voc_size, batchsize, seqlen, dtype)
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/bang/bang_launch.py
+++ b/examples/distributed/bang/bang_launch.py
@@ -1,39 +1,35 @@
-import sys
-sys.path.append('../')
-
 import argparse
 import os
 import time
 import multiprocessing as mp
 from pyinfinitensor.onnx import OnnxStub, backend
 import onnx
-from onnx.external_data_helper import convert_model_to_external_data
 from onnx.shape_inference import infer_shapes_path
 import numpy as np
 from parallel_opt import parallel_model

+
 def parse_args():
    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    parser.add_argument(
-        "--nproc_per_node", type=int, default=1, help="number of processes per node"
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
    )
    parser.add_argument(
        "--name", type=str, default="test", help="name of this instance."
    )
    parser.add_argument(
-        "--model", type=str, required=True, help="path to the ONNX model file."
+        "--model", type=str, default="/data/onnx_models/llama2/llama_bs1_seq1024.onnx", 
+        help="path to the ONNX model file."
    )
    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
    parser.add_argument("--length", type=int, default=1, help="sequence length.")
    parser.add_argument(
        "--gen_std",
+        default=False,
        action="store_true",
        help="whether to generate the standard results.",
    )
-    parser.add_argument(
-        "--type", type=str, choices=["fp32", "fp16", "tf32"], default="fp32", help="data type"
-    )
    args = parser.parse_args()
    print("arg setting: ", args)
    return (
@@ -44,46 +40,39 @@ def parse_args():
        args.batch_size,
        args.length,
        args.gen_std,
-        args.type,
    )


-def run_model(model, runtime, world_size=1, rank=0, n=10, data_type="default"):
-    stub = OnnxStub(model, runtime, matmul_compute_type=data_type)
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
    load_inputs(stub, world_size, rank)
    # stub.tune()
    stub.run()
    # get outputs
+    time.sleep(0.01)
    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()

    # bench
+    begin = time.time()
    for _ in range(n):
        stub.run()
-    begin = time.time()
-    for _ in range(n * 2):
-        stub.run()
    end = time.time()
-    avg_time = (end - begin) / (n * 2)
+    avg_time = (end - begin) / n
    print(f"average time: {avg_time}")
    return outputs

-def load_inputs(stub, world_size=1, rank=0):
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = np.load(f"./data/input_{i}.npy")
-        if all(x == y for x,y in zip(input.shape,tensor.shape())):
-            tensor.copyin_numpy(input)
-        else:
-            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])

-
-def run_and_compare(name, model, runtime, world_size=1, rank=0, data_type="default"):
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
    results = np.load(f"./data/output.npy")
-    outputs = run_model(model, runtime, world_size, rank, data_type=data_type)
-    print("outputs abs mean:", abs(outputs).mean())
-    print("max abs diff:", abs(outputs - results).max())
+    outputs = run_model(model, runtime, world_size, rank)
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
+

 def start_worker(
-    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto, data_type: str
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
 ):
    dist_name = name + "_dist"
    model = parallel_model(model, world_size, rank)
@@ -96,7 +85,7 @@ def start_worker(
        save_as_external_data=True,
        location=extern_path,
    )
-    #infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
    runtime = backend.BangRuntime(local_rank)
    # print("init comm")
    runtime.init_comm(
@@ -104,12 +93,13 @@ def start_worker(
        world_size,
        rank,
    )
-    run_and_compare(name, model, runtime, world_size, rank, data_type)
+    run_and_compare(name, model, runtime, world_size, rank)


-def start_single(name, model, data_type):
+def start_single(name, model):
    runtime = backend.BangRuntime(0)
-    run_and_compare(name, model, runtime, data_type=data_type)
+    run_and_compare(name, model, runtime)
+

 def generate_input_output(model):
    os.makedirs(os.path.dirname("./data/"), exist_ok=True)
@@ -142,36 +132,55 @@ def generate_input_output(model):
    np.save(f"./data/output", output)


+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(f"./data/input_{i}.npy")
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\n"
+          f"Max relative difference: {max_relative_diff}")
+    return max_absolute_diff, max_relative_diff
+
+
 def main():
-    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, data_type = parse_args()
-    data_type = "default" if data_type == "fp32" else data_type
-    
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std = parse_args()
+
    model = onnx.load(model_path)

    # generate standart output
    if gen_std:
-        print(f"generate standard data for {name}.")
-        # a small vocabulary size to fit all LLM.
-        generate_input_output(model)
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
        return

-    if nproc_per_node == 1:
-        # run single process.
-        # use standalone process to isolate bang.
-        print("run model by single MLU.")
-        # p = mp.Process(target=start_single, args=(name, model, data_type))
-        # p.start()
-        # p.join()
-        start_single(name, model, data_type)
-        return
+    # run single process.
+    # use standalone process to isolate bang.
+    print("run model by single MLU.")
+    p = mp.Process(target=start_single, args=(name, model))
+    p.start()
+    p.join()

    # run distributed parallel.
    world_size = nnodes * nproc_per_node
-    print(f"run model by {world_size} MLU in parallel.")
+    print(f"run model by {world_size} MLUs in parallel.")
    workers = [
        mp.Process(
            target=start_worker,
-            args=(name, world_size, rank, rank % nproc_per_node, model, data_type),
+            args=(name, world_size, rank, rank % nproc_per_node, model),
        )
        for rank in range(world_size)
    ]
--- a/examples/distributed/cuda/cuda_launch.py
+++ b/examples/distributed/cuda/cuda_launch.py
--- a/examples/distributed/kunlun/export_onnx.sh
+++ b/examples/distributed/kunlun/export_onnx.sh
@@ -1,14 +0,0 @@
- export HF_ENDPOINT=https://hf-mirror.com
-
-models=("bert" "gpt2" "llama")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" --export_onnx ../models/"$model" --export_only 
-        done
-    done
-done 
--- a/examples/distributed/kunlun/kunlun_launch.py
+++ b/examples/distributed/kunlun/kunlun_launch.py
@@ -1,280 +0,0 @@
-import sys
-sys.path.append('../')
-
-import argparse
-import os
-import time
-import multiprocessing as mp
-from pyinfinitensor.onnx import OnnxStub, backend
-import onnx
-from onnx.external_data_helper import convert_model_to_external_data
-from onnx.shape_inference import infer_shapes_path
-import numpy as np
-from parallel_opt import parallel_model
-from functools import wraps
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
-    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
-    parser.add_argument(
-        "--nproc_per_node", type=int, default=2, help="number of processes per node"
-    )
-    parser.add_argument(
-        "--name", type=str, choices=["gpt2", "bert", "llama"], help="name of model."
-    )
-    parser.add_argument(
-        "--model", type=str, default="", help="path to the ONNX model file."
-    )
-    parser.add_argument(
-        "--gen_std",
-        default=False,
-        action="store_true",
-        help="whether to generate the standard results.",
-    )
-    parser.add_argument(
-        "--run_single",
-        default=False,
-        action="store_true",
-        help="whether run model with single process with standard inputs"
-    )
-    parser.add_argument(
-        "--input_dir",
-        default="./",
-        help="path to save model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        default="./",
-        help="path to save model standard output"
-    )
-    parser.add_argument(
-        "--internal_model_dir",
-        default="./",
-        help="path to save internal onnx model for parallel run"
-    )
-    args = parser.parse_args()
-
-    # check path, mkdir if not exist
-    check_exists(args.input_dir)
-    check_exists(args.result_dir)
-    check_exists(args.internal_model_dir)
-
-    print("arg setting: ", args)
-    return (
-        args.num_nodes,
-        args.nproc_per_node,
-        args.name,
-        args.model,
-        args.gen_std,
-        args.run_single,
-        args.input_dir,
-        args.result_dir,
-        args.internal_model_dir
-    )
-
-
-"""
-utils function for this scripts
-"""
-def check_exists(path: str):
-    if not os.path.exists(path):
-        os.makedirs(path)
-
-def np_assert(base, test, rtol=1e-2, atol=1e-1):
-    # np.testing.assert_allclose(test, base, rtol, atol)
-    print("max abs diff:", abs(base - test).max())
-
-
-"""
-Perf wrapper, run function n times
-then average
-"""
-def perf_it(n):
-    def decorator(func):
-        @wraps(func)
-        def wrapper(*args, **kwargs):
-            # warmup
-            for _ in range(n):
-                func(*args, **kwargs)
-
-            t_total = 0
-            for _ in range(n):
-                t0 = time.time()
-                func(*args, **kwargs)
-                t1 = time.time()
-                t_total += t1 - t0
-            avg_time = (t_total) / n
-            print(f"Avg runtime of {n} time is {avg_time:.6f} seconds")
-            return avg_time
-        return wrapper
-    return decorator
-
-
-"""
-Run InfiniTensor model with Standard input
-check=True: check with standard output gen by pytorch
-perf=True: run n times to get avg time
-"""
-def run_model(task_name,
-              model,
-              runtime,
-              world_size=1,
-              rank=0,
-              n=10,
-              check=True,
-              perf=True):
-
-    stub = OnnxStub(model, runtime,
-                    use_naive_allocator=True \
-                    if task_name == "llama" else False)
-
-    # load in Onnx model inputs
-    def load_inputs(stub: OnnxStub):
-        # check exists
-        inputs = []
-        for i, (name, tensor) in enumerate(stub.inputs.items()):
-            input_path = os.path.join(input_dir, \
-                                f"{task_name}_input_{i}.npy")
-            print(input_path)
-            if os.path.exists(input_path):
-                input = np.load(input_path)
-            else :
-                raise KeyError(f"{i} th input of model not exists")
-            # check shape
-            if all(x == y for x,y in zip(input.shape, tensor.shape())):
-                tensor.copyin_numpy(input)
-            else:
-                tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
-
-    load_inputs(stub)
-    # stub.tune()
-    stub.run()
-    time.sleep(0.01)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-
-    # check output results with standard output
-    if check:
-        st_output_path = os.path.join(result_dir, \
-                                f"{task_name}_output.npy")
-        assert os.path.exists(st_output_path) , \
-                    "standard output not exists"
-        st_output = np.load(st_output_path)
-        if np.isnan(output).any():
-            print("Nan in output")
-            exit()
-        np_assert(st_output, output)
-
-    # perf
-    if perf:
-        @perf_it(n)
-        def perf_infinitensor(stub: OnnxStub):
-            stub.run()
-        perf_infinitensor(stub)
-
-    return output
-
-
-"""
-Start a worker in Parallel
-"""
-def start_worker(name: str,
-           world_size: int,
-           rank: int,
-           local_rank: int,
-           model: onnx.ModelProto):
-
-    dist_name = name + "_dist"
-    # partial a onnx model to world_size part
-    model = parallel_model(model, world_size, rank)
-    onnx.save(model, os.path.join(internal_model_dir, \
-                                    f"{dist_name}_rank{rank}.onnx"), save_as_external_data=True)
-    runtime = backend.KUNLUNRuntime(local_rank)
-    # print("init comm")
-    runtime.init_comm(
-        dist_name,
-        world_size,
-        rank,
-    )
-    run_model(name, model, runtime, world_size, rank)
-
-
-"""
-generate standard input/output with
-sigle card run
-"""
-def gen_standard(task_name: str, model: onnx.ModelProto):
-    runtime = backend.KUNLUNRuntime(0)
-    stub = OnnxStub(model, runtime)
-    position_id = 0
-    # generate random input for model
-    for i, (name, tensor) in enumerate(stub.inputs.items()):
-        input = tensor.copyout_numpy()
-        if np.issubdtype(input.dtype, np.integer):
-            if input.size == 1:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-            else:
-                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
-        elif input.dtype == np.bool_:
-            input = np.random.randint(0,2,size=input.shape) > 0
-        else:
-            if i == 0:
-                input = np.ones(input.shape).astype(input.dtype)
-                position_id = input.shape[-1] - 1
-            else:
-                input = np.random.rand(*input.shape).astype(input.dtype)
-        tensor.copyin_numpy(input)
-        np.save(os.path.join(input_dir, \
-                    f"{task_name}_input_{i}.npy"), input)
-    stub.run()
-    # print(stub.outputs)
-    output = next(stub.outputs.values().__iter__()).copyout_numpy()
-    if np.isnan(output).any():
-        print("Nan in output")
-        exit()
-    np.save(os.path.join(result_dir, f"{task_name}_output.npy"), output)
-
-
-def main():
-
-    global input_dir, result_dir, internal_model_dir
-
-    nnodes, nproc_per_node, task_name, \
-        model_path, gen_std, run_single, \
-            input_dir, result_dir, internal_model_dir = parse_args()
-
-    # load input onnx model
-    model = onnx.load(model_path)
-
-    # generate standart output
-    if gen_std:
-        print("Generate inputs and outputs.")
-        gen_standard(task_name, model)
-        return
-
-    if run_single:
-        print("Run model by one GPU card.")
-        runtime = backend.KUNLUNRuntime(0)
-        run_model(task_name, model, runtime)
-        return
-
-    # run distributed parallel.
-    world_size = nnodes * nproc_per_node
-    print(f"Run model by {world_size} GPU in parallel.")
-    workers = [
-        mp.Process(
-            target=start_worker,
-            args=(task_name, world_size, rank, rank % nproc_per_node, model),
-        )
-        for rank in range(world_size)
-    ]
-
-    for w in workers:
-        w.start()
-
-    for w in workers:
-        w.join()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/kunlun/launch.sh
+++ b/examples/distributed/kunlun/launch.sh
@@ -1,36 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("bert" "gpt2")
-batch_size=(1 32)
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            # run pytorch model
-            echo "Run pytorch $model with batch_size=$bs length=$len ."
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len" #> results/"$model"_"$bs"_"$len"_pytorch
-            for n in "${nproc[@]}"; do
-                # run infinitensor 
-                echo "Run $n parallel infinitensor "$model" with batch_size=$bs and length=$len ."
-                python kunlun_launch.py --name "$model" --model ../models/"$model"/"$model"_"$bs"_"$len".onnx --nproc_per_node=$n # >> results/"$model"_"$bs"_"$len"_infini 
-                # delete internal files
-                find ./ -type f -name "*.onnx" -delete
-                find ./ -type f -name "*.pb" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
--- a/examples/distributed/kunlun/llama_launch.sh
+++ b/examples/distributed/kunlun/llama_launch.sh
@@ -1,35 +0,0 @@
-export HF_ENDPOINT=https://hf-mirror.com
-
-# models=("bert" "gpt2" "llama")
-models=("llama")
-batch_size=(1 )
-seq_len=(100 500)
-nproc=(1 2 4)
-
-results_dir="results"
-
-if [ -d "$results_dir" ]; then
-    echo "directory ./$results_dir exists"
-else
-    mkdir -p "$results_dir"
-    echo "mkdir $results_dir, logs saved there"
-fi
-
-
-for model in "${models[@]}"; do
-    for bs in "${batch_size[@]}"; do
-        for len in "${seq_len[@]}"; do
-            echo "Run pytorch llama with batch_size="$bs" and length="$len""
-            python run_pytorch.py --model "$model" --batch_size "$bs" --length "$len"
-            for n in "${nproc[@]}"; do
-                    # run pytorch model
-                    echo "Run infinitensor llama with batch_size="$bs" and length="$len" and nproc="$n"."
-                    python kunlun_launch.py --name llama --model ../models/llama/llama_"$bs"_"$len"_fp32.onnx --nproc_per_node=$n
-                    # delete internal files
-                    find ./ -type f -name "*.onnx" -delete
-                    find ./ -type f -name "*0c" -delete
-            done
-            find ./ -type f -name "*.npy" -delete
-        done
-    done
-done
--- a/examples/distributed/kunlun/run_pytorch.py
+++ b/examples/distributed/kunlun/run_pytorch.py
@ -1,245 +0,0 @@
-import argparse
-import torch
-from transformers import BertModel, BertConfig
-from transformers import GPT2Model, GPT2Config
-from transformers import OPTModel, OPTConfig
-from transformers import LlamaModel, LlamaConfig
-import time
-import numpy as np
-import onnx
-import os
-import sys
-from onnx.external_data_helper import convert_model_to_external_data
-from onnxsim import simplify
-
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.allow_tf32 = False
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run pytorch gpt2/bert/opt and optionally export onnx.")
-    parser.add_argument(
-        "--model", type=str, choices=["gpt2", "bert", "opt", "llama"], required=True, help="model type"
-    )
-    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
-    parser.add_argument("--length", type=int, default=1, help="sequence length.")
-    parser.add_argument(
-        "--export_onnx",
-        type=str,
-        nargs="?",
-        default=None,
-        const="./",
-        help="whether and where to export onnx file",
-    )
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model input data"
-    )
-    parser.add_argument(
-        "--result_dir",
-        type=str,
-        default="./",
-        help="path to save pytorch model output data"
-    )
-    parser.add_argument(
-        "--export_only",
-        action="store_true"
-    )
-    args = parser.parse_args()
-    print("arg setting: ", args)
-    return (
-        args.model,
-        args.batch_size,
-        args.length,
-        args.export_onnx,
-        args.input_dir,
-        args.result_dir,
-        args.export_only
-    )
-
-
-def get_model(modelname):
-    if modelname == "bert":
-        model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, hidden_act="gelu_new") # erf is not impl by infini
-        voc_size = BertConfig().vocab_size
-    elif modelname == "gpt2":
-        model = GPT2Model.from_pretrained("gpt2")
-        voc_size = GPT2Config().vocab_size
-    elif modelname == "opt":
-        model = OPTModel.from_pretrained("./opt-125m")
-        voc_size = OPTConfig().vocab_size
-    elif modelname == "llama":
-        model = LlamaModel.from_pretrained("meta-llama/Llama-2-7b-hf")
-        voc_size = LlamaConfig().vocab_size
-    else :
-        raise KeyError(modelname)
-
-    model = model.eval()
-    return model, voc_size
-
-def run_pytorch(torch_model, voc_size, batchsize, len, model_name):
-    data = np.random.randint(0, voc_size, (batchsize, len), dtype=np.int32)
-    np.save(os.path.join(input_dir, f"{model_name}_input_0.npy"), data)
-    inputs = torch.from_numpy(data).to("cuda")
-    torch_model = torch_model.to("cuda")
-
-    n_iter = 10
-    with torch.no_grad():
-        for _ in range(10):
-            outputs = torch_model(inputs)
-    torch.cuda.synchronize()
-    begin = time.time()
-    with torch.no_grad():
-        for _ in range(n_iter):
-            torch.cuda.synchronize()
-            outputs = torch_model(inputs)
-            #
-            torch.cuda.synchronize()
-    torch.cuda.synchronize()
-    end = time.time()
-
-    avg_time = (end - begin) / n_iter
-    outputs = outputs.last_hidden_state.to("cpu")
-    print("outputs abs mean:", abs(np.array(outputs)).mean())
-    print(f"average time: {avg_time}")
-    torch.cuda.memory.empty_cache()
-    np.save(os.path.join(result_dir, f"{model_name}_output.npy"), \
-                                        np.array(outputs))
-    print(f"Save input & output as {model_name}_input_0.npy and {model_name}_output.npy")
-
-
-def export_onnx(model_name, model, data, path, extern=False):
-    # torch.onnx.export(model, data, path, verbose=False, do_constant_folding=True)
-
-    if model_name != "llama":
-        onnx_model = onnx.load(path)
-        onnx_model, check = simplify(onnx_model,
-                                 skipped_optimizers=['fuse_qkv', 'eliminate_duplicate_initializer'])
-                                 # skipped_optimizers=['fuse_qkv'])
-        assert check
-        add_value_info_for_constants(onnx_model)
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-        if extern:
-            extern_path = path.replace('.onnx', '.pb')
-            if os.path.exists(extern_path):
-                os.remove(extern_path)
-            convert_model_to_external_data(
-                onnx_model,
-                all_tensors_to_one_file=True,
-                location=extern_path.split("/")[-1],
-                size_threshold=1024,
-                convert_attribute=False,
-            )
-        onnx.save(onnx_model, path)
-    else:
-        sys.path.append("onnxsim_large_model")
-        from onnx_utils import set_onnx_input_shape
-        from compress_model import SIZE_1MB, compress_onnx_model, uncompress_onnx_model
-
-        in_model_path = path
-        out_model_path = in_model_path[:-5] + ".sim.onnx"
-
-        onnx_model = onnx.load(in_model_path)
-        print(f"load model from {in_model_path} success")
-
-        size_th_bytes = 1024 * 1024
-        onnx_model, removed_inits = compress_onnx_model(onnx_model, size_th_bytes=size_th_bytes)
-        print("compress model success")
-
-        onnx_model = set_onnx_input_shape(onnx_model, "")
-        tensor_size_threshold = f"1024KB"
-        skipped_optimizers = []
-        skipped_optimizers.append("eliminate_duplicate_initializer")
-        onnx_model, check = simplify(onnx_model, skipped_optimizers=skipped_optimizers,
-                                    tensor_size_threshold=tensor_size_threshold)
-        if not check:
-            raise ValueError(f"simplify compressed model {in_model_path} failed")
-
-        print(f"simplify model success")
-
-        onnx_model = uncompress_onnx_model(onnx_model, removed_inits)
-        print(f"uncompress model success")
-
-        add_value_info_for_constants(onnx_model)
-
-        onnx.save(onnx_model, out_model_path, save_as_external_data=True)
-
-
-def add_value_info_for_constants(model : onnx.ModelProto):
-    """
-    Currently onnx.shape_inference doesn't use the shape of initializers, so add
-    that info explicitly as ValueInfoProtos.
-    Mutates the model.
-    Args:
-        model: The ModelProto to update.
-    """
-    # All (top-level) constants will have ValueInfos before IRv4 as they are all inputs
-    if model.ir_version < 4:
-        return
-
-    def add_const_value_infos_to_graph(graph : onnx.GraphProto):
-        inputs = {i.name for i in graph.input}
-        existing_info = {vi.name: vi for vi in graph.value_info}
-        for init in graph.initializer:
-            # Check it really is a constant, not an input
-            if init.name in inputs:
-                continue
-
-            # The details we want to add
-            elem_type = init.data_type
-            shape = init.dims
-
-            # Get existing or create new value info for this constant
-            vi = existing_info.get(init.name)
-            if vi is None:
-                vi = graph.value_info.add()
-                vi.name = init.name
-
-            # Even though it would be weird, we will not overwrite info even if it doesn't match
-            tt = vi.type.tensor_type
-            if tt.elem_type == onnx.TensorProto.UNDEFINED:
-                tt.elem_type = elem_type
-            if not tt.HasField("shape"):
-                # Ensure we set an empty list if the const is scalar (zero dims)
-                tt.shape.dim.extend([])
-                for dim in shape:
-                    tt.shape.dim.add().dim_value = dim
-
-        # Handle subgraphs
-        for node in graph.node:
-            for attr in node.attribute:
-                # Ref attrs refer to other attrs, so we don't need to do anything
-                if attr.ref_attr_name != "":
-                    continue
-
-                if attr.type == onnx.AttributeProto.GRAPH:
-                    add_const_value_infos_to_graph(attr.g)
-                if attr.type == onnx.AttributeProto.GRAPHS:
-                    for g in attr.graphs:
-                        add_const_value_infos_to_graph(g)
-
-
-    return add_const_value_infos_to_graph(model.graph)
-
-
-def main():
-    global input_dir, result_dir
-
-    modelname, batchsize, seqlen, \
-        export_path, input_dir, result_dir, export_only = parse_args()
-
-    model, voc_size = get_model(modelname) # pytorch model
-
-    if export_path is not None:
-        os.makedirs(export_path, exist_ok=True)
-        filename = "{}_{}_{}.onnx".format(modelname, batchsize, seqlen)
-        path = os.path.join(export_path, filename)
-        param = torch.zeros((batchsize, seqlen), dtype=torch.int)
-        export_onnx(modelname, model, param, path, True) # export pytorch model to onnx model
-        if export_only:
-            return
-
-    run_pytorch(model, voc_size, batchsize, seqlen, modelname)
-
-if __name__ == "__main__":
-    main()
--- a/examples/distributed/launch_kunlun.py
+++ b/examples/distributed/launch_kunlun.py
@ -0,0 +1,213 @@
+import argparse
+import os
+import time
+import multiprocessing as mp
+from pyinfinitensor.onnx import OnnxStub, backend
+import onnx
+from onnx.external_data_helper import convert_model_to_external_data
+from onnx.shape_inference import infer_shapes_path
+import numpy as np
+from parallel_opt import parallel_model
+
+st_input_dir = "standard/inputs/"
+st_output_dir = "standard/outputs/"
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="launch distributed infinitensor")
+    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
+    parser.add_argument(
+        "--nproc_per_node", type=int, default=2, help="number of processes per node"
+    )
+    parser.add_argument(
+        "--name", type=str, default="test", help="name of this instance."
+    )
+    parser.add_argument(
+        "--model", type=str, default="/data1/shared/panzezhong/llama/fp32/my_llama_fp32.sim.onnx", help="path to the ONNX model file."
+    )
+    parser.add_argument("--batch_size", type=int, default=1, help="batch size.")
+    parser.add_argument("--length", type=int, default=1, help="sequence length.")
+    parser.add_argument(
+        "--gen_std",
+        default=False,
+        action="store_true",
+        help="whether to generate the standard results.",
+    )
+    parser.add_argument(
+        "--run_single",
+        default=False,
+        action="store_true",
+        help="whether run model with single process with standard inputs"
+    )
+    args = parser.parse_args()
+    print("arg setting: ", args)
+    return (
+        args.num_nodes,
+        args.nproc_per_node,
+        args.name,
+        args.model,
+        args.batch_size,
+        args.length,
+        args.gen_std,
+        args.run_single
+    )
+
+
+def run_model(model, runtime, world_size=1, rank=0, n=10):
+    stub = OnnxStub(model, runtime)
+    load_inputs(stub, world_size, rank)
+    # stub.tune()
+    stub.run()
+    # get outputs
+    time.sleep(0.01)
+    outputs = next(stub.outputs.values().__iter__()).copyout_numpy()
+
+    # bench
+    begin = time.time()
+    for _ in range(n):
+        stub.run()
+    end = time.time()
+    avg_time = (end - begin) / n
+    print(f"average time: {avg_time}")
+    return outputs
+
+
+
+def run_and_compare(name, model, runtime, world_size=1, rank = 0):
+    results = np.load(os.path.join(st_output_dir,f"output.npy"))
+    outputs = run_model(model, runtime, world_size, rank)
+    print(outputs[:100])
+    if np.isnan(outputs).any():
+        print("Nan in output")
+    print("answer argmax:", np.argmax(results))
+    print("output argmax:", np.argmax(outputs))
+    #np.testing.assert_allclose(outputs, results, rtol=1e-3, atol=1e-3)
+    getDiff(results, outputs)
+
+
+def start_worker(
+    name: str, world_size: int, rank: int, local_rank: int, model: onnx.ModelProto
+):
+    dist_name = name + "_dist"
+    model = parallel_model(model, world_size, rank)
+    extern_path = f"./{dist_name}_rank{rank}.pb"
+    if os.path.exists(extern_path):
+        os.remove(extern_path)
+    onnx.save_model(
+        model,
+        f"./{dist_name}_rank{rank}.onnx",
+        save_as_external_data=True,
+        location=extern_path,
+    )
+    infer_shapes_path(f"./{dist_name}_rank{rank}.onnx")
+    runtime = backend.KUNLUNRuntime(local_rank)
+    # print("init comm")
+    runtime.init_comm(
+        dist_name,
+        world_size,
+        rank,
+    )
+    run_and_compare(name, model, runtime, world_size, rank)
+
+
+def start_single(name, model):
+    runtime = backend.KUNLUNRuntime(0)
+    run_and_compare(name, model, runtime)
+
+
+def generate_input_output(model):
+    runtime = backend.KUNLUNRuntime(0)
+    stub = OnnxStub(model, runtime)
+    position_id = 0
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = tensor.copyout_numpy()
+        if np.issubdtype(input.dtype, np.integer):
+            if input.size == 1:
+                # input = np.array([position_id])
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+            else:
+                input = np.random.randint(0,2,size=input.shape, dtype=input.dtype)
+        elif input.dtype == np.bool_:
+            input = np.random.randint(0,2,size=input.shape) > 0
+        else:
+            if i == 0:
+                input = np.ones(input.shape).astype(input.dtype)
+                position_id = input.shape[-1] - 1
+            else:
+                input = np.random.rand(*input.shape).astype(input.dtype)
+        tensor.copyin_numpy(input)
+        np.save(os.path.join(st_input_dir, f"input_{i}"), input)
+    stub.run()
+    # print(stub.outputs)
+    time.sleep(0.01)
+    output = next(stub.outputs.values().__iter__()).copyout_numpy()
+    print(output[:100])
+    if np.isnan(output).any():
+        print("Nan in output")
+    np.save(os.path.join(st_output_dir, f"output"), output)
+
+
+def load_inputs(stub, world_size=1, rank=0):
+    for i, (name, tensor) in enumerate(stub.inputs.items()):
+        input = np.load(os.path.join(st_input_dir, f"input_{i}.npy"))
+        if all(x == y for x,y in zip(input.shape,tensor.shape())):
+            tensor.copyin_numpy(input)
+        else:
+            tensor.copyin_numpy(np.hsplit(input, world_size)[rank])
+
+
+def getDiff(base, test):
+    absolute_diff = np.abs(np.subtract(base, test))
+    max_absolute_diff = np.max(absolute_diff)
+
+    baseCopy = base.astype(np.float64).ravel()
+    testCopy = test.astype(np.float64).ravel()
+    upValue = np.sum(np.abs(baseCopy - testCopy))
+    downValue = np.sum(np.abs(baseCopy)) + np.float64(1e-9)
+    max_relative_diff = upValue / downValue
+    print(f"Max absolute difference: {max_absolute_diff}\nMax relative difference: {max_relative_diff}")
+
+    return max_absolute_diff, max_relative_diff
+
+
+def main():
+    nnodes, nproc_per_node, name, model_path, bs, length, gen_std, run_single = parse_args()
+
+    model = onnx.load(model_path)
+
+    # generate standart output
+    if gen_std:
+        print("Generate inputs and outputs.")
+        p = mp.Process(target=generate_input_output, args=[model])
+        p.start()
+        p.join()
+        return
+
+    # # run single process.
+    # # use standalone process to isolate cuda.
+    if run_single:
+        print("run model by single GPU.")
+        p = mp.Process(target=start_single, args=(name, model))
+        p.start()
+        p.join()
+        return 
+
+    # run distributed parallel.
+    world_size = nnodes * nproc_per_node
+    print(f"run model by {world_size} GPU in parallel.")
+    workers = [
+        mp.Process(
+            target=start_worker,
+            args=(name, world_size, rank, rank % nproc_per_node, model),
+        )
+        for rank in range(world_size)
+    ]
+
+    for w in workers:
+        w.start()
+
+    for w in workers:
+        w.join()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/distributed/cuda/launch_kvcache.py
+++ b/examples/distributed/cuda/launch_kvcache.py
--- a/examples/distributed/onnxsim_large_model
+++ b/examples/distributed/onnxsim_large_model
@ -1 +0,0 @@
-Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65
--- a/examples/distributed/parallel_opt.py
+++ b/examples/distributed/parallel_opt.py
@ -110,6 +110,7 @@ def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
                s_dim = 0
            elif in_plc.dim == 2:
                s_dim = 1
+
        assert s_dim != -1
        assert out_dims[s_dim] % tp_world_size == 0, out_dims
        out_dims[s_dim] //= tp_world_size
--- a/examples/distributed/cuda/run_pytorch.py
+++ b/examples/distributed/cuda/run_pytorch.py
--- a/include/cuda/cuda_expand.h
+++ b/include/cuda/cuda_expand.h
@ -3,10 +3,11 @@
 #include "operators/unary.h"
 #include "utils/small_array.h"
 namespace infini {
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3);
 void expandKernel(int dType, void *input, void *output, int nDims,
                  int outputsize, SmallArray inputShape,
                  SmallArray outputShape);
-
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                     int row_len);
 }; // namespace infini
--- a/include/cuda/cuda_where.h
+++ b/include/cuda/cuda_where.h
@ -1,16 +1,14 @@
 #pragma once
 #include "operators/unary.h"
 #include "utils/small_array.h"
-
 namespace infini {

-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize);
-void whereKernel(const half *inputX, const half *inputY,
-                 const uint8_t *condition, half *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int a0, int a1, int a2,
+                 int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                 int c3, int d0, int d1, int d2, int d3);
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize);
--- a/include/kunlun/kunlun_runtime.h
+++ b/include/kunlun/kunlun_runtime.h
@ -21,7 +21,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
        ctx = xdnn::create_context();
        // 10GB for Longformer
        // size_t longformerNum = 3lu * (1 << 30);
-        size_t workspaceSize = 2llu << 30; // 2 GB
+        size_t workspaceSize = 3llu << 30; // 3 GB
        KUNLUNPtr wkspacePtr = alloc(workspaceSize);
        workspace =
            make_ref<WorkspaceObj<KUNLUNPtr>>(wkspacePtr, workspaceSize);
@ -42,7 +42,7 @@ class KUNLUNRuntimeObj : public RuntimeObj {
    KUNLUNPtr alloc(size_t size) override {
        void *ptr;
        checkKUNLUNError(
-            xpu_malloc((void **)&ptr, size, XPUMemoryKind::XPU_MEM_HBM));
+            xpu_malloc_ex((void **)&ptr, size, XPUMemoryKind::XPU_MEM_MAIN));
        return ptr;
    }
    void dealloc(void *ptr) override { xpu_free(ptr); }
--- a/include/kunlun/xccl_communicator.h
+++ b/include/kunlun/xccl_communicator.h
@ -34,8 +34,8 @@ class XcclCommunicatorObj final : public CommunicatorObj {
            auto begin = std::chrono::steady_clock::now();
            while (!std::filesystem::exists(filePath)) {
                auto now = std::chrono::steady_clock::now();
-                _IT_ASSERT_2(now < begin + std::chrono::seconds(100),
-                             "time limit (100s) exceeded.");
+                _IT_ASSERT_2(now < begin + std::chrono::seconds(10),
+                             "time limit (10s) exceeded.");
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
            std::ifstream ifs(filePath, std::ios::binary);
--- a/pyinfinitensor/src/pyinfinitensor/onnx.py
+++ b/pyinfinitensor/src/pyinfinitensor/onnx.py
@ -967,7 +967,7 @@ class OnnxStub:
                    tensors[node.input[0]],
                    tensors.get(node.output[0]),
                )
-            elif node.op_type in ["Constant", "ConstantOfShape"]:
+            elif node.op_type == "Constant":
                output_name = node.output[0]
                attributes = _parse_attribute(node)
                tensor = attributes["value"]
--- a/src/kernels/bang/cast.cc
+++ b/src/kernels/bang/cast.cc
@ -199,24 +199,6 @@ class CastCnnl : public BangKernelWithoutConfig {
                                                   dim.data()));
            NlCastType = CNNL_CAST_UINT32_TO_INT64;
            break;
-        case CastType::Float162Float:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_HALF_TO_FLOAT;
-            break;
-        case CastType::Float2Float16:
-            checkCnnlError(cnnlSetTensorDescriptor(aDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_FLOAT, dim.size(),
-                                                   dim.data()));
-            checkCnnlError(cnnlSetTensorDescriptor(cDesc, CNNL_LAYOUT_NCHW,
-                                                   CNNL_DTYPE_HALF, dim.size(),
-                                                   dim.data()));
-            NlCastType = CNNL_CAST_FLOAT_TO_HALF;
-            break;
        default:
            IT_TODO_HALT();
        }
--- a/src/kernels/bang/layer_norm.cc
+++ b/src/kernels/bang/layer_norm.cc
@ -19,16 +19,14 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());

        auto inDims = op->getInputs(0)->getDims();
-        auto fiterDims = op->getInputs(1)->getDims();
        auto outDims = op->getOutput()->getDims();
+        auto fiterDims = op->getOutput(1)->getDims();

        float eps = op->getEps();
        const int axis = op->getAxis();

-        Shape outMeanDims(outDims);
-        outMeanDims.erase(outMeanDims.begin() + axis);
+        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc;

-        cnnlTensorDescriptor_t inDesc, fiterDesc, outDesc, outMeanDesc;
        checkCnnlError(cnnlCreateTensorDescriptor(&inDesc));
        checkCnnlError(cnnlSetTensorDescriptor(
            inDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
@ -41,23 +39,15 @@ class LayerNormCnnl : public BangKernelWithoutConfig {
        checkCnnlError(cnnlSetTensorDescriptor(
            outDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
            outDims.size(), outDims.data()));
-        checkCnnlError(cnnlCreateTensorDescriptor(&outMeanDesc));
-        checkCnnlError(cnnlSetTensorDescriptor(
-            outMeanDesc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(op->getDType()),
-            outMeanDims.size(), outMeanDims.data()));
        size_t wsSize;
        cnnlGetLayerNormOpWorkspaceSize(context->cnnlHandle(), axis, inDesc,
                                        &wsSize);
        BangPtr wsData = context->getWorkspace(wsSize);
-        size_t meanSize =
-            cnnlGetTensorElementNum(outMeanDesc) * op->getDType().getSize();
-        BangPtr meanData = context->getWorkspace(meanSize);
-        BangPtr rstdData = context->getWorkspace(meanSize);

        cnnlStatus_t stat = cnnlLayerNormForward(
            context->cnnlHandle(), inDesc, inputData, axis, fiterDesc,
            scaleData, biasData, eps, wsData, wsSize, outDesc, outputData,
-            outMeanDesc, meanData, rstdData);
+            inDesc, NULL, NULL);

        if (stat != CNNL_STATUS_SUCCESS)
            return;
--- a/src/kernels/bang/matmul.cc
+++ b/src/kernels/bang/matmul.cc
@ -66,13 +66,6 @@ class MatmulCnnl : public BangKernelWithoutConfig {
        cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_DESC_TRANSB, &transB,
                              sizeof(int32_t));

-        std::string computeTypeStr = op->getComputeType();
-        if (computeTypeStr == "tf32") {
-            int32_t tf32 = 1;
-            cnnlSetMatMulDescAttr(bmm_desc, CNNL_MATMUL_ALLOW_TF32, &tf32,
-                                  sizeof(int32_t));
-        }
-
        cnnlMatMulAlgo_t bmm_algo;
        cnnlMatMulAlgoCreate(&bmm_algo);

--- a/src/kernels/cuda/element_wise.cu
+++ b/src/kernels/cuda/element_wise.cu
@ -5,34 +5,42 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
-
+const int repeat = 1;
 template <class T>
 __global__ void _div_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                            int a3, int b0, int b1, int b2, int b3, int c0,
                            int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;

-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);

-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {

-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] /
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] / ((T *)y)[yIdx];
    }
 }

@ -41,28 +49,36 @@ __global__ void _add_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                            int a3, int b0, int b1, int b2, int b3, int c0,
                            int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;

-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);

-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {

-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] = ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                               a2_index * a3 + a3_index] +
-                      ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                               b2_index * b3 + b3_index];
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = ((T *)x)[xIdx] + ((T *)y)[yIdx];
    }
 }

@ -71,29 +87,36 @@ __global__ void _pow_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                            int a3, int b0, int b1, int b2, int b3, int c0,
                            int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;

-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);

-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {

-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((T *)z)[i] =
-            pow(((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                         a2_index * a3 + a3_index],
-                ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                         b2_index * b3 + b3_index]);
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((T *)z)[i] = pow(((T *)x)[xIdx], ((T *)y)[yIdx]);
    }
 }

@ -102,31 +125,36 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
                             int a3, int b0, int b1, int b2, int b3, int c0,
                             int c1, int c2, int c3) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
-    int stride = blockDim.x * gridDim.x;
-    int n = c0 * c1 * c2 * c3;

-    for (int i = index; i < n; i += stride) {
-        int c0_index = i / (c1 * c2 * c3);
-        int c1_index = (i % (c1 * c2 * c3)) / (c2 * c3);
-        int c2_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) / c3;
-        int c3_index = ((i % (c1 * c2 * c3)) % (c2 * c3)) % c3;
+    int stride1 = c2 * c3;
+    int stride0 = c1 * stride1;
+    int n = c0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int yIdx = (b0 * b1 * b2 * b3 == n ? i : 0);

-        int a0_index = c0_index % a0;
-        int a1_index = c1_index % a1;
-        int a2_index = c2_index % a2;
-        int a3_index = c3_index % a3;
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        if (aIdx || bIdx) {
+            int c0_index = i / stride0;
+            int c1_index = (i % stride0) / stride1;
+            int c2_index = (i % stride1) / c3;
+            int c3_index = i % c3;
+            if (aIdx) {

-        int b0_index = c0_index % b0;
-        int b1_index = c1_index % b1;
-        int b2_index = c2_index % b2;
-        int b3_index = c3_index % b3;
-        ((bool *)z)[i] =
-            ((T *)x)[a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
-                     a2_index * a3 + a3_index] <
-                    ((T *)y)[b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
-                             b2_index * b3 + b3_index]
-                ? true
-                : false;
+                xIdx = (c0_index % a0) * a1 * a2 * a3 +
+                       (c1_index % a1) * a2 * a3 + (c2_index % a2) * a3 +
+                       c3_index % a3;
+            }
+            if (bIdx) {
+
+                yIdx = (c0_index % b0) * b1 * b2 * b3 +
+                       (c1_index % b1) * b2 * b3 + (c2_index % b2) * b3 +
+                       c3_index % b3;
+            }
+        }
+        ((bool *)z)[i] = ((T *)x)[xIdx] < ((T *)y)[yIdx] ? true : false;
    }
 }

@ -176,7 +204,6 @@ __global__ void _less_kernel(void *x, void *y, void *z, int a0, int a1, int a2,
    default:                                                                   \
        IT_TODO_HALT();                                                        \
    }
-
 template <class T>
 __global__ void _div_const_kernel(void const *__restrict__ x,
                                  void const *__restrict__ y,
@ -269,7 +296,8 @@ void div_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
    SWITCH_DTYPE(div, dType)
 }
 void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@ -278,7 +306,8 @@ void add_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,

    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
    SWITCH_DTYPE(add, dType)
 }
 void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
@ -286,7 +315,8 @@ void pow_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
    if (dType == 1) {
        _pow_kernel<float>
            <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
@ -324,7 +354,8 @@ void less_kernel(int dType, void *a, void *b, void *c, int a0, int a1, int a2,
                 int c3) {
    int blocksize = block_work_size();
    int num = c0 * c1 * c2 * c3;
-    int gridsize = (num + block_work_size() - 1) / block_work_size();
+    int gridsize =
+        (num + repeat * block_work_size() - 1) / (repeat * block_work_size());
    SWITCH_DTYPE(less, dType)
 }

--- a/src/kernels/cuda/expand.cc
+++ b/src/kernels/cuda/expand.cc
@ -12,22 +12,33 @@ class ExpandCuda : public CudaKernelWithoutConfig {

        void *const inputData = (op->getInputs(0)->getRawDataPtr<void *>());
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &in_Shape = op->getInputs(0)->getDims(); // input shape
-        const auto &out_Shape = op->getShape();             // output shape
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getOutput()->getDims(); // output shape

-        SmallArray inputShape, outputShape;
-        int nDims = op->getInputs(0)->getDims().size();
-
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1; // the length of the output vector after flatten
-        for (int i = 0; i < nDims; ++i) {
-            outputShape.data[i] = out_Shape[i];
-            inputShape.data[i] = in_Shape[i];
-            outputsize *= out_Shape[i];
-        }
        const int dType = op->getDType().getIndex();
-        expandKernel(dType, inputData, outputData, nDims, outputsize,
-                     inputShape, outputShape);
+        // Shapes of rank <= 4 use the specialised 4-D kernel; anything larger
+        // falls back to the general SmallArray-based kernel.
+        if (a_dim.size() > 4 || b_dim.size() > 4) {
+            SmallArray inputShape, outputShape;
+            // Work in the output rank and right-align the input shape with
+            // leading 1s, as the matmul bias path does, so an input of lower
+            // rank than the output still yields the full output size.
+            const int nDims = static_cast<int>(b_dim.size());
+            const int offset = nDims - static_cast<int>(a_dim.size());
+
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            IT_ASSERT(offset >= 0);
+            int outputsize = 1; // the length of the output vector after flatten
+            for (int i = 0; i < nDims; ++i) {
+                outputShape.data[i] = b_dim[i];
+                inputShape.data[i] = i < offset ? 1 : a_dim[i - offset];
+                outputsize *= b_dim[i];
+            }
+            expandKernel(dType, inputData, outputData, nDims, outputsize,
+                         inputShape, outputShape);
+
+        } else {
+            // Right-align both shapes into four extents, padding with 1s.
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            expandKernel(dType, inputData, outputData, a[0], a[1], a[2], a[3],
+                         b[0], b[1], b[2], b[3]);
+        }
    }
 };

--- a/src/kernels/cuda/expand.cu
+++ b/src/kernels/cuda/expand.cu
@ -6,7 +6,31 @@
 constexpr unsigned int num_threads() { return 32 * 4; }
 constexpr int thread_work_size() { return 4; }
 constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+const int repeat = 1;
+// Broadcast-copy kernel for shapes given as four extents each.
+// (a0..a3) is the input shape and (b0..b3) the output shape; the output holds
+// n = b0 * b1 * b2 * b3 elements. Each thread writes `repeat` consecutive
+// output elements starting at repeat * index, clamped to n.
+// Input index chosen for output element i:
+//   - input has exactly n elements: input[i];
+//   - input has more than 1 but fewer than n elements: i is split into 4-D
+//     output coordinates and each one is wrapped with the matching input
+//     extent (coordinate % a_k);
+//   - otherwise (e.g. a single-element input): input[0].
+// NOTE(review): the modulo mapping presumably expects every a_k to be 1 or
+// equal to b_k -- confirm against callers.
+template <class T>
+__global__ void _expandKernel(void *input, void *output, int a0, int a1, int a2,
+                              int a3, int b0, int b1, int b2, int b3) {

+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+
+    int stride1 = b2 * b3;
+    int stride0 = b1 * stride1;
+    int n = b0 * stride0;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int xIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        if (aIdx) {
+            int b0_index = i / stride0;
+            int b1_index = (i % stride0) / stride1;
+            int b2_index = (i % stride1) / b3;
+            int b3_index = i % b3;
+            xIdx = (b0_index % a0) * a1 * a2 * a3 + (b1_index % a1) * a2 * a3 +
+                   (b2_index % a2) * a3 + b3_index % a3;
+        }
+        ((T *)output)[i] = ((T *)input)[xIdx];
+    }
+}
 template <class T>
 __global__ void _expandKernel(void *input, void *output, int nDims,
                              int outputsize, infini::SmallArray inputShape,
@ -38,7 +62,6 @@ __global__ void _expandKernel(void *input, void *output, int nDims,
        ((T *)output)[outputIdx] = ((T *)input)[inputIdx];
    }
 }
-
 template <class T>
 static __global__ void _expandRowKernel(void *__restrict__ dst,
                                        void const *__restrict__ src) {
@ -50,9 +73,9 @@ static __global__ void _expandRowKernel(void *__restrict__ dst,
 namespace infini {

 #define CASE(T)                                                                \
-    _expandKernel<DT_CUDA<T>::t><<<gridsize, blocksize,                        \
-        0, CUDAStream::getCurrentStream()>>>(                                  \
-        input, output, nDims, outputsize, inputShape, outputShape);
+    _expandKernel<DT_CUDA<T>::t>                                               \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            input, output, a0, a1, a2, a3, b0, b1, b2, b3);

 #define SWITCH_DTYPE(DTYPE)                                                    \
    switch (DTYPE) {                                                           \
@ -96,14 +119,56 @@ namespace infini {
        IT_TODO_HALT();                                                        \
    }

+// Host launcher for the four-extent expand kernel. (a0..a3) is the input
+// shape and (b0..b3) the output shape. Launches block_work_size() threads per
+// block and, by ceiling division, enough blocks that with `repeat` elements
+// per thread all b0 * b1 * b2 * b3 output elements are covered. SWITCH_DTYPE
+// selects the kernel instantiation from the runtime dType index.
+void expandKernel(int dType, void *input, void *output, int a0, int a1, int a2,
+                  int a3, int b0, int b1, int b2, int b3) {
+    int blocksize = block_work_size();
+    int outputsize = b0 * b1 * b2 * b3;
+    int gridsize = (outputsize + repeat * block_work_size() - 1) /
+                   (repeat * block_work_size());
+    SWITCH_DTYPE(dType)
+}
+// Launch helpers for the general (SmallArray-shaped) _expandKernel overload.
+// CASECurrency(T) launches the instantiation for data-type index T using the
+// gridsize, blocksize, and kernel arguments that are in scope where the macro
+// is expanded. SWITCHCurrency_DTYPE dispatches on the runtime index and calls
+// IT_TODO_HALT() for any index without a case.
+#define CASECurrency(T)                                                        \
+    _expandKernel<DT_CUDA<T>::t>                                               \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            input, output, nDims, outputsize, inputShape, outputShape);
+
+#define SWITCHCurrency_DTYPE(DTYPE)                                            \
+    switch (DTYPE) {                                                           \
+    case 1:                                                                    \
+        CASECurrency(1) break;                                                 \
+    case 2:                                                                    \
+        CASECurrency(2) break;                                                 \
+    case 3:                                                                    \
+        CASECurrency(3) break;                                                 \
+    case 4:                                                                    \
+        CASECurrency(4) break;                                                 \
+    case 5:                                                                    \
+        CASECurrency(5) break;                                                 \
+    case 6:                                                                    \
+        CASECurrency(6) break;                                                 \
+    case 7:                                                                    \
+        CASECurrency(7) break;                                                 \
+    case 10:                                                                   \
+        CASECurrency(10) break;                                                \
+    case 11:                                                                   \
+        CASECurrency(11) break;                                                \
+    case 12:                                                                   \
+        CASECurrency(12) break;                                                \
+    case 13:                                                                   \
+        CASECurrency(13) break;                                                \
+    case 16:                                                                   \
+        CASECurrency(16) break;                                                \
+    default:                                                                   \
+        IT_TODO_HALT();                                                        \
+    }
+
+// Host launcher for the general expand kernel, with shapes passed as
+// SmallArray values. Launches ceil(outputsize / block_work_size()) blocks of
+// block_work_size() threads and dispatches on dType via
+// SWITCHCurrency_DTYPE.
 void expandKernel(int dType, void *input, void *output, int nDims,
                   int outputsize, SmallArray inputShape,
                   SmallArray outputShape) {
     int blocksize = block_work_size();
     int gridsize = (outputsize + block_work_size() - 1) / block_work_size();
-    SWITCH_DTYPE(dType)
+    SWITCHCurrency_DTYPE(dType)
 }
-
 #define CASE_ROW(T)                                                            \
    _expandRowKernel<float>                                                    \
        <<<grid, block, 0, CUDAStream::getCurrentStream()>>>(output, input);
@ -150,7 +215,8 @@ void expandKernel(int dType, void *input, void *output, int nDims,
        IT_TODO_HALT();                                                        \
    }

-// Optimization for expanding a row vector. The row length must be a multiple of 32
+// Optimization for expanding a row vector. The row length must be a multiple of
+// 32
 void expandRowKernel(int dType, void *input, void *output, int n_rows,
                     int row_len) {
    // Factorize row_len: row_len = a x b x 32 (32 is the warp size), b<=32
@ -160,7 +226,8 @@ void expandRowKernel(int dType, void *input, void *output, int n_rows,
    // block: b x 32
    auto c = row_len / 32, b = c;
    if (b > 32) {
-        for (b = 32; c % b != 0; --b);
+        for (b = 32; c % b != 0; --b)
+            ;
    }
    auto a = c / b;
    dim3 grid(a, n_rows), block(32, b);
--- a/src/kernels/cuda/matmul.cc
+++ b/src/kernels/cuda/matmul.cc
@ -87,20 +87,7 @@ class matmulCublas : public Kernel {
            beta_naive = 1.f;
            auto inC = op->getInputs(2);
            auto out = op->getOutput();
-            SmallArray inputShape, outputShape;
-            int nDims = out->getRank();
-            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-            // FIXME(constroy): use size_t for outputsize.
-            int outputsize = 1; // the length of the output vector after flatten
-            int offset = nDims - inC->getRank();
-            for (int i = 0; i < offset; ++i)
-                inputShape.data[i] = 1;
-            for (int i = 0; i < nDims; ++i) {
-                outputShape.data[i] = out->getDims()[i];
-                outputsize *= outputShape.data[i];
-                if (i >= offset)
-                    inputShape.data[i] = inC->getDims()[i - offset];
-            }
+
            const int dType = dataType.getIndex();

            // Bias in linear layer is row vector of (1,n), n is the number of
@ -111,9 +98,40 @@ class matmulCublas : public Kernel {
                                out->size() / inC->getDims()[0],
                                inC->getDims()[0]);
            } else {
-                expandKernel(dType, inC->getRawDataPtr<void *>(),
-                             out->getRawDataPtr<void *>(), nDims, outputsize,
-                             inputShape, outputShape);
+                // Broadcast the bias inC into out before the GEMM accumulates
+                // onto it. The 4-D expandKernel overload takes the input
+                // extents first (a*) and the output extents second (b*), so
+                // a_dim must describe inC and b_dim must describe out.
+                auto a_dim = inC->getDims(); // bias (input) shape
+                auto b_dim = out->getDims(); // output shape
+
+                if (a_dim.size() > 4 || b_dim.size() > 4) {
+                    // General path: right-align the bias shape against the
+                    // output rank, padding the leading axes with 1.
+                    SmallArray inputShape, outputShape;
+                    int nDims = out->getRank();
+                    IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+                    // FIXME(constroy): use size_t for outputsize.
+                    int outputsize =
+                        1; // the length of the output vector after flatten
+                    int offset = nDims - inC->getRank();
+                    for (int i = 0; i < offset; ++i)
+                        inputShape.data[i] = 1;
+                    for (int i = 0; i < nDims; ++i) {
+                        outputShape.data[i] = out->getDims()[i];
+                        outputsize *= outputShape.data[i];
+                        if (i >= offset)
+                            inputShape.data[i] = inC->getDims()[i - offset];
+                    }
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), nDims,
+                                 outputsize, inputShape, outputShape);
+
+                } else {
+                    // Rank <= 4: right-align both shapes into four extents.
+                    int a[4] = {1, 1, 1, 1};
+                    int b[4] = {1, 1, 1, 1};
+                    std::copy(a_dim.begin(), a_dim.end(),
+                              a + (4 - a_dim.size()));
+                    std::copy(b_dim.begin(), b_dim.end(),
+                              b + (4 - b_dim.size()));
+                    expandKernel(dType, inC->getRawDataPtr<void *>(),
+                                 out->getRawDataPtr<void *>(), a[0], a[1], a[2],
+                                 a[3], b[0], b[1], b[2], b[3]);
+                }
            }
        }
        // TODO:use compute type
--- a/src/kernels/cuda/transpose.cc
+++ b/src/kernels/cuda/transpose.cc
@ -16,31 +16,57 @@ class TransposeCuda : public CudaKernelWithoutConfig {
        void *const outputData = output->getRawDataPtr<void *>();
        const auto &inputShape = input->getDims();
        const auto &outputShape = output->getDims();
-
-        const auto &perm = op->getPermute();
+        const int dType = op->getDType().getIndex();
        int size = input->size();
        int nDims = input->getDims().size();
-
-        // Compute strides
-        SmallArray strides, buffer;
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int curStride = 1;
-        for (int i = nDims - 1; i >= 0; --i) {
-            buffer.data[i] = curStride;
-            curStride *= inputShape[i];
-        }
-        for (int i = 0; i < nDims; ++i) {
-            strides.data[i] = buffer.data[perm[i]];
+        // Fast path: a permutation that only moves size-1 axes leaves the
+        // element order in memory unchanged, so a plain device-to-device copy
+        // is enough. This must be decided from the permutation itself:
+        // comparing the non-unit extents of the two shapes would also accept
+        // a real transpose between equally sized axes (e.g. [N, N] with perm
+        // (1, 0)). Output axis i reads input axis perm[i], so the copy is
+        // valid only if the non-unit input axes appear in increasing order.
+        const auto &perm = op->getPermute();
+        bool condition = true;
+        int lastAxis = -1;
+        for (int i = 0; i < nDims; ++i) {
+            const int axis = perm[i];
+            if (inputShape[axis] > 1) {
+                if (axis < lastAxis) {
+                    condition = false;
+                    break;
+                }
+                lastAxis = axis;
+            }
+        }
+        if (condition) {
+            cudaMemcpyAsync(outputData, inputData, op->getInputs(0)->getBytes(),
+                            cudaMemcpyDeviceToDevice,
+                            CUDAStream::getCurrentStream());

-        SmallArray outputDims;
-        for (int i = 0; i < nDims; ++i) {
-            outputDims.data[i] = outputShape[i];
-        }
+        } else {

-        const int dType = op->getDType().getIndex();
-        transpose_kernel(dType, inputData, outputData, nDims, size, strides,
-                         outputDims);
+            // Compute strides
+            SmallArray strides, buffer;
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int curStride = 1;
+            for (int i = nDims - 1; i >= 0; --i) {
+                buffer.data[i] = curStride;
+                curStride *= inputShape[i];
+            }
+            for (int i = 0; i < nDims; ++i) {
+                strides.data[i] = buffer.data[perm[i]];
+            }
+
+            SmallArray outputDims;
+            for (int i = 0; i < nDims; ++i) {
+                outputDims.data[i] = outputShape[i];
+            }
+
+            transpose_kernel(dType, inputData, outputData, nDims, size, strides,
+                             outputDims);
+        }
    }
 };

--- a/src/kernels/cuda/transpose.cu
+++ b/src/kernels/cuda/transpose.cu
@ -24,8 +24,8 @@ __global__ void _transpose_kernel(void *input, void *output, int nDims,
 }
 #define CASE(T)                                                                \
    _transpose_kernel<DT_CUDA<T>::t>                                           \
-        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>           \
-        (input, output, nDims, size, strides, outputShape);
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            input, output, nDims, size, strides, outputShape);

 #define SWITCH_DTYPE(DTYPE)                                                    \
    switch (DTYPE) {                                                           \
--- a/src/kernels/cuda/where.cc
+++ b/src/kernels/cuda/where.cc
@ -1,8 +1,8 @@
 #include "operators/where.h"
 #include "cuda/cuda_kernel_wihtout_config.h"
 #include "cuda/cuda_runtime.h"
+#include "cuda/cuda_utility.h"
 #include "cuda/cuda_where.h"
-#include "utils/operator_utils.h"

 namespace infini {

@ -15,39 +15,50 @@ class WhereCuda : public CudaKernelWithoutConfig {
        void *const inputYData = (op->getInputs(1)->getRawDataPtr<void *>());
        void *const conditionData = (op->getInputs(2)->getRawDataPtr<void *>());
        void *const outputData = (op->getOutput()->getRawDataPtr<void *>());
-        const auto &opInputXShape = op->getInputs(0)->getDims();
-        const auto &opInputYShape = op->getInputs(1)->getDims();
-        const auto &opConditionShape = op->getInputs(2)->getDims();
-        const auto &opOutputShape = op->getOutput()->getDims();

-        const int xSize = op->getInputs(0)->getRank();
-        const int ySize = op->getInputs(1)->getRank();
-        const int cSize = op->getInputs(2)->getRank();
+        auto a_dim = op->getInputs(0)->getDims();
+        auto b_dim = op->getInputs(1)->getDims();
+        auto c_dim = op->getInputs(2)->getDims();
+        auto d_dim = op->getOutput()->getDims();
+        const int dTypeIndex = op->getDType().getIndex();
+        if (a_dim.size() > 4 || b_dim.size() > 4 || c_dim.size() > 4 ||
+            d_dim.size() > 4) {
+            const int xSize = op->getInputs(0)->getRank();
+            const int ySize = op->getInputs(1)->getRank();
+            const int cSize = op->getInputs(2)->getRank();

-        int nDims = op->getOutput()->getDims().size();
-        IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
-        int outputsize = 1;
-        SmallArray inputXShape, inputYShape, conditionShape, outputShape;
-        for (int i = nDims - 1; i >= 0; --i) {
-            outputShape.data[i] = opOutputShape[i];
-            outputsize *= outputShape.data[i];
+            int nDims = op->getOutput()->getDims().size();
+            IT_ASSERT(nDims <= SMALL_ARRAY_SIZE);
+            int outputsize = 1;
+            SmallArray inputXShape, inputYShape, conditionShape, outputShape;
+            for (int i = nDims - 1; i >= 0; --i) {
+                outputShape.data[i] = d_dim[i];
+                outputsize *= outputShape.data[i];
+            }
+            broadcastShape(a_dim, inputXShape, nDims, xSize);
+            broadcastShape(b_dim, inputYShape, nDims, ySize);
+            broadcastShape(c_dim, conditionShape, nDims, cSize);
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, nDims, outputsize,
+                        inputXShape, inputYShape, conditionShape, outputShape,
+                        xSize, ySize, cSize);
        }
-        broadcastShape(opInputXShape, inputXShape, nDims, xSize);
-        broadcastShape(opInputYShape, inputYShape, nDims, ySize);
-        broadcastShape(opConditionShape, conditionShape, nDims, cSize);

-        if (op->getDType() == DataType::Float32) {
-            whereKernel((float *)inputXData, (float *)inputYData,
-                        (uint8_t *)conditionData, (float *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else if (op->getDType() == DataType::Float16) {
-            whereKernel((half *)inputXData, (half *)inputYData,
-                        (uint8_t *)conditionData, (half *)outputData, nDims,
-                        outputsize, inputXShape, inputYShape, conditionShape,
-                        outputShape, xSize, ySize, cSize);
-        } else {
-            IT_ASSERT(false);
+        else {
+            int a[4] = {1, 1, 1, 1};
+            int b[4] = {1, 1, 1, 1};
+            int c[4] = {1, 1, 1, 1};
+            int d[4] = {1, 1, 1, 1};
+
+            std::copy(a_dim.begin(), a_dim.end(), a + (4 - a_dim.size()));
+            std::copy(b_dim.begin(), b_dim.end(), b + (4 - b_dim.size()));
+            std::copy(c_dim.begin(), c_dim.end(), c + (4 - c_dim.size()));
+            std::copy(d_dim.begin(), d_dim.end(), d + (4 - d_dim.size()));
+
+            whereKernel(dTypeIndex, inputXData, inputYData,
+                        (uint8_t *)conditionData, outputData, a[0], a[1], a[2],
+                        a[3], b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3],
+                        d[0], d[1], d[2], d[3]);
        }
    }
 };
--- a/src/kernels/cuda/where.cu
+++ b/src/kernels/cuda/where.cu
@ -1,6 +1,109 @@
 #include "cuda/cuda_common.h"
+#include "cuda/cuda_utility.h"
 #include "utils/small_array.h"
+const int repeat = 1;

+// Element-wise select for shapes given as four extents each:
+// output[i] = condition[...] ? inputX[...] : inputY[...].
+// (a*), (b*), (c*) are the shapes of inputX, inputY and condition, and (d*) is
+// the output shape; the output holds n = d0 * d1 * d2 * d3 elements. Each
+// thread writes `repeat` consecutive output elements starting at
+// repeat * index, clamped to n.
+// For each operand the source index for output element i is:
+//   - i when the operand has exactly n elements;
+//   - a broadcast index when it has more than 1 but fewer than n elements:
+//     i is split into 4-D output coordinates and each one is wrapped with the
+//     operand's matching extent (coordinate % extent);
+//   - 0 otherwise (e.g. a single-element operand).
+// NOTE(review): the modulo mapping presumably expects each operand extent to
+// be 1 or equal to the output extent -- confirm against callers.
+template <typename T>
+__global__ void
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int a0, int a1, int a2, int a3, int b0, int b1, int b2, int b3,
+             int c0, int c1, int c2, int c3, int d0, int d1, int d2, int d3) {
+
+    int stride1 = d2 * d3;
+    int stride0 = d1 * stride1;
+    int n = d0 * stride0;
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    int end = (repeat * index + repeat < n ? repeat * index + repeat : n);
+    for (int i = repeat * index; i < end; i++) {
+        int inputXIdx = (a0 * a1 * a2 * a3 == n ? i : 0);
+        int inputYIdx = (b0 * b1 * b2 * b3 == n ? i : 0);
+        int conditionIdx = (c0 * c1 * c2 * c3 == n ? i : 0);
+
+        bool aIdx = (a0 * a1 * a2 * a3 < n && a0 * a1 * a2 * a3 > 1);
+        bool bIdx = (b0 * b1 * b2 * b3 < n && b0 * b1 * b2 * b3 > 1);
+        bool cIdx = (c0 * c1 * c2 * c3 < n && c0 * c1 * c2 * c3 > 1);
+        if (aIdx || bIdx || cIdx) {
+            int d0_index = i / stride0;
+            int d1_index = (i % stride0) / stride1;
+            int d2_index = (i % stride1) / d3;
+            int d3_index = i % d3;
+            if (aIdx) {
+                int a0_index = d0_index % a0;
+                int a1_index = d1_index % a1;
+                int a2_index = d2_index % a2;
+                int a3_index = d3_index % a3;
+                inputXIdx = a0_index * a1 * a2 * a3 + a1_index * a2 * a3 +
+                            a2_index * a3 + a3_index;
+            }
+            if (bIdx) {
+                int b0_index = d0_index % b0;
+                int b1_index = d1_index % b1;
+                int b2_index = d2_index % b2;
+                int b3_index = d3_index % b3;
+                inputYIdx = b0_index * b1 * b2 * b3 + b1_index * b2 * b3 +
+                            b2_index * b3 + b3_index;
+            }
+            if (cIdx) {
+                int c0_index = d0_index % c0;
+                int c1_index = d1_index % c1;
+                int c2_index = d2_index % c2;
+                int c3_index = d3_index % c3;
+                conditionIdx = c0_index * c1 * c2 * c3 + c1_index * c2 * c3 +
+                               c2_index * c3 + c3_index;
+            }
+        }
+
+        ((T *)output)[i] = condition[conditionIdx] ? ((T *)inputX)[inputXIdx]
+                                                   : ((T *)inputY)[inputYIdx];
+    }
+}
+// Launch helpers for the four-extent _whereKernel overload. CASE(T) launches
+// the instantiation for data-type index T using the gridsize, blocksize, and
+// kernel arguments that are in scope where the macro is expanded.
+// SWITCH_DTYPE dispatches on the runtime index and calls IT_TODO_HALT() for
+// any index without a case.
+#define CASE(T)                                                                \
+    _whereKernel<DT_CUDA<T>::t>                                                \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            inputX, inputY, condition, output, a0, a1, a2, a3, b0, b1, b2, b3, \
+            c0, c1, c2, c3, d0, d1, d2, d3);
+
+#define SWITCH_DTYPE(DTYPE)                                                    \
+    switch (DTYPE) {                                                           \
+    case 1:                                                                    \
+        CASE(1)                                                                \
+        break;                                                                 \
+    case 2:                                                                    \
+        CASE(2)                                                                \
+        break;                                                                 \
+    case 3:                                                                    \
+        CASE(3)                                                                \
+        break;                                                                 \
+    case 4:                                                                    \
+        CASE(4)                                                                \
+        break;                                                                 \
+    case 5:                                                                    \
+        CASE(5)                                                                \
+        break;                                                                 \
+    case 6:                                                                    \
+        CASE(6)                                                                \
+        break;                                                                 \
+    case 7:                                                                    \
+        CASE(7)                                                                \
+        break;                                                                 \
+    case 10:                                                                   \
+        CASE(10)                                                               \
+        break;                                                                 \
+    case 11:                                                                   \
+        CASE(11)                                                               \
+        break;                                                                 \
+    case 12:                                                                   \
+        CASE(12)                                                               \
+        break;                                                                 \
+    case 13:                                                                   \
+        CASE(13)                                                               \
+        break;                                                                 \
+    case 16:                                                                   \
+        CASE(16)                                                               \
+        break;                                                                 \
+    default:                                                                   \
+        IT_TODO_HALT();                                                        \
+    }
 __device__ int inferIndex(infini::SmallArray inputShape,
                          infini::SmallArray outputShape, int nDims, int size,
                          int outputIdx) {
@ -19,11 +122,10 @@ __device__ int inferIndex(infini::SmallArray inputShape,
 }
 template <typename T>
 __global__ void
-_whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
-             T *output, int nDims, int outputsize,
-             infini::SmallArray inputXShape, infini::SmallArray inputYShape,
-             infini::SmallArray conditionShape, infini::SmallArray outputShape,
-             int xSize, int ySize, int cSize) {
+_whereKernel(void *inputX, void *inputY, const uint8_t *condition, void *output,
+             int nDims, int outputsize, infini::SmallArray inputXShape,
+             infini::SmallArray inputYShape, infini::SmallArray conditionShape,
+             infini::SmallArray outputShape, int xSize, int ySize, int cSize) {

    int outputIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputIdx < outputsize) {
@ -35,14 +137,74 @@ _whereKernel(const T *inputX, const T *inputY, const uint8_t *condition,
        int inputYIdx =
            inferIndex(inputYShape, outputShape, nDims, ySize, outputIdx);

-        output[outputIdx] =
-            condition[conditionIdx] ? inputX[inputXIdx] : inputY[inputYIdx];
+        ((T *)output)[outputIdx] = condition[conditionIdx]
+                                       ? ((T *)inputX)[inputXIdx]
+                                       : ((T *)inputY)[inputYIdx];
    }
 }
+#define CASECurrency(T) /* Launch _whereKernel<DT_CUDA<T>::t> for the         \
+     * integer dtype code T, passing CUDAStream::getCurrentStream() as the    \
+     * stream of the launch configuration. The expansion is a complete        \
+     * statement (it ends in ';') and picks up gridsize, blocksize and every  \
+     * kernel argument by name from the scope where it is expanded.           \
+     * NOTE(review): DT_CUDA is declared elsewhere; presumably it maps the    \
+     * code to the CUDA element type - confirm. */                            \
+    _whereKernel<DT_CUDA<T>::t>                                                \
+        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(          \
+            inputX, inputY, condition, output, nDims, outputsize, inputXShape, \
+            inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
+/*
+ * Runtime dispatch for the CASECurrency launch.
+ *
+ * Expands to a switch over DTYPE that handles the integer codes 1-7, 10-13
+ * and 16; each case expands CASECurrency(<code>) and then breaks. Any other
+ * value reaches IT_TODO_HALT(). CASECurrency already ends in ';', which is
+ * why no separator appears between it and `break`.
+ *
+ * NOTE(review): the codes presumably are the project's data-type indices
+ * (8, 9, 14 and 15 have no case here) - confirm against DT_CUDA.
+ */
+#define SWITCHCurrency_DTYPE(DTYPE)                                            \
+    switch (DTYPE) {                                                           \
+    case 1:                                                                    \
+        CASECurrency(1) break;                                                 \
+    case 2:                                                                    \
+        CASECurrency(2) break;                                                 \
+    case 3:                                                                    \
+        CASECurrency(3) break;                                                 \
+    case 4:                                                                    \
+        CASECurrency(4) break;                                                 \
+    case 5:                                                                    \
+        CASECurrency(5) break;                                                 \
+    case 6:                                                                    \
+        CASECurrency(6) break;                                                 \
+    case 7:                                                                    \
+        CASECurrency(7) break;                                                 \
+    case 10:                                                                   \
+        CASECurrency(10) break;                                                \
+    case 11:                                                                   \
+        CASECurrency(11) break;                                                \
+    case 12:                                                                   \
+        CASECurrency(12) break;                                                \
+    case 13:                                                                   \
+        CASECurrency(13) break;                                                \
+    case 16:                                                                   \
+        CASECurrency(16) break;                                                \
+    default:                                                                   \
+        IT_TODO_HALT();                                                        \
+    }
+namespace infini {
+/**
+ * Host-side entry point of the where operator that takes four explicit
+ * extents per tensor instead of SmallArray shapes.
+ *
+ * The output element count is d0 * d1 * d2 * d3. The block size is picked
+ * from {32, 64, 128, 256, 512, 1024} by comparing that count against
+ * thresholds scaled by `repeat`, and the grid size is the ceiling of
+ * outputsize / (repeat * blocksize). The launch itself is produced by
+ * SWITCH_DTYPE(dTypeIndex).
+ *
+ * @param dTypeIndex integer dtype code forwarded to SWITCH_DTYPE.
+ * @param inputX     untyped pointer to the first value tensor.
+ * @param inputY     untyped pointer to the second value tensor.
+ * @param condition  uint8_t selector tensor.
+ * @param output     untyped pointer to the result tensor.
+ * @param a0..a3, b0..b3, c0..c3 not read in this body; presumably the
+ *        extents of inputX, inputY and condition consumed by the CASE
+ *        expansion inside SWITCH_DTYPE - TODO confirm.
+ * @param d0..d3     extents whose product is the output element count.
+ *
+ * NOTE(review): `repeat`, SWITCH_DTYPE and CASE are declared outside this
+ * function; `repeat` looks like an elements-per-thread factor - confirm.
+ */
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int a0, int a1, int a2,
+                 int a3, int b0, int b1, int b2, int b3, int c0, int c1, int c2,
+                 int c3, int d0, int d1, int d2, int d3) {
+    int blocksize; // threads per block, chosen by the ladder below
+    int outputsize = d0 * d1 * d2 * d3; // total number of output elements
+    if (outputsize > 511 * repeat) {
+        blocksize = 1024;
+    } else if (outputsize > 255 * repeat) {
+        blocksize = 512;
+    } else if (outputsize > 127 * repeat) {
+        blocksize = 256;
+    } else if (outputsize > 63 * repeat) {
+        blocksize = 128;
+    } else if (outputsize > 31 * repeat) {
+        blocksize = 64;
+    } else {
+        blocksize = 32;
+    }
+    int gridsize = (outputsize + repeat * blocksize - 1) / (repeat * blocksize);
+
+    SWITCH_DTYPE(dTypeIndex) // macro defined outside this function
+}

-namespace infini {
-void whereKernel(const float *inputX, const float *inputY,
-                 const uint8_t *condition, float *output, int nDims,
+void whereKernel(int dTypeIndex, void *inputX, void *inputY,
+                 const uint8_t *condition, void *output, int nDims,
                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
                 SmallArray conditionShape, SmallArray outputShape, int xSize,
                 int ySize, int cSize) {
@ -61,34 +223,8 @@ void whereKernel(const float *inputX, const float *inputY,
        blocksize = 32;
    }
    int gridsize = (outputsize + blocksize - 1) / blocksize;
-    _whereKernel<float>
-        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
-        inputX, inputY, condition, output, nDims, outputsize, inputXShape,
-        inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
-}
-void whereKernel(const half *inputX, const half *inputY,
-                 const uint8_t *condition, half *output, int nDims,
-                 int outputsize, SmallArray inputXShape, SmallArray inputYShape,
-                 SmallArray conditionShape, SmallArray outputShape, int xSize,
-                 int ySize, int cSize) {
-    int blocksize;
-    if (outputsize > 511) {
-        blocksize = 1024;
-    } else if (outputsize > 255) {
-        blocksize = 512;
-    } else if (outputsize > 127) {
-        blocksize = 256;
-    } else if (outputsize > 63) {
-        blocksize = 128;
-    } else if (outputsize > 31) {
-        blocksize = 64;
-    } else {
-        blocksize = 32;
-    }
-    int gridsize = (outputsize + blocksize - 1) / blocksize;
-    _whereKernel<half>
-        <<<gridsize, blocksize, 0, CUDAStream::getCurrentStream()>>>(
-        inputX, inputY, condition, output, nDims, outputsize, inputXShape,
-        inputYShape, conditionShape, outputShape, xSize, ySize, cSize);
+
+    SWITCHCurrency_DTYPE(dTypeIndex)
 }
+
 } // namespace infini
--- a/src/kernels/kunlun/element_wise.cc
+++ b/src/kernels/kunlun/element_wise.cc
@ -97,14 +97,11 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
        auto aDim = op->getInputs(0)->getDims();
        auto bSize = op->getInputs(1)->size();
        auto bDim = op->getInputs(1)->getDims();
+        auto dtype = op->getDType();

-        // op input a, b is scalar while aDim and b Dim is empty
        if (bDim.size() == 0) {
            bDim.push_back(1);
        }
-        if (aDim.size() == 0) {
-            aDim.push_back(1);
-        }

        if (aSize == bSize) {
            // Do ElementWise Sub with no broadcast
@ -112,9 +109,23 @@ class DivXdnn : public KUNLUNKernelWithoutConfig {
                                              (float *)aData, (float *)bData,
                                              (float *)cData, aSize));
        } else {
-            checkKUNLUNError(xdnn::broadcast_div<float>(
-                context->KUNLUNHandle(), (float *)aData, (float *)bData,
-                (float *)cData, aDim, bDim));
+            // Do broadcast div
+            Shape aligned = infer_broadcast(aDim, bDim);
+            if (aligned == aDim) {
+                // BData need to be broadcasted
+                checkKUNLUNError(xdnn::broadcast_div<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)bData,
+                    (float *)cData, aDim, bDim));
+            } else {
+                // Use workspace to broadcast aData
+                KUNLUNPtr wks = context->getWorkspace(bSize * dtype.getSize());
+                checkKUNLUNError(xdnn::broadcast<float>(
+                    context->KUNLUNHandle(), (float *)aData, (float *)wks, aDim,
+                    bDim));
+                checkKUNLUNError(xdnn::div<float>(context->KUNLUNHandle(),
+                                                  (float *)wks, (float *)bData,
+                                                  (float *)cData, bSize));
+            }
        }
        return;
    }
--- a/src/kernels/kunlun/unary.cc
+++ b/src/kernels/kunlun/unary.cc
@ -570,7 +570,6 @@ REGISTER_KERNEL(Device::KUNLUN, OpType::Reciprocal, ReciprocalXdnn,
 REGISTER_KERNEL(Device::KUNLUN, OpType::Reshape, CopyXdnn, "Reshape_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Flatten, CopyXdnn, "Flatten_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Identity, CopyXdnn, "Identity_xdnn");
-REGISTER_KERNEL(Device::KUNLUN, OpType::Squeeze, CopyXdnn, "Squeeze_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Abs, AbsXdnn, "Abs_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Atan, ATanXdnn, "Atan_xdnn");
 REGISTER_KERNEL(Device::KUNLUN, OpType::Log, LogXdnn, "Log_xdnn");
--- a/test/kernels/cuda/test_cuda_where.cc
+++ b/test/kernels/cuda/test_cuda_where.cc
@ -84,6 +84,17 @@ void test_whereFp16(
 }

 TEST(CUDA_WhereFp32, run) {
+    test_whereFp32(
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,
+                      8.,  9.,  10., 11., 12., 13., 14., 15.,
+                      16., 17., 18., 19., 20., 21., 22., 23.},
+        Shape{2, 2, 3, 1, 2},
+        vector<float>{0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+                      0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
+        Shape{2, 3, 1, 2}, vector<uint8_t>{0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1},
+        vector<float>{0., 1.,  2.,  0., 0., 0., 6.,  7.,  0., 9.,  10., 11.,
+                      0., 13., 14., 0., 0., 0., 18., 19., 0., 21., 22., 23.});
    test_whereFp32(
        Shape{2, 2, 3, 1}, vector<float>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
        Shape{2, 2, 3, 1}, vector<float>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
Author	SHA1	Message	Date
xgqdut2016	7146294baa	memcopy instead of special kernel	2024-05-06 14:49:39 +08:00
xgqdut2016	73e3f1fc6f	add currency operator	2024-04-10 15:01:22 +08:00
xgqdut2016	86133c8d0a	modified expand	2024-04-10 11:16:54 +08:00
xgqdut2016	2761d46737	modified div_kernel	2024-04-10 10:51:35 +08:00
xgqdut2016	aa1c3222ed	modified transpose and where	2024-04-10 10:17:45 +08:00
				`@ -1 +0,0 @@`
				`Subproject commit cbcf3fbf985a00494b0f136c92eaccd42031bf65`